
Commit 57a14f6

Author: Your Name
Message: update camera views
Parent: 4e90dcb

File tree: 6 files changed, +178 −39 lines


examples/droid/.gitignore

Lines changed: 3 additions & 1 deletion

@@ -1,3 +1,5 @@
 droid_data/
 robodm_trajectories/
-vlm_analysis_results/
+vlm_analysis_results/
+full_robodm_trajectories/
+f1_matrix_results/

examples/droid/droid_to_robodm.py

Lines changed: 33 additions & 9 deletions

@@ -14,7 +14,7 @@
 from robodm import Trajectory


-@ray.remote
+@ray.remote(num_cpus=4)
 def download_and_convert_trajectory(trajectory_path: str, output_dir: str, temp_dir: str) -> Tuple[bool, str, str]:
     """
     Download and convert a single DROID trajectory to RoboDM format.
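Note: the decorator change reserves 4 CPUs per task, so Ray co-schedules at most total_cluster_cpus // 4 of these conversions at once. A minimal sketch of that throttling behavior (illustrative only; the 8-CPU figure and the convert() body are assumptions, not repo code):

import ray

ray.init(num_cpus=8)  # assumption: an 8-CPU machine, for illustration

@ray.remote(num_cpus=4)
def convert(path: str) -> str:
    # stand-in for the real download-and-convert work
    return path

# At most 8 // 4 = 2 tasks run concurrently; the remaining four queue.
results = ray.get([convert.remote(f"traj_{i}") for i in range(6)])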
@@ -109,6 +109,29 @@ def load_mp4_frames(self, mp4_path: str) -> np.ndarray:
         cap.release()
         return np.array(frames)

+    def split_stereo_frames(self, stereo_frames: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+        """
+        Split side-by-side stereo frames into separate left and right frame arrays.
+
+        Args:
+            stereo_frames: Array of stereo frames with shape (num_frames, height, width, channels),
+                where width contains both left and right images side-by-side.
+
+        Returns:
+            Tuple of (left_frames, right_frames), each with shape (num_frames, height, width/2, channels).
+        """
+        if len(stereo_frames) == 0:
+            return np.array([]), np.array([])
+
+        num_frames, height, width, channels = stereo_frames.shape
+        half_width = width // 2
+
+        # Split each frame horizontally
+        left_frames = stereo_frames[:, :, :half_width, :]
+        right_frames = stereo_frames[:, :, half_width:, :]
+
+        return left_frames, right_frames
+
     def load_droid_trajectory(self, droid_path: str) -> Dict:
         """
         Load a DROID trajectory from downloaded files.
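Note: a quick sanity check of the split on synthetic data (not repo code; the 2560-pixel stereo width is an assumption for illustration):

import numpy as np

stereo = np.zeros((10, 720, 2560, 3), dtype=np.uint8)  # 10 frames, 1280 px per eye
left, right = stereo[:, :, :1280, :], stereo[:, :, 1280:, :]
assert left.shape == right.shape == (10, 720, 1280, 3)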
@@ -192,12 +215,13 @@ def load_droid_trajectory(self, droid_path: str) -> Dict:
             stereo_filename = os.path.basename(metadata[mp4_key]).replace(".mp4", "-stereo.mp4")
             stereo_path = os.path.join(droid_path, "recordings", "MP4", stereo_filename)
             if os.path.exists(stereo_path):
-                images = self.load_mp4_frames(stereo_path)
-                if len(images) > 0:
-                    # For stereo, use right camera name
-                    right_cam_name = cam_name.replace("left", "right")
-                    trajectory_data["images"][right_cam_name] = images
-                    print(f"  Loaded {right_cam_name}: shape {images.shape}")
+                stereo_images = self.load_mp4_frames(stereo_path)
+                if len(stereo_images) > 0:
+                    left_images, right_images = self.split_stereo_frames(stereo_images)
+                    trajectory_data["images"][cam_name] = left_images
+                    trajectory_data["images"][cam_name.replace("left", "right")] = right_images
+                    print(f"  Loaded {cam_name}: shape {left_images.shape}")
+                    print(f"  Loaded {cam_name.replace('left', 'right')}: shape {right_images.shape}")

         return trajectory_data

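Note: str.replace swaps every occurrence, so the key renaming above assumes "left" appears exactly once in the camera name (the names below are hypothetical):

"ext1_left".replace("left", "right")        # -> "ext1_right" (intended)
"left_wrist_left".replace("left", "right")  # -> "right_wrist_right" (surprise)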
@@ -531,8 +555,8 @@ def convert_single_trajectory(traj_dir: str, output_dir: str) -> Tuple[bool, str
     print("Starting parallel download and conversion...")
     successful_paths = processor.download_sample_trajectories(
         output_dir=output_dir,
-        num_success=300,
-        num_failure=100
+        num_success=50,
+        num_failure=50
     )

     print(f"\nSuccessfully processed {len(successful_paths)} trajectories:")

examples/droid/droid_vlm_demo.py

Lines changed: 134 additions & 21 deletions

@@ -9,12 +9,13 @@
 5. Shows how VLM tools can be used during filtering
 """

-# python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct --host 0.0.0.0 --port 30000
+# python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-32B-Instruct --host 0.0.0.0 --port 30000

 import os
 import time
+import argparse
 from pathlib import Path
-from typing import Dict, List, Any
+from typing import Dict, List, Any, Optional

 import numpy as np
 import cv2
@@ -30,15 +31,23 @@
 class DROIDSuccessDetector:
     """Enhanced DROID success/failure detector using RoboDM Agent system."""

-    def __init__(self):
-        """Initialize the detector with Agent capabilities."""
+    def __init__(self, max_trajectories: Optional[int] = None):
+        """Initialize the detector with Agent capabilities.
+
+        Args:
+            max_trajectories: Maximum number of trajectories to process. If None, processes all trajectories.
+        """
         print("Initializing RoboDM Agent with VLM tools...")

+        self.max_trajectories = max_trajectories
+        if max_trajectories is not None:
+            print(f"Will limit processing to maximum {max_trajectories} trajectories")
+
         # Configure tools for the Agent
         self.tools_config = {
             "tools": {
                 "robo2vlm": {
-                    "model": "Qwen/Qwen2.5-VL-7B-Instruct",
+                    "model": "Qwen/Qwen2.5-VL-32B-Instruct",
                     "temperature": 0.1,
                     "max_tokens": 4096,
                     "context_length": 1024
@@ -85,7 +94,47 @@ def create_robodm_dataset(self, robodm_dir: str) -> VLADataset:
             config=config
         )

-        print(f"Created VLADataset with {dataset.count()} trajectory files")
+        total_trajectories = dataset.count()
+        print(f"Found {total_trajectories} trajectory files")
+
+        # Apply max_trajectories limit if specified
+        if self.max_trajectories is not None and total_trajectories > self.max_trajectories:
+            print(f"Limiting to {self.max_trajectories} trajectories (out of {total_trajectories} total)")
+            # Use take() to limit the number of trajectories
+            limited_items = dataset.take(self.max_trajectories)
+
+            # Create a new VLADataset from the limited items
+            # We need to extract file paths from the limited items
+            if limited_items:
+                # Extract file paths from the limited items
+                # The items are currently just string paths from the Ray dataset
+                limited_file_paths = [item if isinstance(item, str) else item.get("item", str(item))
+                                      for item in limited_items]
+
+                # Create a new VLADataset with limited file paths
+                import ray.data as rd
+                limited_ray_dataset = rd.from_items(limited_file_paths)
+                if config.shuffle:
+                    limited_ray_dataset = limited_ray_dataset.random_shuffle()
+
+                # Create new VLADataset instance with limited data
+                limited_dataset = VLADataset.__new__(VLADataset)
+                limited_dataset.path = dataset.path
+                limited_dataset.return_type = dataset.return_type
+                limited_dataset.config = dataset.config
+                limited_dataset.file_paths = limited_file_paths
+                limited_dataset.ray_dataset = limited_ray_dataset
+                limited_dataset.metadata_manager = dataset.metadata_manager
+                limited_dataset._schema = None
+                limited_dataset._stats = None
+                limited_dataset._is_loaded = False
+                limited_dataset._has_file_paths = True
+
+                dataset = limited_dataset
+                print(f"Limited dataset created with {dataset.count()} trajectory files")
+        else:
+            print(f"Processing all {total_trajectories} trajectory files")

         print(f"Dataset type: {type(dataset)}")
         print(f"Has _is_loaded: {hasattr(dataset, '_is_loaded')}")
         print(f"Is loaded: {dataset._is_loaded}")
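Note: the limiting path above relies on VLADataset.__new__ allocating an instance without running __init__, after which the attributes are filled in by hand. A minimal illustration of that Python mechanism (toy class, not the RoboDM API):

class Expensive:
    def __init__(self):
        raise RuntimeError("heavy setup we want to skip")

obj = Expensive.__new__(Expensive)  # allocates without calling __init__
obj.value = 42                      # attributes assigned manually, as in the diff
print(obj.value)                    # -> 42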
@@ -227,25 +276,35 @@ def calculate_f1_matrix(self, dataset: VLADataset):
         print("F1 MATRIX CALCULATION")
         print("=" * 60)

+        # Create output directory for F1 matrix results
+        f1_output_dir = Path("./f1_matrix_results")
+        f1_output_dir.mkdir(exist_ok=True)
+
         # Transform to extract labels and predictions
         def extract_labels_and_predictions(trajectory: Dict[str, Any]) -> Dict[str, Any]:
-            """Extract ground truth and VLM predictions for F1 calculation."""
+            """Extract ground truth and VLM predictions for F1 calculation with file saving."""
             from pathlib import Path
             import numpy as np
+            import cv2

             file_path = trajectory.get("__file_path__", "")
             ground_truth = "success" in file_path.lower()
+            traj_name = Path(file_path).stem

-            # Get VLM prediction (simplified version without saving files)
+            # Get VLM prediction and save all results
             vlm_prediction = False
+            vlm_response = "No VLM analysis performed"
+
             try:
                 # Find camera keys
                 camera_keys = [k for k in trajectory.keys()
                                if "observation/images/" in k or "image" in k.lower()]
+                print(f"Camera keys: {camera_keys}")

                 if camera_keys:
                     # Guard fixed: the original `len(camera_keys) > 1` could index past the end
                     primary_camera = camera_keys[3] if len(camera_keys) > 3 else camera_keys[0]
                     frames = trajectory.get(primary_camera, [])
+                    print(f"Frames: {len(frames)}, {frames[0].shape if len(frames) else 'empty'}")

                     if len(frames) >= 4:
                         # Select 4 frames: start, 1/3, 2/3, and end
@@ -257,32 +316,71 @@ def extract_labels_and_predictions(trajectory: Dict[str, Any]) -> Dict[str, Any]
                         resized_frames = []
                         for frame in selected_frames:
                             if frame.shape[:2] != (h, w):
-                                import cv2
                                 frame = cv2.resize(frame, (w, h))
                             resized_frames.append(frame)

                         top_row = np.hstack([resized_frames[0], resized_frames[1]])
                         bottom_row = np.hstack([resized_frames[2], resized_frames[3]])
                         stitched_frame = np.vstack([top_row, bottom_row])

+                        # Save input image
+                        image_filename = f1_output_dir / f"{traj_name}_input.jpg"
+                        cv2.imwrite(str(image_filename), cv2.cvtColor(stitched_frame, cv2.COLOR_RGB2BGR))
+
                         # Use VLM to get prediction
                         from robodm.agent.vlm_service import get_vlm_service
                         vlm_service = get_vlm_service()
                         vlm_service.initialize()

-                        vlm_prompt = "These are 4 frames from a robot trajectory. Does this trajectory look successful? Answer yes or no."
+                        vlm_prompt = "These are 4 frames from a robot trajectory. Does this trajectory look successful? First answer yes or no, then explain why."
                         vlm_response = vlm_service.analyze_image(stitched_frame, vlm_prompt)
-                        print(vlm_response)
                         vlm_prediction = "yes" in vlm_response.lower()

+                        print(f"🔍 F1 Analysis for {traj_name}: GT={ground_truth}, VLM={vlm_prediction}")
+
+                    elif len(frames) > 0:
+                        # If fewer than 4 frames, just use the last frame
+                        stitched_frame = frames[-1]
+
+                        # Save input image
+                        image_filename = f1_output_dir / f"{traj_name}_input.jpg"
+                        cv2.imwrite(str(image_filename), cv2.cvtColor(stitched_frame, cv2.COLOR_RGB2BGR))
+
+                        # Use VLM to get prediction
+                        from robodm.agent.vlm_service import get_vlm_service
+                        vlm_service = get_vlm_service()
+                        vlm_service.initialize()
+
+                        vlm_prompt = "This is the final frame from a robot trajectory. Does this trajectory look successful? Answer yes or no."
+                        vlm_response = vlm_service.analyze_image(stitched_frame, vlm_prompt)
+                        vlm_prediction = "yes" in vlm_response.lower()
+
+                        print(f"🔍 F1 Analysis for {traj_name}: GT={ground_truth}, VLM={vlm_prediction}")
+
             except Exception as e:
-                print(f"Error in VLM prediction: {e}")
-                vlm_prediction = ground_truth  # fallback to ground truth
+                print(f"Error in VLM prediction for {traj_name}: {e}")
+                vlm_prediction = ground_truth
+                vlm_response = f"Error occurred: {str(e)}"
+
+            # Save results to file
+            results_filename = f1_output_dir / f"{traj_name}_results.txt"
+            with open(results_filename, 'w') as f:
+                f.write(f"F1 Matrix Calculation Results\n")
+                f.write(f"=============================\n")
+                f.write(f"Trajectory: {traj_name}\n")
+                f.write(f"File path: {file_path}\n")
+                f.write(f"Ground truth (success): {ground_truth}\n")
+                f.write(f"VLM prediction (success): {vlm_prediction}\n")
+                f.write(f"Prediction correct: {ground_truth == vlm_prediction}\n")
+                f.write(f"\nVLM Prompt:\n{vlm_prompt if 'vlm_prompt' in locals() else 'No prompt used'}\n")
+                f.write(f"\nVLM Response:\n{vlm_response}\n")
+                f.write(f"\nInput image saved as: {traj_name}_input.jpg\n")

             return {
-                "trajectory_name": Path(file_path).stem,
+                "trajectory_name": traj_name,
                 "ground_truth": ground_truth,
-                "vlm_prediction": vlm_prediction
+                "vlm_prediction": vlm_prediction,
+                "vlm_response": vlm_response
             }

         # Apply transformation to get all predictions using VLADataset's map
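Note: a self-contained sketch of the 4-frame sampling and 2x2 stitching used above, on dummy data (the frame count and size are made up for illustration):

import numpy as np

frames = np.random.randint(0, 255, (90, 180, 320, 3), dtype=np.uint8)
idx = [0, len(frames) // 3, 2 * len(frames) // 3, len(frames) - 1]  # start, 1/3, 2/3, end
f0, f1, f2, f3 = (frames[i] for i in idx)
stitched = np.vstack([np.hstack([f0, f1]), np.hstack([f2, f3])])
assert stitched.shape == (360, 640, 3)  # 2x2 grid of 180x320 frames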
@@ -315,6 +413,12 @@ def extract_labels_and_predictions(trajectory: Dict[str, Any]) -> Dict[str, Any]
         f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
         accuracy = (true_positives + true_negatives) / len(results)

+        print(f"\nDetailed Results:")
+        for result in results:
+            status = "✅" if result["ground_truth"] == result["vlm_prediction"] else "❌"
+            print(f"{status} {result['trajectory_name']}: GT={result['ground_truth']}, Pred={result['vlm_prediction']}")
+
+
         # Print F1 Matrix
         print("\nConfusion Matrix:")
         print("                 Predicted")
@@ -328,10 +432,7 @@ def extract_labels_and_predictions(trajectory: Dict[str, Any]) -> Dict[str, Any]
         print(f"Recall: {recall:.3f}")
         print(f"F1 Score: {f1_score:.3f}")

-        print(f"\nDetailed Results:")
-        for result in results:
-            status = "✅" if result["ground_truth"] == result["vlm_prediction"] else "❌"
-            print(f"{status} {result['trajectory_name']}: GT={result['ground_truth']}, Pred={result['vlm_prediction']}")
+

         return f1_score
@@ -341,10 +442,22 @@ def main():
     print("RoboDM VLADataset and Agent Demo")
     print("=" * 60)

-    robodm_dir = "./robodm_trajectories"
+    # Configuration
+    parser = argparse.ArgumentParser(description="Run the DROID VLM demo")
+    parser.add_argument("--data_dir", type=str, default="./robodm_trajectories", help="Directory containing RoboDM trajectory files")
+    parser.add_argument("--max_trajectories", type=int, default=100, help="Maximum number of trajectories to process")
+    args = parser.parse_args()
+
+    robodm_dir = args.data_dir
+    max_trajectories = args.max_trajectories
+
+    print(f"Configuration:")
+    print(f"  Data directory: {robodm_dir}")
+    print(f"  Max trajectories: {max_trajectories if max_trajectories is not None else 'All'}")
+
     # Step 3: Create VLADataset (with file paths only)
     print("\n3. Creating VLADataset...")
-    detector = DROIDSuccessDetector()
+    detector = DROIDSuccessDetector(max_trajectories=max_trajectories)
     dataset = detector.create_robodm_dataset(robodm_dir)

     # Step 5: Calculate F1 Matrix
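Note: with the new flags, a typical invocation looks like this (paths are illustrative):

python examples/droid/droid_vlm_demo.py --data_dir ./robodm_trajectories --max_trajectories 10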

robodm/agent/planner.py

Lines changed: 2 additions & 2 deletions

@@ -25,12 +25,12 @@ class Planner:
     Dynamically adapts to dataset schema.
     """

-    def __init__(self, llm_model: str = "Qwen/Qwen2.5-VL-7B-Instruct", tools_manager=None, **llm_kwargs):
+    def __init__(self, llm_model: str = "Qwen/Qwen2.5-VL-32B-Instruct", tools_manager=None, **llm_kwargs):
         """
         Initialize Planner with shared VLM service.

         Args:
-            llm_model: Model name for code generation (default: Qwen/Qwen2.5-VL-7B-Instruct)
+            llm_model: Model name for code generation (default: Qwen/Qwen2.5-VL-32B-Instruct)
             tools_manager: ToolsManager instance for accessing tools
             **llm_kwargs: Additional arguments for VLM service initialization
         """

robodm/agent/tools/implementations.py

Lines changed: 5 additions & 5 deletions

@@ -52,7 +52,7 @@ class VisionLanguageModel:
     """Vision-language model for analyzing images using shared VLM service."""

     def __init__(self,
-                 model: str = "Qwen/Qwen2.5-VL-7B-Instruct",
+                 model: str = "Qwen/Qwen2.5-VL-32B-Instruct",
                  temperature: float = 0.1,
                  max_tokens: int = 256,
                  trust_remote_code: bool = True,
@@ -298,7 +298,7 @@ class VisionLanguageModelTool(BaseTool):

     def __init__(
         self,
-        model: str = "Qwen/Qwen2.5-VL-7B-Instruct",
+        model: str = "Qwen/Qwen2.5-VL-32B-Instruct",
         temperature: float = 0.1,
         max_tokens: int = 256,
         **kwargs,
@@ -349,7 +349,7 @@ def get_metadata(cls) -> ToolMetadata:
             ],
             tags=["vision", "language", "analysis", "robotic"],
             parameters={
-                "model": "Qwen/Qwen2.5-VL-7B-Instruct",
+                "model": "Qwen/Qwen2.5-VL-32B-Instruct",
                 "temperature": 0.1,
                 "max_tokens": 256
             },
@@ -384,7 +384,7 @@ def reconfigure(self, **kwargs):

         # Reinitialize shared VLM service with new config
         self.vlm_service.initialize(
-            model=self.config.get("model", "Qwen/Qwen2.5-VL-7B-Instruct"),
+            model=self.config.get("model", "Qwen/Qwen2.5-VL-32B-Instruct"),
             temperature=self.config.get("temperature", 0.1),
             max_tokens=self.config.get("max_tokens", 256),
             trust_remote_code=self.config.get("trust_remote_code", True),
@@ -394,7 +394,7 @@ def reconfigure(self, **kwargs):

         # Recreate VLM instance with new config
         self.vlm = VisionLanguageModel(
-            model=self.config.get("model", "Qwen/Qwen2.5-VL-7B-Instruct"),
+            model=self.config.get("model", "Qwen/Qwen2.5-VL-32B-Instruct"),
             temperature=self.config.get("temperature", 0.1),
             max_tokens=self.config.get("max_tokens", 256),
             trust_remote_code=self.config.get("trust_remote_code", True),
