Skip to content

Commit dca9b71

Browse files
author
Your Name
committed
caption performance improvement
1 parent 9185215 commit dca9b71

File tree

1 file changed

+16
-7
lines changed

1 file changed

+16
-7
lines changed

examples/droid/benchmark_captioning.py

Lines changed: 16 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -148,14 +148,23 @@ def process_single_trajectory_for_captioning(trajectory: Dict[str, Any], output_
148148
exterior_cameras = {}
149149

150150
for key in trajectory.keys():
151-
if "raw/images/" in key or "observation/images/" in key or "image" in key.lower():
151+
if "raw/images/" in key or "observation/images/" in key or ("image" in key.lower() and "intrinsics" not in key and "extrinsics" not in key):
152152
camera_keys.append(key)
153153
# Check for exterior cameras
154154
if "exterior" in key or "ext" in key:
155-
if "1" in key or "image_1" in key:
156-
exterior_cameras["exterior_1"] = key
157-
elif "2" in key or "image_2" in key:
158-
exterior_cameras["exterior_2"] = key
155+
# Prioritize specific image data keys
156+
if ("exterior_image_1" in key or "exterior_1" in key) and "intrinsics" not in key and "extrinsics" not in key:
157+
# Prefer raw/images over tfds keys for full resolution
158+
if "raw/images/exterior_image_1" in key:
159+
exterior_cameras["exterior_1"] = key
160+
elif "tfds/observation/exterior_image_1" in key and "exterior_1" not in exterior_cameras:
161+
exterior_cameras["exterior_1"] = key
162+
elif ("exterior_image_2" in key or "exterior_2" in key) and "intrinsics" not in key and "extrinsics" not in key:
163+
if "raw/images/exterior_image_2" in key:
164+
exterior_cameras["exterior_2"] = key
165+
elif "tfds/observation/exterior_image_2" in key and "exterior_2" not in exterior_cameras:
166+
exterior_cameras["exterior_2"] = key
167+
159168

160169
# If no exterior cameras found, skip
161170
if not exterior_cameras:
@@ -215,7 +224,7 @@ def process_single_trajectory_for_captioning(trajectory: Dict[str, Any], output_
215224
"These are 6 frames from a robot trajectory shown in temporal order "
216225
"(left to right, top to bottom). Please describe with one sentence what task the robot "
217226
"is performing in this trajectory. Be very specific about the "
218-
"actions and objects involved."
227+
"actions and objects involved. Such as Put the orange toy into the wooden box, Take the lid off the silver pot and put it on the table"
219228
)
220229

221230
vlm_caption = vlm_service.analyze_image(stitched_frame, vlm_prompt)
@@ -233,7 +242,7 @@ def process_single_trajectory_for_captioning(trajectory: Dict[str, Any], output_
233242
vlm_service = get_vlm_service()
234243
vlm_service.initialize()
235244

236-
comparison_prompt = f"""Compare these two robot task descriptions and determine if they describe the same or similar task:
245+
comparison_prompt = f"""Compare these one of the robot task descriptions of Groundtruth to VLM Caption and determine if they describe relevant task:
237246
238247
Description 1 (Ground Truth): {ground_truth}
239248

0 commit comments

Comments
 (0)