@@ -148,14 +148,23 @@ def process_single_trajectory_for_captioning(trajectory: Dict[str, Any], output_
148148 exterior_cameras = {}
149149
150150 for key in trajectory .keys ():
151- if "raw/images/" in key or "observation/images/" in key or "image" in key .lower ():
151+ if "raw/images/" in key or "observation/images/" in key or ( "image" in key .lower () and "intrinsics" not in key and "extrinsics" not in key ):
152152 camera_keys .append (key )
153153 # Check for exterior cameras
154154 if "exterior" in key or "ext" in key :
155- if "1" in key or "image_1" in key :
156- exterior_cameras ["exterior_1" ] = key
157- elif "2" in key or "image_2" in key :
158- exterior_cameras ["exterior_2" ] = key
155+ # Prioritize specific image data keys
156+ if ("exterior_image_1" in key or "exterior_1" in key ) and "intrinsics" not in key and "extrinsics" not in key :
157+ # Prefer raw/images over tfds keys for full resolution
158+ if "raw/images/exterior_image_1" in key :
159+ exterior_cameras ["exterior_1" ] = key
160+ elif "tfds/observation/exterior_image_1" in key and "exterior_1" not in exterior_cameras :
161+ exterior_cameras ["exterior_1" ] = key
162+ elif ("exterior_image_2" in key or "exterior_2" in key ) and "intrinsics" not in key and "extrinsics" not in key :
163+ if "raw/images/exterior_image_2" in key :
164+ exterior_cameras ["exterior_2" ] = key
165+ elif "tfds/observation/exterior_image_2" in key and "exterior_2" not in exterior_cameras :
166+ exterior_cameras ["exterior_2" ] = key
167+
159168
160169 # If no exterior cameras found, skip
161170 if not exterior_cameras :
@@ -215,7 +224,7 @@ def process_single_trajectory_for_captioning(trajectory: Dict[str, Any], output_
215224 "These are 6 frames from a robot trajectory shown in temporal order "
216225 "(left to right, top to bottom). Please describe with one sentence what task the robot "
217226 "is performing in this trajectory. Be very specific about the "
218- "actions and objects involved."
227+ "actions and objects involved. Such as Put the orange toy into the wooden box, Take the lid off the silver pot and put it on the table "
219228 )
220229
221230 vlm_caption = vlm_service .analyze_image (stitched_frame , vlm_prompt )
@@ -233,7 +242,7 @@ def process_single_trajectory_for_captioning(trajectory: Dict[str, Any], output_
233242 vlm_service = get_vlm_service ()
234243 vlm_service .initialize ()
235244
236- comparison_prompt = f"""Compare these two robot task descriptions and determine if they describe the same or similar task:
245+ comparison_prompt = f"""Compare these one of the robot task descriptions of Groundtruth to VLM Caption and determine if they describe relevant task:
237246
238247Description 1 (Ground Truth): { ground_truth }
239248
0 commit comments