1+ import cv2
2+ import numpy as np
3+
4+
def fill_small_holes(depth_img: np.ndarray, area_thresh: int) -> np.ndarray:
    """Fill zero-valued regions of a depth image that are smaller than a threshold.

    Any connected region of 0-valued pixels whose contour area is below
    ``area_thresh`` is overwritten with 1; all other pixels are left untouched.

    Args:
        depth_img (np.ndarray): The input depth image.
        area_thresh (int): Contour-area threshold (exclusive) below which a
            hole is filled.

    Returns:
        np.ndarray: The depth image with small holes filled in with 1.
    """
    # Binary map: 1 where the depth is missing (== 0), 0 elsewhere.
    holes = (depth_img == 0).astype("uint8")

    # Outline every connected hole region.
    contours, _ = cv2.findContours(holes, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    # Rasterize only the contours whose area falls below the threshold
    # (contourIdx=-1 draws the whole list, thickness=-1 fills the interiors).
    small_hole_mask = np.zeros_like(holes)
    small_contours = [cnt for cnt in contours if cv2.contourArea(cnt) < area_thresh]
    cv2.drawContours(small_hole_mask, small_contours, -1, 1, -1)

    # Overwrite the selected hole pixels with 1, keep everything else as-is.
    return np.where(small_hole_mask == 1, 1, depth_img)
35+
36+
class MP3DGTPerception:
    """
    Ground-truth perception utility for projecting MP3D object 3D bounding boxes
    into the current camera view to produce per-target semantic masks.

    Args:
        max_depth (float): Maximum metric depth (used for depth rescaling and masking).
        min_depth (float): Minimum metric depth (used for depth rescaling).
        fx (float): Camera focal length in pixels along x.
        fy (float): Camera focal length in pixels along y.
    """

    def __init__(self, max_depth, min_depth, fx, fy):
        self.max_depth = max_depth
        self.min_depth = min_depth
        self.fx = fx
        self.fy = fy

    def predict(self, depth, targets, tf_camera_to_ply, area_threshold=2500):
        """
        Get ground-truth semantic masks for target objects by projecting 3D bboxes into the image.

        Args:
            depth (np.ndarray): Depth image of shape (H, W). Values are assumed to be normalized to [0, 1] and will be rescaled to metric depth using ``depth * (max_depth - min_depth) + min_depth``.
            targets (np.ndarray): Target 3D axis-aligned bounding boxes of shape (N, 6), formatted as ``[min_x, min_y, min_z, max_x, max_y, max_z]`` in the PLY/world frame.
            tf_camera_to_ply (np.ndarray): Homogeneous 4x4 transform from camera frame to the PLY/world frame.
            area_threshold (int): Area threshold used by the hole-filling routine for both the depth map and the output masks.

        Returns:
            semantic_images (np.ndarray): Binary semantic masks of shape (N, H, W) with dtype ``np.uint8`` where 1 indicates pixels belonging to the corresponding target and 0 otherwise. If no targets are provided, returns an all-zero array of shape (1, H, W).
        """
        # Get the point cloud of the current frame.
        filled_depth = fill_small_holes(depth, area_threshold)
        scaled_depth = filled_depth * (self.max_depth - self.min_depth) + self.min_depth
        valid_depth_mask = scaled_depth < self.max_depth
        point_cloud_camera_frame = get_point_cloud(scaled_depth, valid_depth_mask, self.fx, self.fy)
        point_cloud_ply_frame = transform_points(tf_camera_to_ply, point_cloud_camera_frame)

        height, width = depth.shape[0], depth.shape[1]

        # Mark the points that fall inside each target object's bbox.
        semantic_images = []
        for target in targets:
            min_x, min_y, min_z = target[:3]
            max_x, max_y, max_z = target[3:]

            in_bbox = (
                (point_cloud_ply_frame[:, 0] >= min_x)
                & (point_cloud_ply_frame[:, 0] <= max_x)
                & (point_cloud_ply_frame[:, 1] >= min_y)
                & (point_cloud_ply_frame[:, 1] <= max_y)
                & (point_cloud_ply_frame[:, 2] >= min_z)
                & (point_cloud_ply_frame[:, 2] <= max_z)
            )
            in_bbox_points = point_cloud_ply_frame[in_bbox]
            semantic_image = np.zeros(depth.shape, dtype=np.uint8)
            if len(in_bbox_points) > 0:
                # Map the marked points back into the image to get the semantic map.
                in_bbox_camera_frame = inverse_transform_points(tf_camera_to_ply, in_bbox_points)
                image_coords = project_points_to_image(in_bbox_camera_frame, self.fx, self.fy, depth.shape)
                # BUG FIX: the bounds check previously hardcoded ``< 480`` /
                # ``< 640`` instead of using the actual image size, and had no
                # lower bound, so negative coordinates wrapped around via
                # NumPy negative indexing and marked wrong pixels. The per-row
                # Python loop is also replaced by a vectorized mask, which
                # makes the old try/except-print guard unnecessary.
                in_bounds = (
                    (image_coords[:, 0] >= 0)
                    & (image_coords[:, 0] < height)
                    & (image_coords[:, 1] >= 0)
                    & (image_coords[:, 1] < width)
                )
                image_coords = image_coords[in_bounds]
                # image_coords is (row, col), matching (v, u) from projection.
                semantic_image[image_coords[:, 0], image_coords[:, 1]] = 1
            # Close small gaps in the mask caused by point-cloud sparsity.
            semantic_image = fill_small_holes(semantic_image, area_threshold)
            semantic_images.append(semantic_image)

        if len(semantic_images) > 0:
            semantic_images = np.stack(semantic_images, axis=0)
        else:
            semantic_images = np.zeros((1, depth.shape[0], depth.shape[1]), dtype=np.uint8)
        return semantic_images
111+
112+
def transform_points(transformation_matrix: np.ndarray, points: np.ndarray) -> np.ndarray:
    """Apply a 4x4 homogeneous transform to an (N, 3) array of points.

    Args:
        transformation_matrix (np.ndarray): 4x4 homogeneous transformation matrix.
        points (np.ndarray): Point coordinates of shape (N, 3).

    Returns:
        np.ndarray: Transformed points of shape (N, 3), de-homogenized by the
        trailing w component.
    """
    num_points = points.shape[0]
    # Lift to homogeneous coordinates: (N, 4) with a trailing column of ones.
    homogeneous = np.concatenate((points, np.ones((num_points, 1))), axis=1)

    # (4, 4) @ (4, N) -> (4, N), then back to one row per point.
    moved = (transformation_matrix @ homogeneous.T).T

    # Divide by w to return to Cartesian (N, 3) coordinates.
    return moved[:, :3] / moved[:, 3:]
122+
123+
def get_point_cloud(depth_image: np.ndarray, mask: np.ndarray, fx: float, fy: float) -> np.ndarray:
    """Back-project masked depth pixels into a 3D point cloud.

    Uses a pinhole camera model with the principal point assumed at the image
    center. (The previous docstring incorrectly claimed the computation was
    based on the HFOV and explicit image width/height parameters; it is driven
    by the ``fx``/``fy`` focal lengths.)

    Args:
        depth_image (np.ndarray): 2D depth image with metric depth values.
        mask (np.ndarray): 2D binary mask identifying the pixels to back-project.
        fx (float): Focal length in pixels along x.
        fy (float): Focal length in pixels along y.

    Returns:
        cloud (np.ndarray): Array of shape (N, 3) with one (x, -y, -z) row per
        masked pixel, in row-major pixel order. y and z are negated to match
        the camera-frame convention used by ``project_points_to_image``.
    """
    v, u = np.where(mask)
    z = depth_image[v, u]
    # Pinhole back-projection relative to the image center.
    x = (u - depth_image.shape[1] // 2) * z / fx
    y = (v - depth_image.shape[0] // 2) * z / fy
    # Negate y and z to match this module's camera-frame axis convention.
    cloud = np.stack((x, -y, -z), axis=-1)

    return cloud
145+
146+
def inverse_transform_points(transformation_matrix: np.ndarray, points: np.ndarray) -> np.ndarray:
    """Map points through the inverse of a 4x4 homogeneous transform.

    Converts a point cloud from the episodic/world coordinate system back into
    the camera coordinate system.

    Args:
        transformation_matrix (np.ndarray): 4x4 transformation matrix
            (camera -> world); its inverse is applied here.
        points (np.ndarray): Point cloud coordinates of shape (N, 3).

    Returns:
        result_points (np.ndarray): Points in the camera coordinate system, shape (N, 3).
    """
    # Invert the forward transform once, up front.
    inverse_matrix = np.linalg.inv(transformation_matrix)

    # Append a homogeneous 1 to every point.
    homogeneous = np.hstack((points, np.ones((len(points), 1))))

    # Apply the inverse transform: (4, 4) @ (4, N) -> (N, 4) after transposing back.
    mapped = (inverse_matrix @ homogeneous.T).T

    # Drop the homogeneous coordinate, dividing by w.
    result_points = mapped[:, :3] / mapped[:, 3:]
    return result_points
169+
170+
def project_points_to_image(points: np.ndarray, fx: float, fy: float, image_shape: tuple) -> np.ndarray:
    """Project camera-frame points onto the image plane.

    Args:
        points (np.ndarray): Points of shape (N, 3) in this module's camera
            convention (x, -y, -z), as produced by ``get_point_cloud``.
        fx (float): x-axis focal length in pixels.
        fy (float): y-axis focal length in pixels.
        image_shape (tuple): Image dimensions (height, width), used only to
            place the principal point at the image center.

    Returns:
        result_points (np.ndarray): Integer (row, col) image coordinates of
        shape (M, 2) for the M <= N points in front of the camera. The
        coordinates are NOT clipped to the image bounds; callers must do that.
    """
    # Undo the (x, -y, -z) storage convention back to standard camera axes.
    x = points[:, 0]
    y = -points[:, 1]
    z = -points[:, 2]

    # Points with positive depth lie in front of the camera.
    in_front = z > 0

    # Pinhole projection with the principal point at the image center.
    col = x * fx / z + image_shape[1] // 2
    row = y * fy / z + image_shape[0] // 2

    # (row, col) order matches direct NumPy indexing into the image.
    image_coords = np.stack((row, col), axis=-1).astype(np.int32)

    # Keep only the points in front of the camera.
    result_points = image_coords[in_front]
    return result_points