diff --git a/mmdet3d/visualization/local_visualizer.py b/mmdet3d/visualization/local_visualizer.py
index d1eff4af97..aa9e484926 100644
--- a/mmdet3d/visualization/local_visualizer.py
+++ b/mmdet3d/visualization/local_visualizer.py
@@ -673,9 +673,14 @@ def _draw_instances_3d(self,
             img = data_input['img']
             if isinstance(img, list) or (isinstance(img, (np.ndarray, Tensor))
                                          and len(img.shape) == 4):
-                # show multi-view images
-                img_size = img[0].shape[:2] if isinstance(
-                    img, list) else img.shape[-2:]  # noqa: E501
+                # show multi-view images
+                # views may differ in size, so use the max (h, w) as img_size
+                if isinstance(img, list):
+                    all_size_array = np.stack(
+                        [im.shape[:2] for im in img], axis=0)
+                    img_size = tuple(np.max(all_size_array, axis=0))
+                else:
+                    img_size = img.shape[-2:]
                 img_col = self.multi_imgs_col
                 img_row = math.ceil(len(img) / img_col)
                 composed_img = np.zeros(
@@ -688,6 +693,7 @@ def _draw_instances_3d(self,
                         single_img = single_img.permute(1, 2, 0).numpy()
                         single_img = single_img[..., [2, 1, 0]]  # bgr to rgb
                     self.set_image(single_img)
+                    single_img_size = single_img.shape[:2]  # per-view size
                     single_img_meta = dict()
                     for key, meta in input_meta.items():
                         if isinstance(meta,
@@ -714,10 +720,11 @@ def _draw_instances_3d(self,
                         centers_2d = instances.centers_2d
                         self.draw_points(centers_2d)
-                    composed_img[(i // img_col) *
-                                 img_size[0]:(i // img_col + 1) * img_size[0],
-                                 (i % img_col) *
-                                 img_size[1]:(i % img_col + 1) *
-                                 img_size[1]] = self.get_image()
+                    # paste the view at the top-left corner of its grid cell
+                    y0 = (i // img_col) * img_size[0]
+                    x0 = (i % img_col) * img_size[1]
+                    y1 = y0 + single_img_size[0]
+                    x1 = x0 + single_img_size[1]
+                    composed_img[y0:y1, x0:x1] = self.get_image()
                 data_3d['img'] = composed_img
             else:
                 # show single-view image
diff --git a/projects/BEVFusion/bevfusion/loading.py b/projects/BEVFusion/bevfusion/loading.py
index 8615be7e3f..de15bedb84 100644
--- a/projects/BEVFusion/bevfusion/loading.py
+++ b/projects/BEVFusion/bevfusion/loading.py
@@ -147,7 +147,7 @@ def transform(self, results: dict) -> Optional[dict]:
 
             cam2img_array = np.eye(4).astype(np.float32)
-            cam2img_array[:3, :3] = np.array(cam_item['cam2img']).astype(
-                np.float32)
+            cam2img_array[:3, :3] = np.array(
+                cam_item['cam2img'])[:3, :3].astype(np.float32)  # crop to 3x3
             cam2img.append(cam2img_array)
             lidar2img.append(cam2img_array @ lidar2cam_array)