From 1621dd82d7486fb679f24bc729ea9baffd26b64a Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 15 Feb 2025 03:15:09 +0000 Subject: [PATCH 01/12] support for squeeze backbone with fpn neck --- ...erpoint_pillar02_squeeze_squeezefpn_nus.py | 91 +++++++ ...terpoint_voxel01_squeeze_squeezefpn_nus.py | 46 ++++ ...ueeze_squeezefpn_8xb4-cyclic-20e_nus-3d.py | 253 ++++++++++++++++++ ...ueeze_squeezefpn_8xb4-cyclic-20e_nus-3d.py | 160 +++++++++++ mmdet3d/models/backbones/__init__.py | 3 +- mmdet3d/models/backbones/squeezenet.py | 112 ++++++++ mmdet3d/models/layers/norm.py | 2 + mmdet3d/models/necks/__init__.py | 3 +- mmdet3d/models/necks/squeeze_fpn.py | 110 ++++++++ 9 files changed, 778 insertions(+), 2 deletions(-) create mode 100644 configs/_base_/models/centerpoint_pillar02_squeeze_squeezefpn_nus.py create mode 100644 configs/_base_/models/centerpoint_voxel01_squeeze_squeezefpn_nus.py create mode 100644 configs/centerpoint/centerpoint_pillar02_squeeze_squeezefpn_8xb4-cyclic-20e_nus-3d.py create mode 100644 configs/centerpoint/centerpoint_voxel01_squeeze_squeezefpn_8xb4-cyclic-20e_nus-3d.py create mode 100644 mmdet3d/models/backbones/squeezenet.py create mode 100644 mmdet3d/models/necks/squeeze_fpn.py diff --git a/configs/_base_/models/centerpoint_pillar02_squeeze_squeezefpn_nus.py b/configs/_base_/models/centerpoint_pillar02_squeeze_squeezefpn_nus.py new file mode 100644 index 000000000..04375a2a1 --- /dev/null +++ b/configs/_base_/models/centerpoint_pillar02_squeeze_squeezefpn_nus.py @@ -0,0 +1,91 @@ +voxel_size = [0.2, 0.2, 8] +model = dict( + type='CenterPoint', + data_preprocessor=dict( + type='Det3DDataPreprocessor', + voxel=True, + voxel_layer=dict( + max_num_points=20, + voxel_size=voxel_size, + max_voxels=(30000, 40000))), + pts_voxel_encoder=dict( + type='PillarFeatureNet', + in_channels=5, + feat_channels=[64], + with_distance=False, + voxel_size=(0.2, 0.2, 8), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), + legacy=False), + pts_middle_encoder=dict( + type='PointPillarsScatter', in_channels=64, output_shape=(512, 512)), + pts_backbone=dict( + type='SQUEEZE', + in_channels=64, + out_channels=[64, 128, 256 , 512], + #layer_nums=[3, 5, 5], + #layer_strides=[2, 2, 2], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), + conv_cfg=dict(type='Conv2d', bias=False)), + pts_neck=dict( + type='SQUEEZEFPN', + in_channels=[64, 128, 256 , 512], + out_channels=[512, 512, 512, 512], + #upsample_strides=[0.5, 1, 2], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), + upsample_cfg=dict(type='deconv', bias=False), + #use_conv_for_no_stride=True + ), + pts_bbox_head=dict( + type='CenterHead', + in_channels=sum([128, 128, 128,128]), + #in_channels=256, + tasks=[ + dict(num_class=1, class_names=['car']), + dict(num_class=2, class_names=['truck', 'construction_vehicle']), + dict(num_class=2, class_names=['bus', 'trailer']), + dict(num_class=1, class_names=['barrier']), + dict(num_class=2, class_names=['motorcycle', 'bicycle']), + dict(num_class=2, class_names=['pedestrian', 'traffic_cone']), + ], + common_heads=dict( + reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), + share_conv_channel=64, + bbox_coder=dict( + type='CenterPointBBoxCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_num=500, + score_threshold=0.1, + out_size_factor=4, + voxel_size=voxel_size[:2], + code_size=9), + separate_head=dict( + type='SeparateHead', init_bias=-2.19, final_kernel=3), + loss_cls=dict(type='mmdet.GaussianFocalLoss', reduction='mean'), + loss_bbox=dict( + 
type='mmdet.L1Loss', reduction='mean', loss_weight=0.25), + norm_bbox=True), + # model training and testing settings + train_cfg=dict( + pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + out_size_factor=4, + dense_reg=1, + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])), + test_cfg=dict( + pts=dict( + post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_per_img=500, + max_pool_nms=False, + min_radius=[4, 12, 10, 1, 0.85, 0.175], + score_threshold=0.1, + pc_range=[-51.2, -51.2], + out_size_factor=4, + voxel_size=voxel_size[:2], + nms_type='rotate', + pre_max_size=1000, + post_max_size=83, + nms_thr=0.2))) diff --git a/configs/_base_/models/centerpoint_voxel01_squeeze_squeezefpn_nus.py b/configs/_base_/models/centerpoint_voxel01_squeeze_squeezefpn_nus.py new file mode 100644 index 000000000..c36e7de26 --- /dev/null +++ b/configs/_base_/models/centerpoint_voxel01_squeeze_squeezefpn_nus.py @@ -0,0 +1,46 @@ +model = dict( + type='VoxelNet', + data_preprocessor=dict( + type='Det3DDataPreprocessor', + voxel=True, + voxel_layer=dict( + max_num_points=5, + point_cloud_range=[0, -40, -3, 70.4, 40, 1], + voxel_size=[0.05, 0.05, 0.1], + max_voxels=(16000, 40000))), + voxel_encoder=dict(type='HardSimpleVFE'), + middle_encoder=dict( + type='SparseEncoder', + in_channels=4, + sparse_shape=[41, 1600, 1408], + order=('conv', 'norm', 'act')), + backbone=dict( + type='SQUEEZE', + in_channels=3, + out_channels=[64, 128, 256, 512], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), + conv_cfg=dict(type='Conv2d', bias=False)), + neck=dict( + type='SQUEEZEFPN', + in_channels=[64, 128, 256, 512], + out_channels=[256, 256, 256, 256], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), + upsample_cfg=dict(type='deconv', bias=False), + conv_cfg=dict(type='Conv2d', bias=False)), + bbox_head=dict( + type='Anchor3DHead', + num_classes=3, + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='Anchor3DRangeGenerator', + ranges=[[0, -40, -1.8, 70.4, 40, -1.8]], + sizes=[[1.6, 3.9, 1.56]], + rotations=[0, 1.57]), + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), + loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0), + loss_dir=dict(type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), + train_cfg=dict(assigner=dict(type='MaxIoUAssigner')), + test_cfg=dict(use_rotate_nms=True, nms_across_levels=False, nms_pre=1000, nms_thr=0.01, score_thr=0.1, min_bbox_size=0, max_num=500) +) diff --git a/configs/centerpoint/centerpoint_pillar02_squeeze_squeezefpn_8xb4-cyclic-20e_nus-3d.py b/configs/centerpoint/centerpoint_pillar02_squeeze_squeezefpn_8xb4-cyclic-20e_nus-3d.py new file mode 100644 index 000000000..31e3b599e --- /dev/null +++ b/configs/centerpoint/centerpoint_pillar02_squeeze_squeezefpn_8xb4-cyclic-20e_nus-3d.py @@ -0,0 +1,253 @@ +_base_ = [ + '../_base_/datasets/nus-3d.py', + '../_base_/models/centerpoint_pillar02_squeeze_squeezefpn_nus.py', + '../_base_/schedules/cyclic-20e.py', '../_base_/default_runtime.py' +] + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +# Using calibration info convert the Lidar-coordinate point cloud range to the +# ego-coordinate point cloud range could bring a little promotion in nuScenes. 
+# point_cloud_range = [-51.2, -52, -5.0, 51.2, 50.4, 3.0] +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] +data_prefix = dict(pts='samples/LIDAR_TOP', img='', sweeps='sweeps/LIDAR_TOP') +model = dict( + data_preprocessor=dict( + voxel_layer=dict(point_cloud_range=point_cloud_range)), + pts_voxel_encoder=dict(point_cloud_range=point_cloud_range), + pts_bbox_head=dict(bbox_coder=dict(pc_range=point_cloud_range[:2])), + # model training and testing settings + train_cfg=dict(pts=dict(point_cloud_range=point_cloud_range)), + test_cfg=dict(pts=dict(pc_range=point_cloud_range[:2]))) + +dataset_type = 'NuScenesDataset' +data_root = 'data/nuscenes/' +backend_args = None + +db_sampler = dict( + data_root=data_root, + info_path=data_root + 'nuscenes_dbinfos_train.pkl', + rate=1.0, + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict( + car=5, + truck=5, + bus=5, + trailer=5, + construction_vehicle=5, + traffic_cone=5, + barrier=5, + motorcycle=5, + bicycle=5, + pedestrian=5)), + classes=class_names, + sample_groups=dict( + car=2, + truck=3, + construction_vehicle=7, + bus=4, + trailer=6, + barrier=2, + motorcycle=6, + bicycle=6, + pedestrian=2, + traffic_cone=2), + points_loader=dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + backend_args=backend_args), + backend_args=backend_args) + +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + backend_args=backend_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=9, + use_dim=[0, 1, 2, 3, 4], + pad_empty_sweeps=True, + remove_close=True, + backend_args=backend_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.3925, 0.3925], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0]), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='PointShuffle'), + dict( + type='Pack3DDetInputs', + keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + backend_args=backend_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=9, + use_dim=[0, 1, 2, 3, 4], + pad_empty_sweeps=True, + remove_close=True, + backend_args=backend_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D') + ]), + dict(type='Pack3DDetInputs', keys=['points']) +] + +train_dataloader = dict( + batch_size=4, + dataset=dict( + ann_file='nuscenes_infos_train.pkl', + backend_args=None, + box_type_3d='LiDAR', + data_prefix=dict( + img='', pts='samples/LIDAR_TOP', sweeps='sweeps/LIDAR_TOP'), + data_root='data/nuscenes/', + metainfo=dict(classes=[ + 'car', + 'truck', + 'construction_vehicle', + 'bus', + 'trailer', + 'barrier', + 'motorcycle', + 'bicycle', + 'pedestrian', + 'traffic_cone', + ]), + 
modality=dict(use_camera=False, use_lidar=True), + pipeline=[ + dict( + backend_args=None, + coord_type='LIDAR', + load_dim=5, + type='LoadPointsFromFile', + use_dim=5), + dict( + backend_args=None, + pad_empty_sweeps=True, + remove_close=True, + sweeps_num=9, + type='LoadPointsFromMultiSweeps', + use_dim=[ + 0, + 1, + 2, + 3, + 4, + ]), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True), + dict( + rot_range=[ + -0.3925, + 0.3925, + ], + scale_ratio_range=[ + 0.95, + 1.05, + ], + translation_std=[ + 0, + 0, + 0, + ], + type='GlobalRotScaleTrans'), + dict( + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5, + sync_2d=False, + type='RandomFlip3D'), + dict( + point_cloud_range=[ + -51.2, + -51.2, + -5.0, + 51.2, + 51.2, + 3.0, + ], + type='PointsRangeFilter'), + dict( + point_cloud_range=[ + -51.2, + -51.2, + -5.0, + 51.2, + 51.2, + 3.0, + ], + type='ObjectRangeFilter'), + dict( + classes=[ + 'car', + 'truck', + 'construction_vehicle', + 'bus', + 'trailer', + 'barrier', + 'motorcycle', + 'bicycle', + 'pedestrian', + 'traffic_cone', + ], + type='ObjectNameFilter'), + dict(type='PointShuffle'), + dict( + keys=[ + 'points', + 'gt_bboxes_3d', + 'gt_labels_3d', + ], + type='Pack3DDetInputs'), + ], + test_mode=False, + type='NuScenesDataset', + use_valid_flag=True), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +test_dataloader = dict( + dataset=dict(pipeline=test_pipeline, metainfo=dict(classes=class_names))) +val_dataloader = dict( + dataset=dict(pipeline=test_pipeline, metainfo=dict(classes=class_names))) + +train_cfg = dict(by_epoch=True, max_epochs=20, val_interval=20) diff --git a/configs/centerpoint/centerpoint_voxel01_squeeze_squeezefpn_8xb4-cyclic-20e_nus-3d.py b/configs/centerpoint/centerpoint_voxel01_squeeze_squeezefpn_8xb4-cyclic-20e_nus-3d.py new file mode 100644 index 000000000..6c423cbbc --- /dev/null +++ b/configs/centerpoint/centerpoint_voxel01_squeeze_squeezefpn_8xb4-cyclic-20e_nus-3d.py @@ -0,0 +1,160 @@ +_base_ = [ + '../_base_/datasets/nus-3d.py', + '../_base_/models/centerpoint_voxel01_squeeze_squeezefpn_nus.py', + '../_base_/schedules/cyclic-20e.py', '../_base_/default_runtime.py' +] + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +# Using calibration info convert the Lidar-coordinate point cloud range to the +# ego-coordinate point cloud range could bring a little promotion in nuScenes. 
+# point_cloud_range = [-51.2, -52, -5.0, 51.2, 50.4, 3.0] +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] +data_prefix = dict(pts='samples/LIDAR_TOP', img='', sweeps='sweeps/LIDAR_TOP') +model = dict( + data_preprocessor=dict( + voxel_layer=dict(point_cloud_range=point_cloud_range)), + pts_bbox_head=dict(bbox_coder=dict(pc_range=point_cloud_range[:2])), + # model training and testing settings + train_cfg=dict(pts=dict(point_cloud_range=point_cloud_range)), + test_cfg=dict(pts=dict(pc_range=point_cloud_range[:2]))) + +dataset_type = 'NuScenesDataset' +data_root = 'data/nuscenes/' +backend_args = None + +db_sampler = dict( + data_root=data_root, + info_path=data_root + 'nuscenes_dbinfos_train.pkl', + rate=1.0, + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict( + car=5, + truck=5, + bus=5, + trailer=5, + construction_vehicle=5, + traffic_cone=5, + barrier=5, + motorcycle=5, + bicycle=5, + pedestrian=5)), + classes=class_names, + sample_groups=dict( + car=2, + truck=3, + construction_vehicle=7, + bus=4, + trailer=6, + barrier=2, + motorcycle=6, + bicycle=6, + pedestrian=2, + traffic_cone=2), + points_loader=dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + backend_args=backend_args), + backend_args=backend_args) + +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + backend_args=backend_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=9, + use_dim=[0, 1, 2, 3, 4], + pad_empty_sweeps=True, + remove_close=True, + backend_args=backend_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.3925, 0.3925], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0]), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='PointShuffle'), + dict( + type='Pack3DDetInputs', + keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + backend_args=backend_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=9, + use_dim=[0, 1, 2, 3, 4], + pad_empty_sweeps=True, + remove_close=True, + backend_args=backend_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range) + ]), + dict(type='Pack3DDetInputs', keys=['points']) +] + +train_dataloader = dict( + _delete_=True, + batch_size=4, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CBGSDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='nuscenes_infos_train.pkl', + pipeline=train_pipeline, + metainfo=dict(classes=class_names), + test_mode=False, + data_prefix=data_prefix, + use_valid_flag=True, + 
# we use box_type_3d='LiDAR' in KITTI and nuScenes datasets
+            # and box_type_3d='Depth' in SUN RGB-D and ScanNet datasets.
+            box_type_3d='LiDAR',
+            backend_args=backend_args)))
+test_dataloader = dict(
+    dataset=dict(pipeline=test_pipeline, metainfo=dict(classes=class_names)))
+val_dataloader = dict(
+    dataset=dict(pipeline=test_pipeline, metainfo=dict(classes=class_names)))
+
+train_cfg = dict(val_interval=20)
diff --git a/mmdet3d/models/backbones/__init__.py b/mmdet3d/models/backbones/__init__.py
index 64102bec1..c00d1984e 100644
--- a/mmdet3d/models/backbones/__init__.py
+++ b/mmdet3d/models/backbones/__init__.py
@@ -12,10 +12,11 @@
 from .pointnet2_sa_ssg import PointNet2SASSG
 from .second import SECOND
 from .spvcnn_backone import MinkUNetBackboneV2, SPVCNNBackbone
+from .squeezenet import SQUEEZE
 
 __all__ = [
     'ResNet', 'ResNetV1d', 'ResNeXt', 'SSDVGG', 'HRNet', 'NoStemRegNet',
     'SECOND', 'DGCNNBackbone', 'PointNet2SASSG', 'PointNet2SAMSG',
     'MultiBackbone', 'DLANet', 'MinkResNet', 'Asymm3DSpconv',
-    'MinkUNetBackbone', 'SPVCNNBackbone', 'MinkUNetBackboneV2'
+    'MinkUNetBackbone', 'SPVCNNBackbone', 'MinkUNetBackboneV2', 'SQUEEZE'
 ]
diff --git a/mmdet3d/models/backbones/squeezenet.py b/mmdet3d/models/backbones/squeezenet.py
new file mode 100644
index 000000000..c42393b2d
--- /dev/null
+++ b/mmdet3d/models/backbones/squeezenet.py
@@ -0,0 +1,112 @@
+import warnings
+from typing import Optional, Sequence
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import build_conv_layer, build_norm_layer
+from mmengine.model import BaseModule
+
+from mmdet3d.registry import MODELS
+
+
+@MODELS.register_module()
+class SQUEEZE(BaseModule):
+    """Backbone network based on the SqueezeNet architecture.
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (list[int]): Output channels of the multi-scale
+            feature maps.
+        norm_cfg (dict): Config dict of normalization layers.
+        conv_cfg (dict): Config dict of convolutional layers.
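+        init_cfg (dict, optional): Initialization config dict. Defaults to
+            Kaiming initialization of Conv2d layers when neither
+            ``init_cfg`` nor ``pretrained`` is given.
+        pretrained (str, optional): Deprecated; use ``init_cfg`` with
+            ``type='Pretrained'`` instead.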
+    """
+
+    def __init__(self,
+                 in_channels: int = 3,
+                 out_channels: Sequence[int] = [64, 128, 256, 512],
+                 norm_cfg: dict = dict(type='BN', eps=1e-3, momentum=0.01),
+                 conv_cfg: dict = dict(type='Conv2d', bias=False),
+                 init_cfg: Optional[dict] = None,
+                 pretrained: Optional[str] = None) -> None:
+        super(SQUEEZE, self).__init__(init_cfg=init_cfg)
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+
+        # Stem conv followed by the SqueezeNet fire modules.
+        self.features = nn.Sequential(
+            build_conv_layer(conv_cfg, in_channels, 64, kernel_size=3, stride=2),
+            build_norm_layer(norm_cfg, 64)[1],
+            nn.ReLU(inplace=True),
+            nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
+            self._make_fire_module(64, 16, 64, 64),
+            self._make_fire_module(128, 16, 64, 64),
+            self._make_fire_module(128, 32, 128, 128),
+            nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
+            self._make_fire_module(256, 32, 128, 128),
+            self._make_fire_module(256, 48, 192, 192),
+            self._make_fire_module(384, 48, 192, 192),
+            self._make_fire_module(384, 64, 256, 256),
+            nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
+            self._make_fire_module(512, 64, 256, 256),
+        )
+
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+        elif self.init_cfg is None:
+            self.init_cfg = dict(type='Kaiming', layer='Conv2d')
+
+    def _make_fire_module(self, in_channels, squeeze_channels,
+                          expand1x1_channels, expand3x3_channels):
+        layers = nn.Sequential()
+
+        # Squeeze layer
+        squeeze = nn.Sequential(
+            build_conv_layer(self.conv_cfg, in_channels, squeeze_channels,
+                             kernel_size=1),
+            build_norm_layer(self.norm_cfg, squeeze_channels)[1],
+            nn.ReLU(inplace=True))
+        layers.add_module('squeeze', squeeze)
+
+        # Expand 1x1 layer
+        expand1x1 = nn.Sequential(
+            build_conv_layer(self.conv_cfg, squeeze_channels,
+                             expand1x1_channels, kernel_size=1),
+            build_norm_layer(self.norm_cfg, expand1x1_channels)[1],
+            nn.ReLU(inplace=True))
+        layers.add_module('expand1x1', expand1x1)
+
+        # Expand 3x3 layer
+        expand3x3 = nn.Sequential(
+            build_conv_layer(self.conv_cfg, squeeze_channels,
+                             expand3x3_channels, kernel_size=3, padding=1),
+            build_norm_layer(self.norm_cfg, expand3x3_channels)[1],
+            nn.ReLU(inplace=True))
+        layers.add_module('expand3x3', expand3x3)
+
+        # The expand outputs are concatenated in ``forward``.
+        return layers
+
+    def forward(self, x):
+        """Forward function that concatenates the expand paths of each fire
+        module and collects the multi-scale outputs."""
+        # The stem conv is applied first; the remaining layers are handled
+        # in the loop below.
+        x = self.features[0](x)
+        # Indices whose outputs form the multi-scale features
+        # (64, 128, 256 and 512 channels respectively).
+        targeted_layers = [1, 5, 8, 13]
+        outs = []
+        for idx, layer in enumerate(self.features[1:], 1):
+            if isinstance(layer, nn.Sequential) and 'squeeze' in layer._modules:
+                # This is a fire module: run the squeeze layer, then both
+                # expand paths, and concatenate their outputs.
+                squeeze_output = layer.squeeze(x)
+                x1 = layer.expand1x1(squeeze_output)
+                x3 = layer.expand3x3(squeeze_output)
+                x = torch.cat([x1, x3], 1)
+            else:
+                # Normal layer
+                x = layer(x)
+            if idx in targeted_layers:
+                outs.append(x)
+        return outs
diff --git a/mmdet3d/models/layers/norm.py b/mmdet3d/models/layers/norm.py
index 9a8527872..03f4950f1 100644
--- a/mmdet3d/models/layers/norm.py
+++ b/mmdet3d/models/layers/norm.py
@@ -120,6 +120,8 @@ def forward(self, input: Tensor) -> Tensor:
 
         Returns:
            Tensor: Has shape (N, C, H, W), same shape as input.
         """
+        if input.dtype == torch.float16:
+            input = input.to(torch.float32)  # upcast fp16 inputs; the sync stats below require fp32
         assert input.dtype == torch.float32, \
             f'input should be in float32 type, got {input.dtype}'
         using_dist = dist.is_available() and dist.is_initialized()
diff --git a/mmdet3d/models/necks/__init__.py b/mmdet3d/models/necks/__init__.py
index 53b885cb1..fb60020e4 100644
--- a/mmdet3d/models/necks/__init__.py
+++ b/mmdet3d/models/necks/__init__.py
@@ -5,8 +5,9 @@
 from .imvoxel_neck import IndoorImVoxelNeck, OutdoorImVoxelNeck
 from .pointnet2_fp_neck import PointNetFPNeck
 from .second_fpn import SECONDFPN
+from .squeeze_fpn import SQUEEZEFPN
 
 __all__ = [
     'FPN', 'SECONDFPN', 'OutdoorImVoxelNeck', 'PointNetFPNeck', 'DLANeck',
-    'IndoorImVoxelNeck'
+    'IndoorImVoxelNeck', 'SQUEEZEFPN'
 ]
diff --git a/mmdet3d/models/necks/squeeze_fpn.py b/mmdet3d/models/necks/squeeze_fpn.py
new file mode 100644
index 000000000..7308e61f3
--- /dev/null
+++ b/mmdet3d/models/necks/squeeze_fpn.py
@@ -0,0 +1,110 @@
+from mmengine.model import BaseModule
+from torch import nn
+
+from mmdet3d.registry import MODELS
+
+
+class LastLevelMaxPool(nn.Module):
+    """Subsamples the finest level with a kernel-1, stride-2 max pool."""
+
+    def __init__(self):
+        super(LastLevelMaxPool, self).__init__()
+        self.pool = nn.MaxPool2d(kernel_size=1, stride=2, padding=0)
+
+    def forward(self, x):
+        return self.pool(x)
+
+
+@MODELS.register_module()
+class SQUEEZEFPN(BaseModule):
+    """Feature pyramid network for the SqueezeNet backbone.
+
+    Args:
+        in_channels (list[int]): Input channels of multi-scale feature maps.
+        out_channels (list[int]): Output channels of feature maps.
+        norm_cfg (dict): Config dict of normalization layers. Accepted for
+            config compatibility; the lateral and FPN convs use plain
+            ``nn.Conv2d``.
+        upsample_cfg (dict): Config dict of upsample layers. Accepted for
+            config compatibility.
+        conv_cfg (dict): Config dict of conv layers. Accepted for config
+            compatibility.
+        init_cfg (dict or :obj:`ConfigDict` or list[dict or :obj:`ConfigDict`],
+            optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 in_channels=[64, 128, 256, 512],
+                 out_channels=[256, 256, 256, 256],
+                 norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
+                 upsample_cfg=dict(type='deconv', bias=False),
+                 conv_cfg=dict(type='Conv2d', bias=False),
+                 init_cfg=None):
+        super(SQUEEZEFPN, self).__init__(init_cfg=init_cfg)
+        assert len(out_channels) == len(in_channels)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        # 1x1 convs that project every input level to a common width.
+        self.lateral_convs = nn.ModuleList([
+            nn.Conv2d(in_channel, out_channels[0], kernel_size=1)
+            for in_channel in in_channels
+        ])
+        # 3x3 convs applied after the top-down merge.
+        self.fpn_convs = nn.ModuleList([
+            nn.Conv2d(out_channels[0], out_channels[0], kernel_size=3, padding=1)
+            for _ in range(len(in_channels))
+        ])
+        self.last_level_pool = LastLevelMaxPool()
+
+    def forward(self, x):
+        """Forward function.
+
+        Args:
+            x (List[torch.Tensor]): Multi-level features with 4D Tensor in
+                (N, C, H, W) shape.
+
+        Returns:
+            list[torch.Tensor]: Multi-level feature maps.
+        """
+        assert len(x) == len(self.in_channels)
+
+        # Project every level to a common channel width.
+        lateral_features = [
+            lateral_conv(feat)
+            for lateral_conv, feat in zip(self.lateral_convs, x)
+        ]
+
+        # Top-down pathway: upsample the coarser level and add it to the
+        # finer level below it.
+        for i in range(len(lateral_features) - 2, -1, -1):
+            target_size = lateral_features[i].size()[2:]
+            lateral_features[i] += nn.functional.interpolate(
+                lateral_features[i + 1], size=target_size, mode='nearest')
+
+        # Apply the FPN convolutions
+        fpn_features = [
+            fpn_conv(feat)
+            for fpn_conv, feat in zip(self.fpn_convs, lateral_features)
+        ]
+        pool = self.last_level_pool(lateral_features[0])
+        return [pool]
From 7e16ec14c54f323d95493317f497801b2a603a23 Mon Sep 17 00:00:00 2001
From: Aravind Singh Bisht <166579628+aravindbisht@users.noreply.github.com>
Date: Wed, 14 May 2025 13:59:27 -0700
Subject: [PATCH 02/12] fix: update squeeze_fpn

---
 mmdet3d/models/necks/squeeze_fpn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mmdet3d/models/necks/squeeze_fpn.py b/mmdet3d/models/necks/squeeze_fpn.py
index 7308e61f3..1d50277e4 100644
--- a/mmdet3d/models/necks/squeeze_fpn.py
+++ b/mmdet3d/models/necks/squeeze_fpn.py
@@ -107,4 +107,4 @@ def forward(self, x):
             for fpn_conv, feat in zip(self.fpn_convs, lateral_features)
         ]
         pool = self.last_level_pool(lateral_features[0])
-        return [pool]
+        return tuple(lateral_features)
From 9324095f0c7db093c08639ab7eb5584025455de9 Mon Sep 17 00:00:00 2001
From: Aravind Singh Bisht <166579628+aravindbisht@users.noreply.github.com>
Date: Wed, 14 May 2025 14:02:16 -0700
Subject: [PATCH 03/12] feat: custom firerpfnet backbone for lidar

---
 mmdet3d/models/backbones/fire_rpfnet.py | 98 +++++++++++++++++++++++++
 1 file changed, 98 insertions(+)
 create mode 100644 mmdet3d/models/backbones/fire_rpfnet.py

diff --git a/mmdet3d/models/backbones/fire_rpfnet.py b/mmdet3d/models/backbones/fire_rpfnet.py
new file mode 100644
index 000000000..ae5a9d734
--- /dev/null
+++ b/mmdet3d/models/backbones/fire_rpfnet.py
@@ -0,0 +1,98 @@
+import torch
+from torch import nn
+from mmcv.cnn import build_norm_layer
+from mmdet3d.registry import MODELS
+
+
+class FireBlock(nn.Module):
+    """SqueezeNet-style fire module with a residual shortcut.
+
+    Args:
+        in_ch (int): Number of input channels.
+        out_ch (int): Number of output channels after concatenating the
+            two expand paths.
+        norm_cfg (dict): Normalisation config.
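+
+    Note:
+        The squeeze layer width is ``max(16, in_ch // 4)`` and each expand
+        path emits ``out_ch // 2`` channels, so ``out_ch`` should be even.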
+ """ + + def __init__(self, in_ch, out_ch, norm_cfg): + super().__init__() + squeeze_ch = max(16, in_ch // 4) + self.squeeze = nn.Conv2d(in_ch, squeeze_ch, kernel_size=1, bias=False) + self.squeeze_bn = build_norm_layer(norm_cfg, squeeze_ch)[1] + # expand paths + self.expand1x1 = nn.Conv2d(squeeze_ch, out_ch // 2, 1, bias=False) + self.expand3x3 = nn.Conv2d(squeeze_ch, out_ch // 2, 3, padding=1, bias=False) + self.expand_bn = build_norm_layer(norm_cfg, out_ch)[1] + self.act = nn.ReLU(inplace=True) + self.downsample = None + if in_ch != out_ch: + self.downsample = nn.Sequential( + nn.Conv2d(in_ch, out_ch, 1, bias=False), + build_norm_layer(norm_cfg, out_ch)[1], + ) + + def forward(self, x): + identity = x + x = self.act(self.squeeze_bn(self.squeeze(x))) + out1 = self.expand1x1(x) + out3 = self.expand3x3(x) + out = torch.cat([out1, out3], dim=1) + out = self.expand_bn(out) + if self.downsample is not None: + identity = self.downsample(identity) + return self.act(out + identity) + + +class CBAM(nn.Module): + """Channel & spatial attention.""" + + def __init__(self, ch, reduction=16): + super().__init__() + self.mlp = nn.Sequential( + nn.AdaptiveAvgPool2d(1), + nn.Flatten(), + nn.Linear(ch, ch // reduction, bias=False), + nn.ReLU(inplace=True), + nn.Linear(ch // reduction, ch, bias=False), + nn.Sigmoid(), + ) + self.spatial = nn.Sequential( + nn.Conv2d(2, 1, 7, padding=3, bias=False), + nn.Sigmoid(), + ) + + def forward(self, x): + b, c, _, _ = x.size() + att_c = self.mlp(x).view(b, c, 1, 1) + x = x * att_c + att_s = self.spatial(torch.cat([x.mean(1, keepdim=True), x.max(1, keepdim=True)[0]], dim=1)) + return x * att_s + + +@MODELS.register_module() +class FireRPFNet(nn.Module): + """Residual FireNet backbone (SqueezeNet-inspired) with CBAM. + + Designed as a drop-in replacement for RPFNet in BEV pipelines. 
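+
+    Args:
+        in_channels (int): Number of channels of the input BEV feature map.
+        layer_channels (tuple[int]): Output channels of each Fire stage.
+        with_cbam (bool): Whether to append a CBAM block to every stage.
+        norm_cfg (dict): Config dict of normalization layers.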
+ """ + + def __init__(self, + in_channels=256, + layer_channels=(128, 256, 256, 256), + with_cbam=True, + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01)): + super().__init__() + layers = [] + ch = in_channels + for out_ch in layer_channels: + block = FireBlock(ch, out_ch, norm_cfg) + stage = [block] + if with_cbam: + stage.append(CBAM(out_ch)) + layers.append(nn.Sequential(*stage)) + ch = out_ch + self.stages = nn.ModuleList(layers) + + def forward(self, x): + for stage in self.stages: + x = stage(x) + return (x, ) \ No newline at end of file From ac7e65141c1f1bb8294a5ccfbeb737ef5dccc0be Mon Sep 17 00:00:00 2001 From: Aravind Singh Bisht <166579628+aravindbisht@users.noreply.github.com> Date: Wed, 14 May 2025 14:03:36 -0700 Subject: [PATCH 04/12] fix: adding new FireRPFNet backbone --- mmdet3d/models/backbones/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mmdet3d/models/backbones/__init__.py b/mmdet3d/models/backbones/__init__.py index c00d1984e..8d8b5e3c2 100644 --- a/mmdet3d/models/backbones/__init__.py +++ b/mmdet3d/models/backbones/__init__.py @@ -4,6 +4,7 @@ from .cylinder3d import Asymm3DSpconv from .dgcnn import DGCNNBackbone from .dla import DLANet +from .fire_rpfnet import FireRPFNet from .mink_resnet import MinkResNet from .minkunet_backbone import MinkUNetBackbone from .multi_backbone import MultiBackbone @@ -18,5 +19,6 @@ 'ResNet', 'ResNetV1d', 'ResNeXt', 'SSDVGG', 'HRNet', 'NoStemRegNet', 'SECOND', 'DGCNNBackbone', 'PointNet2SASSG', 'PointNet2SAMSG', 'MultiBackbone', 'DLANet', 'MinkResNet', 'Asymm3DSpconv', - 'MinkUNetBackbone', 'SPVCNNBackbone', 'MinkUNetBackboneV2','SQUEEZE' + 'MinkUNetBackbone', 'SPVCNNBackbone', 'MinkUNetBackboneV2','SQUEEZE', + 'FireRPFNet' ] From a08c72651bc19be0b516e354be618b3e3bb67507 Mon Sep 17 00:00:00 2001 From: Aravind Singh Bisht <166579628+aravindbisht@users.noreply.github.com> Date: Wed, 14 May 2025 14:05:36 -0700 Subject: [PATCH 05/12] fix: updating camera loading for kitti --- mmdet3d/datasets/transforms/loading.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mmdet3d/datasets/transforms/loading.py b/mmdet3d/datasets/transforms/loading.py index 383c44536..3b8cb0e33 100644 --- a/mmdet3d/datasets/transforms/loading.py +++ b/mmdet3d/datasets/transforms/loading.py @@ -244,6 +244,9 @@ def transform(self, results: dict) -> dict: if 'CAM2' in results['images']: filename = results['images']['CAM2']['img_path'] results['cam2img'] = results['images']['CAM2']['cam2img'] + elif 'CAM_FRONT' in results['images']: + filename = results['images']['CAM_FRONT']['img_path'] + results['cam2img'] = results['images']['CAM_FRONT']['cam2img'] elif len(list(results['images'].keys())) == 1: camera_type = list(results['images'].keys())[0] filename = results['images'][camera_type]['img_path'] From 49352970bfe13078fc51274d389bce96620a7faf Mon Sep 17 00:00:00 2001 From: Aravind Singh Bisht <166579628+aravindbisht@users.noreply.github.com> Date: Wed, 14 May 2025 14:07:02 -0700 Subject: [PATCH 06/12] fix: updating multi_modality_det3d_inferencer --- .../multi_modality_det3d_inferencer.py | 206 ++++++++++++------ 1 file changed, 144 insertions(+), 62 deletions(-) diff --git a/mmdet3d/apis/inferencers/multi_modality_det3d_inferencer.py b/mmdet3d/apis/inferencers/multi_modality_det3d_inferencer.py index 6717bb18c..ecc4295ef 100644 --- a/mmdet3d/apis/inferencers/multi_modality_det3d_inferencer.py +++ b/mmdet3d/apis/inferencers/multi_modality_det3d_inferencer.py @@ -74,95 +74,177 @@ def _inputs_to_list(self, - dict: the value with key 
'points' is - Directory path: return all files in the directory - other cases: return a list containing the string. The string - could be a path to file, a url or other types of string according - to the task. + could be a path to file, a url or other types of string according + to the task. Args: inputs (Union[dict, list]): Inputs for the inferencer. + cam_type (str): Camera type. Defaults to 'CAM2'. Returns: list: List of input for the :meth:`preprocess`. """ + processed_inputs_list = [] + if isinstance(inputs, dict): - assert 'infos' in inputs - infos = inputs.pop('infos') - - if isinstance(inputs['img'], str): - img, pcd = inputs['img'], inputs['points'] - backend = get_file_backend(img) - if hasattr(backend, 'isdir') and isdir(img) and isdir(pcd): - # Backends like HttpsBackend do not implement `isdir`, so - # only those backends that implement `isdir` could accept - # the inputs as a directory + if 'infos' not in inputs: + raise ValueError("Input dictionary must contain an 'infos' key pointing to the .pkl file.") + infos_path = inputs.pop('infos') + + # Determine the actual list of input samples + # This handles cases where 'img' and 'pcd' might be directories + current_sample_dicts = [] + if isinstance(inputs.get('img'), str) and isinstance(inputs.get('points'), str): + img_path_input, pcd_path_input = inputs['img'], inputs['points'] + # Check if these are directories + backend = get_file_backend(img_path_input) + if hasattr(backend, 'isdir') and isdir(img_path_input) and isdir(pcd_path_input): img_filename_list = list_dir_or_file( - img, list_dir=False, suffix=['.png', '.jpg']) + img_path_input, list_dir=False, suffix=['.png', '.jpg', '.jpeg', '.PNG', '.JPG', '.JPEG']) # Added more suffixes pcd_filename_list = list_dir_or_file( - pcd, list_dir=False, suffix='.bin') - assert len(img_filename_list) == len(pcd_filename_list) - - inputs = [{ - 'img': join_path(img, img_filename), - 'points': join_path(pcd, pcd_filename) - } for pcd_filename, img_filename in zip( - pcd_filename_list, img_filename_list)] - - if not isinstance(inputs, (list, tuple)): - inputs = [inputs] - - # get cam2img, lidar2cam and lidar2img from infos - info_list = mmengine.load(infos)['data_list'] - assert len(info_list) == len(inputs) - for index, input in enumerate(inputs): - data_info = info_list[index] - img_path = data_info['images'][cam_type]['img_path'] - if isinstance(input['img'], str) and \ - osp.basename(img_path) != osp.basename(input['img']): + pcd_path_input, list_dir=False, suffix='.bin') + + if len(img_filename_list) != len(pcd_filename_list): + raise ValueError( + f"Mismatch in number of images ({len(img_filename_list)}) and " + f"point cloud files ({len(pcd_filename_list)}) " + f"in directories '{img_path_input}' and '{pcd_path_input}'.") + + for pcd_filename, img_filename in zip(pcd_filename_list, img_filename_list): + current_sample_dicts.append({ + 'img': join_path(img_path_input, img_filename), + 'points': join_path(pcd_path_input, pcd_filename) + }) + else: # Assume single file paths if not directories + current_sample_dicts = [inputs.copy()] # Use a copy of the original input dict + elif not isinstance(inputs, (list, tuple)): # If inputs['img'] wasn't a string, but inputs itself is a dict. + current_sample_dicts = [inputs.copy()] + else: # This case should ideally not be hit if input 'inputs' is a dict. 
+ raise ValueError("Unexpected structure for 'inputs' dictionary.") + + + all_info_data = mmengine.load(infos_path)['data_list'] + + for single_input_sample_dict in current_sample_dicts: + if 'img' not in single_input_sample_dict or not isinstance(single_input_sample_dict['img'], str): + raise ValueError(f"Each input sample must have an 'img' key with a string path. Problematic sample: {single_input_sample_dict}") + + input_img_basename = osp.basename(single_input_sample_dict['img']) + found_data_info = None + + for data_info_candidate in all_info_data: + if 'images' not in data_info_candidate or \ + cam_type not in data_info_candidate['images'] or \ + 'img_path' not in data_info_candidate['images'][cam_type]: + # Silently skip malformed entries or log a warning + # warnings.warn(f"Skipping malformed info entry: {data_info_candidate.get('sample_idx', 'Unknown sample')}") + continue + + info_img_path = data_info_candidate['images'][cam_type]['img_path'] + if osp.basename(info_img_path) == input_img_basename: + found_data_info = data_info_candidate + break + + if found_data_info is None: + available_img_names = [ + osp.basename(info['images'][cam_type]['img_path']) + for info in all_info_data + if 'images' in info and cam_type in info['images'] and 'img_path' in info['images'][cam_type] + ] + example_names = ", ".join(list(set(available_img_names))[:5]) raise ValueError( - f'the info file of {img_path} is not provided.') + f"Could not find info for image '{input_img_basename}' (from path: {single_input_sample_dict['img']}) " + f"in '{infos_path}'. Checked {len(all_info_data)} entries. " + f"Example image basenames in info file: {example_names}" + ) + + # Add camera parameters from found_data_info to the input sample cam2img = np.asarray( - data_info['images'][cam_type]['cam2img'], dtype=np.float32) + found_data_info['images'][cam_type]['cam2img'], dtype=np.float32) lidar2cam = np.asarray( - data_info['images'][cam_type]['lidar2cam'], + found_data_info['images'][cam_type]['lidar2cam'], dtype=np.float32) - if 'lidar2img' in data_info['images'][cam_type]: + if 'lidar2img' in found_data_info['images'][cam_type]: lidar2img = np.asarray( - data_info['images'][cam_type]['lidar2img'], + found_data_info['images'][cam_type]['lidar2img'], dtype=np.float32) else: lidar2img = cam2img @ lidar2cam - input['cam2img'] = cam2img - input['lidar2cam'] = lidar2cam - input['lidar2img'] = lidar2img + + # Create a new dict for the processed input to avoid modifying the original list's dicts + processed_sample = single_input_sample_dict.copy() + processed_sample['cam2img'] = cam2img + processed_sample['lidar2cam'] = lidar2cam + processed_sample['lidar2img'] = lidar2img + processed_inputs_list.append(processed_sample) + elif isinstance(inputs, (list, tuple)): - # get cam2img, lidar2cam and lidar2img from infos - for input in inputs: - assert 'infos' in input - infos = input.pop('infos') - info_list = mmengine.load(infos)['data_list'] - assert len(info_list) == 1, 'Only support single sample' \ - 'info in `.pkl`, when input is a list.' - data_info = info_list[0] - img_path = data_info['images'][cam_type]['img_path'] - if isinstance(input['img'], str) and \ - osp.basename(img_path) != osp.basename(input['img']): + # This branch handles cases where 'inputs' is already a list of dicts. + # The original logic assumes each dict in the list has its own 'infos' + # and that this info file contains exactly one entry. 
+ # This part is kept similar to original for now, but may need adjustment + # if a global info file is to be used for list inputs too. + for single_input_item_dict in inputs: + if not isinstance(single_input_item_dict, dict) or 'infos' not in single_input_item_dict: + raise ValueError("When inputs is a list, each item must be a dict containing an 'infos' key.") + + infos_path_item = single_input_item_dict.pop('infos') + current_info_list = mmengine.load(infos_path_item)['data_list'] + + # Original code for list inputs expects one info entry per file. + # To make it search, you'd adapt the logic from the isinstance(inputs, dict) block above. + # For now, sticking to a modified version of the original assertion for clarity. + input_img_basename_item = osp.basename(single_input_item_dict['img']) + data_info_to_use = None + if len(current_info_list) == 1: + # If only one entry, check if it matches, then use it. + candidate = current_info_list[0] + if 'images' in candidate and cam_type in candidate['images'] and \ + osp.basename(candidate['images'][cam_type]['img_path']) == input_img_basename_item: + data_info_to_use = candidate + else: + raise ValueError( + f"Single info entry in '{infos_path_item}' does not match input image '{input_img_basename_item}'.") + else: + # If multiple entries, search for the right one. + for candidate in current_info_list: + if 'images' in candidate and cam_type in candidate['images'] and \ + osp.basename(candidate['images'][cam_type]['img_path']) == input_img_basename_item: + data_info_to_use = candidate + break + if data_info_to_use is None: + raise ValueError( + f"Could not find matching info for image '{input_img_basename_item}' in '{infos_path_item}' " + f"(which has {len(current_info_list)} entries) when inputs is a list.") + + # Consistency check (original) + img_path_from_info = data_info_to_use['images'][cam_type]['img_path'] + if isinstance(single_input_item_dict.get('img'), str) and \ + osp.basename(img_path_from_info) != osp.basename(single_input_item_dict['img']): raise ValueError( - f'the info file of {img_path} is not provided.') + f"Mismatch: info file '{img_path_from_info}' vs input image '{single_input_item_dict['img']}'.") + cam2img = np.asarray( - data_info['images'][cam_type]['cam2img'], dtype=np.float32) + data_info_to_use['images'][cam_type]['cam2img'], dtype=np.float32) lidar2cam = np.asarray( - data_info['images'][cam_type]['lidar2cam'], + data_info_to_use['images'][cam_type]['lidar2cam'], dtype=np.float32) - if 'lidar2img' in data_info['images'][cam_type]: + if 'lidar2img' in data_info_to_use['images'][cam_type]: lidar2img = np.asarray( - data_info['images'][cam_type]['lidar2img'], + data_info_to_use['images'][cam_type]['lidar2img'], dtype=np.float32) else: lidar2img = cam2img @ lidar2cam - input['cam2img'] = cam2img - input['lidar2cam'] = lidar2cam - input['lidar2img'] = lidar2img - - return list(inputs) + + processed_sample = single_input_item_dict.copy() + processed_sample['cam2img'] = cam2img + processed_sample['lidar2cam'] = lidar2cam + processed_sample['lidar2img'] = lidar2img + processed_inputs_list.append(processed_sample) + else: + raise TypeError(f"Unsupported input type: {type(inputs)}. 
Expected dict or list.") + + return processed_inputs_list def _init_pipeline(self, cfg: ConfigType) -> Compose: """Initialize the test pipeline.""" From ec0436adf1ed992f308d6c3a90e89cc3a9ee8ef8 Mon Sep 17 00:00:00 2001 From: Aravind Singh Bisht <166579628+aravindbisht@users.noreply.github.com> Date: Wed, 14 May 2025 14:09:01 -0700 Subject: [PATCH 07/12] feat: custom backbone based new model --- ...t_sqeezefpn_fire_rpfnet_kitti-3d-3class.py | 220 ++++++++++++++++++ 1 file changed, 220 insertions(+) create mode 100644 configs/mvxnet/mvxnet_sqeezefpn_fire_rpfnet_kitti-3d-3class.py diff --git a/configs/mvxnet/mvxnet_sqeezefpn_fire_rpfnet_kitti-3d-3class.py b/configs/mvxnet/mvxnet_sqeezefpn_fire_rpfnet_kitti-3d-3class.py new file mode 100644 index 000000000..26c139620 --- /dev/null +++ b/configs/mvxnet/mvxnet_sqeezefpn_fire_rpfnet_kitti-3d-3class.py @@ -0,0 +1,220 @@ +# MVX-Net (SqueezeFPN camera branch + FireRPFNet lidar) +# for KITTI 3-class. + +_base_ = ['../_base_/schedules/cosine.py', '../_base_/default_runtime.py'] + +# ----------------------------------------------------------------------------- +# Geometry +# ----------------------------------------------------------------------------- +voxel_size = [0.05, 0.05, 0.1] +point_cloud_range = [0, -40, -3, 70.4, 40, 1] + +# ----------------------------------------------------------------------------- +# Model +# ----------------------------------------------------------------------------- +model = dict( + type='DynamicMVXFasterRCNN', + # -------------------------------------------------- + data_preprocessor=dict( + type='Det3DDataPreprocessor', + voxel=True, + voxel_type='dynamic', + voxel_layer=dict( + max_num_points=-1, + point_cloud_range=point_cloud_range, + voxel_size=voxel_size, + max_voxels=(-1, -1)), + mean=[102.9801, 115.9465, 122.7717], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_size_divisor=32), + + # ----------------------- image branch ----------------------- + img_backbone=dict( + type='SQUEEZE', + in_channels=3, + out_channels=[64, 128, 256, 512], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), + conv_cfg=dict(type='Conv2d', bias=False)), + img_neck=dict( + type='SQUEEZEFPN', + in_channels=[64, 128, 256, 512], + out_channels=[512, 512, 512, 512], + norm_cfg=dict(type='BN', requires_grad=False)), + + # ----------------------- LiDAR voxel encoder ---------------- + pts_voxel_encoder=dict( + type='DynamicVFE', + in_channels=4, + feat_channels=[64, 64], + with_distance=False, + voxel_size=voxel_size, + with_cluster_center=True, + with_voxel_center=True, + point_cloud_range=point_cloud_range, + fusion_layer=dict( + type='PointFusion', + img_channels=512, + pts_channels=64, + mid_channels=128, + out_channels=128, + img_levels=[0, 1, 2, 3], + align_corners=False, + activate_out=True, + fuse_out=False)), + + # ----------------------- Sparse middle encoder -------------- + pts_middle_encoder=dict( + type='SparseEncoder', + in_channels=128, + sparse_shape=[41, 1600, 1408], + order=('conv', 'norm', 'act')), + + # ----------------------- FireRPFNet backbone ------------- + pts_backbone=dict( + type='FireRPFNet', + in_channels=256, # output of SparseEncoder + layer_channels=[128, 256, 256, 256], + with_cbam=True), + + pts_neck=None, # RPFNet is already deep enough + + # ----------------------- Anchor head ------------------------ + pts_bbox_head=dict( + type='Anchor3DHead', + num_classes=3, + in_channels=256, + feat_channels=256, + use_direction_classifier=True, + anchor_generator=dict( + type='Anchor3DRangeGenerator', + ranges=[ 
+ [0, -40.0, -0.6, 70.4, 40.0, -0.6], + [0, -40.0, -0.6, 70.4, 40.0, -0.6], + [0, -40.0, -1.78, 70.4, 40.0, -1.78], + ], + sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]], + rotations=[0, 1.57], + reshape_out=False), + assigner_per_size=True, + diff_rad_by_sin=True, + assign_per_class=True, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), + loss_cls=dict( + type='mmdet.FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, + loss_weight=2.0), + loss_dir=dict(type='mmdet.CrossEntropyLoss', use_sigmoid=False, + loss_weight=0.2)), + + # ----------------------- Train / Test cfg ------------------- + train_cfg=dict( + pts=dict( + assigner=[ + dict(type='Max3DIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.35, neg_iou_thr=0.2, min_pos_iou=0.2, + ignore_iof_thr=-1), + dict(type='Max3DIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.35, neg_iou_thr=0.2, min_pos_iou=0.2, + ignore_iof_thr=-1), + dict(type='Max3DIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.6, neg_iou_thr=0.45, min_pos_iou=0.45, + ignore_iof_thr=-1), + ], + allowed_border=0, pos_weight=-1, debug=False)), + + test_cfg=dict( + pts=dict(use_rotate_nms=True, nms_across_levels=False, nms_thr=0.01, + score_thr=0.1, min_bbox_size=0, nms_pre=100, max_num=50)) +) + +# ----------------------------------------------------------------------------- +# Dataset & pipelines +# ----------------------------------------------------------------------------- + +# dataset settings +dataset_type = 'KittiDataset' +data_root = 'data/kitti/' +class_names = ['Pedestrian', 'Cyclist', 'Car'] +metainfo = dict(classes=class_names) +input_modality = dict(use_lidar=True, use_camera=True) +backend_args = None + +train_pipeline = [ + dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, + backend_args=backend_args), + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, + with_bbox=True, with_label=True), + dict(type='RandomResize', scale=[(320, 96), (1280, 384)], keep_ratio=True), + dict(type='GlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], + scale_ratio_range=[0.95, 1.05], translation_std=[0.2, 0.2, 0.2]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='Pack3DDetInputs', keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d', + 'gt_bboxes', 'gt_labels']) +] + +test_pipeline = [ + dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, + backend_args=backend_args), + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='MultiScaleFlipAug3D', img_scale=(1280, 384), pts_scale_ratio=1, + flip=False, + transforms=[ + dict(type='Resize', scale=0, keep_ratio=True), + dict(type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + ]), + dict(type='Pack3DDetInputs', keys=['points', 'img']) +] + +modality = dict(use_lidar=True, use_camera=True) + +train_dataloader = dict( + batch_size=2, num_workers=2, sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict(type='RepeatDataset', times=2, 
dataset=dict(
+        type=dataset_type, data_root=data_root, modality=modality,
+        ann_file='kitti_infos_train.pkl',
+        data_prefix=dict(pts='training/velodyne_reduced', img='training/image_2'),
+        pipeline=train_pipeline, filter_empty_gt=False, metainfo=metainfo,
+        box_type_3d='LiDAR', backend_args=backend_args)))
+
+val_dataloader = dict(
+    batch_size=1, num_workers=1, sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(type=dataset_type, data_root=data_root, modality=modality,
+                 ann_file='kitti_infos_val.pkl',
+                 data_prefix=dict(pts='training/velodyne_reduced', img='training/image_2'),
+                 pipeline=test_pipeline, metainfo=metainfo, test_mode=True,
+                 box_type_3d='LiDAR', backend_args=backend_args))
+
+test_dataloader = val_dataloader
+
+# -----------------------------------------------------------------------------
+# Optimizer / Schedulers / Runtime
+# -----------------------------------------------------------------------------
+optim_wrapper = dict(optimizer=dict(lr=0.001, weight_decay=0.01),
+                     clip_grad=dict(max_norm=35, norm_type=2))
+
+val_evaluator = dict(
+    type='KittiMetric', ann_file='data/kitti/kitti_infos_val.pkl')
+
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=20, val_interval=5)
From 997227434f883e6a36d82bee5b77cfd508433b5b Mon Sep 17 00:00:00 2001
From: Aravind Singh Bisht <166579628+aravindbisht@users.noreply.github.com>
Date: Wed, 14 May 2025 14:10:15 -0700
Subject: [PATCH 08/12] README for our model

---
 .../mvxnet/SqueezeFPN_FireRPFNet_README.md    | 171 ++++++++++++++++++
 1 file changed, 171 insertions(+)
 create mode 100644 configs/mvxnet/SqueezeFPN_FireRPFNet_README.md

diff --git a/configs/mvxnet/SqueezeFPN_FireRPFNet_README.md b/configs/mvxnet/SqueezeFPN_FireRPFNet_README.md
new file mode 100644
index 000000000..88348e612
--- /dev/null
+++ b/configs/mvxnet/SqueezeFPN_FireRPFNet_README.md
@@ -0,0 +1,171 @@
+# MVX-Net with SqueezeFPN and FireRPFNet Backbones: Efficient Multi-Modal 3D Object Detection
+
+## Abstract
+
+This project focuses on developing a computationally efficient and lightweight 3D object detection model for autonomous vehicles. By experimenting with and modifying the backbone architectures of existing fusion-based models, this project aims to reduce computational demand while maintaining or improving detection accuracy. The key research explored the effectiveness of lightweight architectures such as SqueezeNet, EfficientNet, and MobileNet for images, and of VoxelNeXt and two custom models, the Residual Pillar Feature Network (RPFNet) and the Fire Residual Pillar Feature Network (FireRPFNet), for LiDAR data. RPFNet and FireRPFNet are our proposed solutions, inspired by the fire module from SqueezeNet and the Convolutional Block Attention Module (CBAM), for processing LiDAR point clouds. The potential impact of this project extends to safer and more efficient autonomous driving, contributing to the advancement of intelligent transportation systems.
+ +## Overview + +Our model introduces a novel, computationally efficient approach to 3D object detection through several key innovations: + +### Lightweight Image Processing +- **SqueezeFPN**: A highly efficient feature pyramid network adapted from SqueezeNet's architecture +- **Optimized Feature Extraction**: Carefully balanced network depth and width for optimal performance +- **Memory-Efficient Design**: Reduced parameter count while maintaining high feature quality + +### Advanced LiDAR Processing +- **FireRPFNet**: Our custom-designed backbone network that combines: + - Fire modules from SqueezeNet for efficient feature extraction + - Convolution Block Attention Module (CBAM) for enhanced feature focus + - Residual connections for improved gradient flow +- **Dynamic Voxel Feature Encoder**: Adaptive point cloud feature learning that adjusts to input density + +### Efficient Multi-Modal Fusion +- Early fusion strategy for optimal feature integration +- Balanced computational resource allocation between modalities +- Adaptive feature aggregation for robust object detection + +## Architecture + +
+The model consists of three main components: + +1. **Image Branch**: + - SqueezeFPN backbone + - Efficient feature pyramid with squeeze and expand layers + - Multi-scale feature maps [512, 512, 512, 512] + +2. **LiDAR Branch**: + - Dynamic voxel feature encoder + - FireRPFNet backbone with CBAM attention + - Efficient feature processing with Fire modules + +3. **Multi-modal Fusion**: + - Early fusion of image and point cloud features + - Point-wise feature fusion + - Adaptive feature aggregation +
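+For orientation, below is a minimal usage sketch of the LiDAR backbone. The
+dummy tensor and its (1, 256, 200, 176) shape are illustrative assumptions
+based on the KITTI voxel grid in the configuration referenced later:
+
+```python
+import torch
+
+from mmdet3d.models.backbones import FireRPFNet
+
+# BEV feature map as produced by the sparse middle encoder for the
+# [0, -40, -3, 70.4, 40, 1] range with 0.05 m voxels (illustrative shape).
+backbone = FireRPFNet(
+    in_channels=256,
+    layer_channels=(128, 256, 256, 256),
+    with_cbam=True).eval()
+
+with torch.no_grad():
+    outs = backbone(torch.rand(1, 256, 200, 176))
+
+# FireRPFNet returns a single-element tuple; the spatial size is preserved.
+assert outs[0].shape == (1, 256, 200, 176)
+```
+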
+## Results on KITTI Dataset
+
+### AP@40 IoU Results (Car)
+
+| Metric | Easy | Moderate | Hard |
+|:------:|:----:|:--------:|:----:|
+| 3D Detection | 97.37 | 91.93 | 89.53 |
+| Bird's Eye View | 97.48 | 92.29 | 89.92 |
+| 2D Detection | 95.39 | 88.64 | 84.56 |
+| AOS | 94.92 | 87.22 | 82.64 |
+
+### Comparison with State-of-the-Art
+
+#### 3D Detection AP@40 (%)
+
+| Model | Easy | Moderate | Hard |
+|:-----:|:----:|:--------:|:----:|
+| **Ours (SqueezeFPN+FireRPFNet)** | **97.37** | **91.93** | **89.53** |
+| [TRTConv](https://www.cvlibs.net/datasets/kitti/eval_object_detail.php?&result=30bdb9fd69e93886221650a590744f76bbb3d773) | 91.90 | 85.04 | 80.38 |
+| [GLENet-VR](https://paperswithcode.com/paper/glenet-boosting-3d-object-detectors-with/review/?hl=60125) | 91.67 | 83.23 | 78.43 |
+| [SE-SSD](https://paperswithcode.com/paper/cia-ssd-confident-iou-aware-single-stage/review/?hl=34051) | 91.49 | 82.54 | 77.15 |
+
+#### Bird's Eye View AP@40 (%)
+
+| Model | Easy | Moderate | Hard |
+|:-----:|:----:|:--------:|:----:|
+| **Ours (SqueezeFPN+FireRPFNet)** | **97.48** | **92.29** | **89.92** |
+| [SE-SSD](https://paperswithcode.com/paper/cia-ssd-confident-iou-aware-single-stage/review/?hl=34051) | 96.59 | 92.28 | 89.72 |
+| [TRTConv](https://www.cvlibs.net/datasets/kitti/eval_object_detail.php?&result=30bdb9fd69e93886221650a590744f76bbb3d773) | 95.55 | 92.04 | 87.23 |
+| [PV-RCNN](https://paperswithcode.com/paper/pv-rcnn-point-voxel-feature-set-abstraction/review/?hl=13979) | 94.98 | 90.65 | 86.14 |
+
+## Key Features
+
+1. **Efficient Architecture**
+   - Lightweight SqueezeFPN for image feature extraction
+   - Memory-efficient FireRPFNet for point cloud processing
+   - Dynamic voxel encoding for adaptive feature learning
+
+2. **State-of-the-Art Performance**
+   - Achieves top performance in both 3D detection and Bird's Eye View
+   - Significant improvements over existing methods
+   - Robust performance across different difficulty levels
+
+3. **Multi-modal Fusion**
   - Early fusion strategy for better feature integration
   - Effective combination of image and LiDAR information
   - Enhanced feature representation for accurate detection

## Training and Testing

### Training
```shell
# Single GPU training
python tools/train.py configs/mvxnet/mvxnet_sqeezefpn_fire_rpfnet_kitti-3d-3class.py

# Multi-GPU training
TORCH_DISTRIBUTED_DEBUG=INFO ./tools/dist_train.sh configs/mvxnet/mvxnet_sqeezefpn_fire_rpfnet_kitti-3d-3class.py 8
```

### Testing
```shell
# Single GPU testing
python tools/test.py configs/mvxnet/mvxnet_sqeezefpn_fire_rpfnet_kitti-3d-3class.py ${CHECKPOINT_FILE}

# Multi-GPU testing
./tools/dist_test.sh configs/mvxnet/mvxnet_sqeezefpn_fire_rpfnet_kitti-3d-3class.py ${CHECKPOINT_FILE} 8
```

### Inference Command Example
Update the sample file paths below to match your local setup before running:
```shell
python demo/multi_modality_demo.py \
    data/kitti/testing/velodyne/000068.bin \
    data/kitti/testing/image_2/000068.png \
    data/kitti/kitti_infos_test.pkl \
    work_dirs/mvxnet_sqeezefpn_fire_rpfnet_kitti-3d-3class/mvxnet_sqeezefpn_fire_rpfnet_kitti-3d-3class.py \
    work_dirs/mvxnet_sqeezefpn_fire_rpfnet_kitti-3d-3class/epoch_25.pth
```

### Pre-trained Model

We provide a model pre-trained for 25 epochs on the KITTI dataset:
- [Download Pre-trained Model (25 epochs)](https://drive.google.com/file/d/19h9m7tb1bX-W1ZViocx4J8knanz6PuLz/view?usp=drive_link)

#### Usage Instructions
```shell
# Directly test the pre-trained model
python tools/test.py configs/mvxnet/mvxnet_sqeezefpn_fire_rpfnet_kitti-3d-3class.py /path/to/downloaded/checkpoint.pth

# Resume training from the pre-trained model
python tools/train.py configs/mvxnet/mvxnet_sqeezefpn_fire_rpfnet_kitti-3d-3class.py --resume /path/to/downloaded/checkpoint.pth
```

## Model Configuration

The model configuration can be found in:
```
configs/mvxnet/mvxnet_sqeezefpn_fire_rpfnet_kitti-3d-3class.py
```

Key configuration details:
- Backbone: SqueezeFPN + FireRPFNet
- Training schedule: 25 epochs with cosine learning rate
- Input: Multi-modal (LiDAR point cloud + Camera image)
- Classes: Pedestrian, Cyclist, Car
- Voxel size: [0.05, 0.05, 0.1]
- Point cloud range: [0, -40, -3, 70.4, 40, 1]

A short sanity check relating the voxel size and point cloud range to the resulting grid resolution is included at the end of this README.

## Additional Resources

### Repository and Model Files
- [Complete Project Resources](https://drive.google.com/drive/folders/161GQlhavUursme3voNuQaF1YGtYi9E6a)
  - Pre-trained models
  - Logs
  - Additional configuration files
  - Supplementary materials

### Quick Access
- [25-Epoch Pre-trained Model](https://drive.google.com/file/d/19h9m7tb1bX-W1ZViocx4J8knanz6PuLz/view?usp=drive_link)
- Configuration File: `configs/mvxnet/mvxnet_sqeezefpn_fire_rpfnet_kitti-3d-3class.py`

**Note**: All resources are subject to the project's licensing terms. Please review and comply with usage guidelines.
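### Grid-Size Sanity Check

As referenced in the configuration section, the snippet below is a self-contained check (plain Python, no mmdet3d imports) of how the voxel size and point cloud range above determine the dense voxel grid. Exact rounding and off-by-one handling (sparse encoders often keep one extra height slot) is implementation-specific, so treat this as arithmetic, not the library's internal computation.

```python
# Derive the voxel grid implied by the config values above.
voxel_size = [0.05, 0.05, 0.1]  # metres per voxel in x, y, z
point_cloud_range = [0, -40, -3, 70.4, 40, 1]  # [x_min, y_min, z_min, x_max, y_max, z_max]

grid = [
    round((point_cloud_range[i + 3] - point_cloud_range[i]) / voxel_size[i])
    for i in range(3)
]
print(grid)  # [1408, 1600, 40]: a 1408 x 1600 BEV grid with 40 height bins
```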
From 0378c614630d6f65290a6cbd07265506e6e187cc Mon Sep 17 00:00:00 2001 From: Aravind Singh Bisht Date: Wed, 14 May 2025 14:28:57 -0700 Subject: [PATCH 09/12] fix: removing other models not related to this experiment --- ...erpoint_pillar02_squeeze_squeezefpn_nus.py | 91 ------- ...terpoint_voxel01_squeeze_squeezefpn_nus.py | 46 ---- ...ueeze_squeezefpn_8xb4-cyclic-20e_nus-3d.py | 253 ------------------ ...ueeze_squeezefpn_8xb4-cyclic-20e_nus-3d.py | 160 ----------- 4 files changed, 550 deletions(-) delete mode 100644 configs/_base_/models/centerpoint_pillar02_squeeze_squeezefpn_nus.py delete mode 100644 configs/_base_/models/centerpoint_voxel01_squeeze_squeezefpn_nus.py delete mode 100644 configs/centerpoint/centerpoint_pillar02_squeeze_squeezefpn_8xb4-cyclic-20e_nus-3d.py delete mode 100644 configs/centerpoint/centerpoint_voxel01_squeeze_squeezefpn_8xb4-cyclic-20e_nus-3d.py diff --git a/configs/_base_/models/centerpoint_pillar02_squeeze_squeezefpn_nus.py b/configs/_base_/models/centerpoint_pillar02_squeeze_squeezefpn_nus.py deleted file mode 100644 index 04375a2a1..000000000 --- a/configs/_base_/models/centerpoint_pillar02_squeeze_squeezefpn_nus.py +++ /dev/null @@ -1,91 +0,0 @@ -voxel_size = [0.2, 0.2, 8] -model = dict( - type='CenterPoint', - data_preprocessor=dict( - type='Det3DDataPreprocessor', - voxel=True, - voxel_layer=dict( - max_num_points=20, - voxel_size=voxel_size, - max_voxels=(30000, 40000))), - pts_voxel_encoder=dict( - type='PillarFeatureNet', - in_channels=5, - feat_channels=[64], - with_distance=False, - voxel_size=(0.2, 0.2, 8), - norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), - legacy=False), - pts_middle_encoder=dict( - type='PointPillarsScatter', in_channels=64, output_shape=(512, 512)), - pts_backbone=dict( - type='SQUEEZE', - in_channels=64, - out_channels=[64, 128, 256 , 512], - #layer_nums=[3, 5, 5], - #layer_strides=[2, 2, 2], - norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), - conv_cfg=dict(type='Conv2d', bias=False)), - pts_neck=dict( - type='SQUEEZEFPN', - in_channels=[64, 128, 256 , 512], - out_channels=[512, 512, 512, 512], - #upsample_strides=[0.5, 1, 2], - norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), - upsample_cfg=dict(type='deconv', bias=False), - #use_conv_for_no_stride=True - ), - pts_bbox_head=dict( - type='CenterHead', - in_channels=sum([128, 128, 128,128]), - #in_channels=256, - tasks=[ - dict(num_class=1, class_names=['car']), - dict(num_class=2, class_names=['truck', 'construction_vehicle']), - dict(num_class=2, class_names=['bus', 'trailer']), - dict(num_class=1, class_names=['barrier']), - dict(num_class=2, class_names=['motorcycle', 'bicycle']), - dict(num_class=2, class_names=['pedestrian', 'traffic_cone']), - ], - common_heads=dict( - reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), - share_conv_channel=64, - bbox_coder=dict( - type='CenterPointBBoxCoder', - post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], - max_num=500, - score_threshold=0.1, - out_size_factor=4, - voxel_size=voxel_size[:2], - code_size=9), - separate_head=dict( - type='SeparateHead', init_bias=-2.19, final_kernel=3), - loss_cls=dict(type='mmdet.GaussianFocalLoss', reduction='mean'), - loss_bbox=dict( - type='mmdet.L1Loss', reduction='mean', loss_weight=0.25), - norm_bbox=True), - # model training and testing settings - train_cfg=dict( - pts=dict( - grid_size=[512, 512, 1], - voxel_size=voxel_size, - out_size_factor=4, - dense_reg=1, - gaussian_overlap=0.1, - max_objs=500, - min_radius=2, - code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 
1.0, 1.0, 0.2, 0.2])), - test_cfg=dict( - pts=dict( - post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], - max_per_img=500, - max_pool_nms=False, - min_radius=[4, 12, 10, 1, 0.85, 0.175], - score_threshold=0.1, - pc_range=[-51.2, -51.2], - out_size_factor=4, - voxel_size=voxel_size[:2], - nms_type='rotate', - pre_max_size=1000, - post_max_size=83, - nms_thr=0.2))) diff --git a/configs/_base_/models/centerpoint_voxel01_squeeze_squeezefpn_nus.py b/configs/_base_/models/centerpoint_voxel01_squeeze_squeezefpn_nus.py deleted file mode 100644 index c36e7de26..000000000 --- a/configs/_base_/models/centerpoint_voxel01_squeeze_squeezefpn_nus.py +++ /dev/null @@ -1,46 +0,0 @@ -model = dict( - type='VoxelNet', - data_preprocessor=dict( - type='Det3DDataPreprocessor', - voxel=True, - voxel_layer=dict( - max_num_points=5, - point_cloud_range=[0, -40, -3, 70.4, 40, 1], - voxel_size=[0.05, 0.05, 0.1], - max_voxels=(16000, 40000))), - voxel_encoder=dict(type='HardSimpleVFE'), - middle_encoder=dict( - type='SparseEncoder', - in_channels=4, - sparse_shape=[41, 1600, 1408], - order=('conv', 'norm', 'act')), - backbone=dict( - type='SQUEEZE', - in_channels=3, - out_channels=[64, 128, 256, 512], - norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), - conv_cfg=dict(type='Conv2d', bias=False)), - neck=dict( - type='SQUEEZEFPN', - in_channels=[64, 128, 256, 512], - out_channels=[256, 256, 256, 256], - norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), - upsample_cfg=dict(type='deconv', bias=False), - conv_cfg=dict(type='Conv2d', bias=False)), - bbox_head=dict( - type='Anchor3DHead', - num_classes=3, - in_channels=256, - feat_channels=256, - anchor_generator=dict( - type='Anchor3DRangeGenerator', - ranges=[[0, -40, -1.8, 70.4, 40, -1.8]], - sizes=[[1.6, 3.9, 1.56]], - rotations=[0, 1.57]), - bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), - loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25), - loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0), - loss_dir=dict(type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), - train_cfg=dict(assigner=dict(type='MaxIoUAssigner')), - test_cfg=dict(use_rotate_nms=True, nms_across_levels=False, nms_pre=1000, nms_thr=0.01, score_thr=0.1, min_bbox_size=0, max_num=500) -) diff --git a/configs/centerpoint/centerpoint_pillar02_squeeze_squeezefpn_8xb4-cyclic-20e_nus-3d.py b/configs/centerpoint/centerpoint_pillar02_squeeze_squeezefpn_8xb4-cyclic-20e_nus-3d.py deleted file mode 100644 index 31e3b599e..000000000 --- a/configs/centerpoint/centerpoint_pillar02_squeeze_squeezefpn_8xb4-cyclic-20e_nus-3d.py +++ /dev/null @@ -1,253 +0,0 @@ -_base_ = [ - '../_base_/datasets/nus-3d.py', - '../_base_/models/centerpoint_pillar02_squeeze_squeezefpn_nus.py', - '../_base_/schedules/cyclic-20e.py', '../_base_/default_runtime.py' -] - -# If point cloud range is changed, the models should also change their point -# cloud range accordingly -point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] -# Using calibration info convert the Lidar-coordinate point cloud range to the -# ego-coordinate point cloud range could bring a little promotion in nuScenes. 
-# point_cloud_range = [-51.2, -52, -5.0, 51.2, 50.4, 3.0] -# For nuScenes we usually do 10-class detection -class_names = [ - 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', - 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' -] -data_prefix = dict(pts='samples/LIDAR_TOP', img='', sweeps='sweeps/LIDAR_TOP') -model = dict( - data_preprocessor=dict( - voxel_layer=dict(point_cloud_range=point_cloud_range)), - pts_voxel_encoder=dict(point_cloud_range=point_cloud_range), - pts_bbox_head=dict(bbox_coder=dict(pc_range=point_cloud_range[:2])), - # model training and testing settings - train_cfg=dict(pts=dict(point_cloud_range=point_cloud_range)), - test_cfg=dict(pts=dict(pc_range=point_cloud_range[:2]))) - -dataset_type = 'NuScenesDataset' -data_root = 'data/nuscenes/' -backend_args = None - -db_sampler = dict( - data_root=data_root, - info_path=data_root + 'nuscenes_dbinfos_train.pkl', - rate=1.0, - prepare=dict( - filter_by_difficulty=[-1], - filter_by_min_points=dict( - car=5, - truck=5, - bus=5, - trailer=5, - construction_vehicle=5, - traffic_cone=5, - barrier=5, - motorcycle=5, - bicycle=5, - pedestrian=5)), - classes=class_names, - sample_groups=dict( - car=2, - truck=3, - construction_vehicle=7, - bus=4, - trailer=6, - barrier=2, - motorcycle=6, - bicycle=6, - pedestrian=2, - traffic_cone=2), - points_loader=dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=[0, 1, 2, 3, 4], - backend_args=backend_args), - backend_args=backend_args) - -train_pipeline = [ - dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=5, - backend_args=backend_args), - dict( - type='LoadPointsFromMultiSweeps', - sweeps_num=9, - use_dim=[0, 1, 2, 3, 4], - pad_empty_sweeps=True, - remove_close=True, - backend_args=backend_args), - dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), - dict(type='ObjectSample', db_sampler=db_sampler), - dict( - type='GlobalRotScaleTrans', - rot_range=[-0.3925, 0.3925], - scale_ratio_range=[0.95, 1.05], - translation_std=[0, 0, 0]), - dict( - type='RandomFlip3D', - sync_2d=False, - flip_ratio_bev_horizontal=0.5, - flip_ratio_bev_vertical=0.5), - dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), - dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), - dict(type='ObjectNameFilter', classes=class_names), - dict(type='PointShuffle'), - dict( - type='Pack3DDetInputs', - keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) -] -test_pipeline = [ - dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=5, - backend_args=backend_args), - dict( - type='LoadPointsFromMultiSweeps', - sweeps_num=9, - use_dim=[0, 1, 2, 3, 4], - pad_empty_sweeps=True, - remove_close=True, - backend_args=backend_args), - dict( - type='MultiScaleFlipAug3D', - img_scale=(1333, 800), - pts_scale_ratio=1, - flip=False, - transforms=[ - dict( - type='GlobalRotScaleTrans', - rot_range=[0, 0], - scale_ratio_range=[1., 1.], - translation_std=[0, 0, 0]), - dict(type='RandomFlip3D') - ]), - dict(type='Pack3DDetInputs', keys=['points']) -] - -train_dataloader = dict( - batch_size=4, - dataset=dict( - ann_file='nuscenes_infos_train.pkl', - backend_args=None, - box_type_3d='LiDAR', - data_prefix=dict( - img='', pts='samples/LIDAR_TOP', sweeps='sweeps/LIDAR_TOP'), - data_root='data/nuscenes/', - metainfo=dict(classes=[ - 'car', - 'truck', - 'construction_vehicle', - 'bus', - 'trailer', - 'barrier', - 'motorcycle', - 'bicycle', - 'pedestrian', - 'traffic_cone', - ]), - 
modality=dict(use_camera=False, use_lidar=True), - pipeline=[ - dict( - backend_args=None, - coord_type='LIDAR', - load_dim=5, - type='LoadPointsFromFile', - use_dim=5), - dict( - backend_args=None, - pad_empty_sweeps=True, - remove_close=True, - sweeps_num=9, - type='LoadPointsFromMultiSweeps', - use_dim=[ - 0, - 1, - 2, - 3, - 4, - ]), - dict( - type='LoadAnnotations3D', - with_bbox_3d=True, - with_label_3d=True), - dict( - rot_range=[ - -0.3925, - 0.3925, - ], - scale_ratio_range=[ - 0.95, - 1.05, - ], - translation_std=[ - 0, - 0, - 0, - ], - type='GlobalRotScaleTrans'), - dict( - flip_ratio_bev_horizontal=0.5, - flip_ratio_bev_vertical=0.5, - sync_2d=False, - type='RandomFlip3D'), - dict( - point_cloud_range=[ - -51.2, - -51.2, - -5.0, - 51.2, - 51.2, - 3.0, - ], - type='PointsRangeFilter'), - dict( - point_cloud_range=[ - -51.2, - -51.2, - -5.0, - 51.2, - 51.2, - 3.0, - ], - type='ObjectRangeFilter'), - dict( - classes=[ - 'car', - 'truck', - 'construction_vehicle', - 'bus', - 'trailer', - 'barrier', - 'motorcycle', - 'bicycle', - 'pedestrian', - 'traffic_cone', - ], - type='ObjectNameFilter'), - dict(type='PointShuffle'), - dict( - keys=[ - 'points', - 'gt_bboxes_3d', - 'gt_labels_3d', - ], - type='Pack3DDetInputs'), - ], - test_mode=False, - type='NuScenesDataset', - use_valid_flag=True), - num_workers=4, - persistent_workers=True, - sampler=dict(shuffle=True, type='DefaultSampler')) -test_dataloader = dict( - dataset=dict(pipeline=test_pipeline, metainfo=dict(classes=class_names))) -val_dataloader = dict( - dataset=dict(pipeline=test_pipeline, metainfo=dict(classes=class_names))) - -train_cfg = dict(by_epoch=True, max_epochs=20, val_interval=20) diff --git a/configs/centerpoint/centerpoint_voxel01_squeeze_squeezefpn_8xb4-cyclic-20e_nus-3d.py b/configs/centerpoint/centerpoint_voxel01_squeeze_squeezefpn_8xb4-cyclic-20e_nus-3d.py deleted file mode 100644 index 6c423cbbc..000000000 --- a/configs/centerpoint/centerpoint_voxel01_squeeze_squeezefpn_8xb4-cyclic-20e_nus-3d.py +++ /dev/null @@ -1,160 +0,0 @@ -_base_ = [ - '../_base_/datasets/nus-3d.py', - '../_base_/models/centerpoint_voxel01_squeeze_squeezefpn_nus.py', - '../_base_/schedules/cyclic-20e.py', '../_base_/default_runtime.py' -] - -# If point cloud range is changed, the models should also change their point -# cloud range accordingly -point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] -# Using calibration info convert the Lidar-coordinate point cloud range to the -# ego-coordinate point cloud range could bring a little promotion in nuScenes. 
-# point_cloud_range = [-51.2, -52, -5.0, 51.2, 50.4, 3.0] -# For nuScenes we usually do 10-class detection -class_names = [ - 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', - 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' -] -data_prefix = dict(pts='samples/LIDAR_TOP', img='', sweeps='sweeps/LIDAR_TOP') -model = dict( - data_preprocessor=dict( - voxel_layer=dict(point_cloud_range=point_cloud_range)), - pts_bbox_head=dict(bbox_coder=dict(pc_range=point_cloud_range[:2])), - # model training and testing settings - train_cfg=dict(pts=dict(point_cloud_range=point_cloud_range)), - test_cfg=dict(pts=dict(pc_range=point_cloud_range[:2]))) - -dataset_type = 'NuScenesDataset' -data_root = 'data/nuscenes/' -backend_args = None - -db_sampler = dict( - data_root=data_root, - info_path=data_root + 'nuscenes_dbinfos_train.pkl', - rate=1.0, - prepare=dict( - filter_by_difficulty=[-1], - filter_by_min_points=dict( - car=5, - truck=5, - bus=5, - trailer=5, - construction_vehicle=5, - traffic_cone=5, - barrier=5, - motorcycle=5, - bicycle=5, - pedestrian=5)), - classes=class_names, - sample_groups=dict( - car=2, - truck=3, - construction_vehicle=7, - bus=4, - trailer=6, - barrier=2, - motorcycle=6, - bicycle=6, - pedestrian=2, - traffic_cone=2), - points_loader=dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=[0, 1, 2, 3, 4], - backend_args=backend_args), - backend_args=backend_args) - -train_pipeline = [ - dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=5, - backend_args=backend_args), - dict( - type='LoadPointsFromMultiSweeps', - sweeps_num=9, - use_dim=[0, 1, 2, 3, 4], - pad_empty_sweeps=True, - remove_close=True, - backend_args=backend_args), - dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), - dict(type='ObjectSample', db_sampler=db_sampler), - dict( - type='GlobalRotScaleTrans', - rot_range=[-0.3925, 0.3925], - scale_ratio_range=[0.95, 1.05], - translation_std=[0, 0, 0]), - dict( - type='RandomFlip3D', - sync_2d=False, - flip_ratio_bev_horizontal=0.5, - flip_ratio_bev_vertical=0.5), - dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), - dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), - dict(type='ObjectNameFilter', classes=class_names), - dict(type='PointShuffle'), - dict( - type='Pack3DDetInputs', - keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) -] -test_pipeline = [ - dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=5, - backend_args=backend_args), - dict( - type='LoadPointsFromMultiSweeps', - sweeps_num=9, - use_dim=[0, 1, 2, 3, 4], - pad_empty_sweeps=True, - remove_close=True, - backend_args=backend_args), - dict( - type='MultiScaleFlipAug3D', - img_scale=(1333, 800), - pts_scale_ratio=1, - flip=False, - transforms=[ - dict( - type='GlobalRotScaleTrans', - rot_range=[0, 0], - scale_ratio_range=[1., 1.], - translation_std=[0, 0, 0]), - dict(type='RandomFlip3D'), - dict( - type='PointsRangeFilter', point_cloud_range=point_cloud_range) - ]), - dict(type='Pack3DDetInputs', keys=['points']) -] - -train_dataloader = dict( - _delete_=True, - batch_size=4, - num_workers=4, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), - dataset=dict( - type='CBGSDataset', - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='nuscenes_infos_train.pkl', - pipeline=train_pipeline, - metainfo=dict(classes=class_names), - test_mode=False, - data_prefix=data_prefix, - use_valid_flag=True, - 
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset - # and box_type_3d='Depth' in sunrgbd and scannet dataset. - box_type_3d='LiDAR', - backend_args=backend_args))) -test_dataloader = dict( - dataset=dict(pipeline=test_pipeline, metainfo=dict(classes=class_names))) -val_dataloader = dict( - dataset=dict(pipeline=test_pipeline, metainfo=dict(classes=class_names))) - -train_cfg = dict(val_interval=20) From 121601ec4bd3b0ddb4e6f86ac2b7da58accead6f Mon Sep 17 00:00:00 2001 From: Aravind Singh Bisht <166579628+aravindbisht@users.noreply.github.com> Date: Wed, 14 May 2025 14:32:07 -0700 Subject: [PATCH 10/12] Update SqueezeFPN_FireRPFNet_README.md --- configs/mvxnet/SqueezeFPN_FireRPFNet_README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/configs/mvxnet/SqueezeFPN_FireRPFNet_README.md b/configs/mvxnet/SqueezeFPN_FireRPFNet_README.md index 88348e612..025c22c2d 100644 --- a/configs/mvxnet/SqueezeFPN_FireRPFNet_README.md +++ b/configs/mvxnet/SqueezeFPN_FireRPFNet_README.md @@ -122,7 +122,8 @@ python demo/multi_modality_demo.py \ data/kitti/testing/image_2/000068.png \ data/kitti/kitti_infos_test.pkl \ work_dirs/mvxnet_sqeezefpn_fire_rpfnet_kitti-3d-3class/mvxnet_sqeezefpn_fire_rpfnet_kitti-3d-3class.py \ - work_dirs/mvxnet_sqeezefpn_fire_rpfnet_kitti-3d-3class/epoch_25.pth``` + work_dirs/mvxnet_sqeezefpn_fire_rpfnet_kitti-3d-3class/epoch_25.pth +``` ### Pre-trained Model From d0bf46b6c3178d9e17724997ff67d8d2907fe203 Mon Sep 17 00:00:00 2001 From: Aravind Singh Bisht <166579628+aravindbisht@users.noreply.github.com> Date: Wed, 14 May 2025 14:48:58 -0700 Subject: [PATCH 11/12] fix: classes --- configs/mvxnet/SqueezeFPN_FireRPFNet_README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/mvxnet/SqueezeFPN_FireRPFNet_README.md b/configs/mvxnet/SqueezeFPN_FireRPFNet_README.md index 025c22c2d..fad4f1705 100644 --- a/configs/mvxnet/SqueezeFPN_FireRPFNet_README.md +++ b/configs/mvxnet/SqueezeFPN_FireRPFNet_README.md @@ -152,7 +152,7 @@ Key configuration details: - Backbone: SqueezeFPN + FireRPFNet - Training schedule: 25 epochs with cosine learning rate - Input: Multi-modal (LiDAR point cloud + Camera image) -- Classes: Car detection +- Classes: Pedestrian, Cyclist , Car - Voxel size: [0.05, 0.05, 0.1] - Point cloud range: [0, -40, -3, 70.4, 40, 1] From b41e56b1d90da472d26e27edf02757fd09140b19 Mon Sep 17 00:00:00 2001 From: Aravind Singh Bisht <166579628+aravindbisht@users.noreply.github.com> Date: Wed, 14 May 2025 14:49:34 -0700 Subject: [PATCH 12/12] Update SqueezeFPN_FireRPFNet_README.md --- configs/mvxnet/SqueezeFPN_FireRPFNet_README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/mvxnet/SqueezeFPN_FireRPFNet_README.md b/configs/mvxnet/SqueezeFPN_FireRPFNet_README.md index fad4f1705..4f08adc2c 100644 --- a/configs/mvxnet/SqueezeFPN_FireRPFNet_README.md +++ b/configs/mvxnet/SqueezeFPN_FireRPFNet_README.md @@ -1,4 +1,4 @@ -# Updated MVXNet with updated backbones(SqueezeFPN + FireRPFNet): Efficient Multi-Modal 3D Object Detection +# MVXNet with custom backbones(SqueezeFPN + FireRPFNet): Efficient Multi-Modal 3D Object Detection ## Abstract