| 1 | +"""Unit tests for CustomDINOHead.""" |
| 2 | +# Copyright (C) 2023 Intel Corporation |
| 3 | +# SPDX-License-Identifier: Apache-2.0 |
| 4 | +# |
| 5 | + |
| 6 | +import numpy as np |
| 7 | +import pytest |
| 8 | +import torch |
| 9 | +from mmcv.utils import ConfigDict |
| 10 | +from mmdet.core import build_assigner |
| 11 | +from mmdet.models.builder import build_detector |
| 12 | + |
| 13 | +from tests.test_suite.e2e_test_system import e2e_pytest_unit |
| 14 | + |
| 15 | + |
| 16 | +class TestCustomDINOHead: |
| 17 | + @pytest.fixture(autouse=True) |
| 18 | + def setup(self): |
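        # Fix the seed so the randomly initialized head behaves deterministically.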
        torch.manual_seed(5)
        cfg = ConfigDict(
            dict(
                type="CustomDINOHead",
                num_query=900,
                num_classes=80,
                in_channels=2048,
                sync_cls_avg_factor=True,
                with_box_refine=True,
                as_two_stage=True,
                transformer=dict(
                    type="CustomDINOTransformer",
                    encoder=dict(
                        type="DetrTransformerEncoder",
                        num_layers=6,
                        transformerlayers=dict(
                            type="BaseTransformerLayer",
                            attn_cfgs=dict(type="MultiScaleDeformableAttention", embed_dims=256, dropout=0.0),
                            feedforward_channels=2048,
                            ffn_dropout=0.0,
                            operation_order=("self_attn", "norm", "ffn", "norm"),
                        ),
                    ),
                    decoder=dict(
                        type="DINOTransformerDecoder",
                        num_layers=6,
                        return_intermediate=True,
                        transformerlayers=dict(
                            type="DetrTransformerDecoderLayer",
                            attn_cfgs=[
                                dict(type="MultiheadAttention", embed_dims=256, num_heads=8, dropout=0.0),
                                dict(type="MultiScaleDeformableAttention", embed_dims=256, dropout=0.0),
                            ],
                            feedforward_channels=2048,
                            ffn_dropout=0.0,
                            operation_order=("self_attn", "norm", "cross_attn", "norm", "ffn", "norm"),
                        ),
                    ),
                ),
                positional_encoding=dict(
                    type="SinePositionalEncoding", num_feats=128, normalize=True, offset=0.0, temperature=20
                ),
                loss_cls=dict(type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0),
                loss_bbox=dict(type="L1Loss", loss_weight=5.0),
                loss_iou=dict(type="GIoULoss", loss_weight=2.0),
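                # DN-DETR-style query denoising: noised copies of the ground truth
                # are fed to the decoder as extra queries during training.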
                dn_cfg=dict(
                    label_noise_scale=0.5,
                    box_noise_scale=1.0,  # 0.4 for DN-DETR
                    group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=100),
                ),
            ),
        )
        self.bbox_head = build_head(cfg)

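        # The head is built stand-alone, so attach the Hungarian matcher manually;
        # its matching costs mirror the three training losses (focal, L1, GIoU).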
        assigner_cfg = ConfigDict(
            type="HungarianAssigner",
            cls_cost=dict(type="FocalLossCost", weight=1.0),
            reg_cost=dict(type="BBoxL1Cost", weight=5.0, box_format="xywh"),
            iou_cost=dict(type="IoUCost", iou_mode="giou", weight=2.0),
        )
        self.bbox_head.assigner = build_assigner(assigner_cfg)

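        # Keep at most 300 detections per image at inference time.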
        test_cfg = dict(max_per_img=300)
        self.bbox_head.test_cfg = test_cfg

    @e2e_pytest_unit
    def test_forward_train(self):
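        # A 4-level multi-scale feature stack for a batch of two images whose
        # padded batch_input_shape is 736x760.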
        inputs = [
            torch.zeros([2, 256, 92, 95]),
            torch.zeros([2, 256, 46, 48]),
            torch.zeros([2, 256, 23, 24]),
            torch.zeros([2, 256, 12, 12]),
        ]
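        # Per-image ground-truth boxes in (x1, y1, x2, y2) pixel coordinates.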
        gt_bboxes = [
            torch.Tensor(
                [
                    [432.2500, 514.2661, 632.6323, 638.8889],
                    [361.2484, 294.9931, 558.4751, 466.9410],
                    [616.8542, 201.9204, 752.5462, 328.1207],
                    [591.6091, 386.4883, 733.6124, 571.0562],
                    [728.8790, 255.5556, 760.0000, 408.5734],
                    [713.1008, 397.5309, 760.0000, 541.0837],
                    [246.0680, 354.9383, 427.5165, 498.4911],
                    [113.5316, 361.2483, 309.1805, 517.4211],
                    [457.4950, 654.6639, 646.8326, 736.0000],
                    [132.4654, 631.0014, 187.6889, 684.6365],
                    [217.6673, 694.1015, 298.1358, 736.0000],
                    [0.0000, 583.6763, 56.7303, 672.0164],
                    [86.7088, 675.1714, 168.7551, 736.0000],
                    [173.4885, 93.0727, 253.9570, 151.4403],
                    [738.3458, 119.8903, 760.0000, 164.0603],
                    [683.1224, 522.1536, 760.0000, 736.0000],
                ]
            ),
            torch.Tensor(
                [
                    [442.0, 279.0, 544.0, 377.0],
                    [386.0, 1.0, 497.0, 108.0],
                    [288.0, 1.0, 399.0, 84.0],
                    [154.0, 1.0, 268.0, 77.0],
                    [530.0, 163.0, 625.0, 248.0],
                    [179.0, 298.0, 278.0, 398.0],
                    [275.0, 320.0, 374.0, 420.0],
                    [525.0, 394.0, 613.0, 480.0],
                    [332.0, 160.0, 463.0, 286.0],
                    [210.0, 395.0, 308.0, 480.0],
                    [141.0, 395.0, 239.0, 480.0],
                    [106.0, 225.0, 204.0, 310.0],
                    [12.0, 1.0, 148.0, 70.0],
                    [165.0, 79.0, 396.0, 247.0],
                    [483.0, 13.0, 518.0, 52.0],
                ],
            ),
        ]
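        # Class indices matching the boxes above (16 for the first image, 15 for the second).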
        gt_labels = [
            torch.Tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 2]).long(),
            torch.Tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0]).long(),
        ]
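        # Metadata in the format produced by the mmdet data pipeline;
        # batch_input_shape is the padded shape shared across the batch.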
        img_metas = [
            {
                "flip_direction": "horizontal",
                "img_shape": (736, 760, 3),
                "ori_shape": (480, 640, 3),
                "img_norm_cfg": {
                    "mean": np.array([123.675, 116.28, 103.53], dtype=np.float32),
                    "std": np.array([58.395, 57.12, 57.375], dtype=np.float32),
                    "to_rgb": False,
                },
                "scale_factor": np.array([1.5139443, 1.5144033, 1.5139443, 1.5144033], dtype=np.float32),
                "flip": True,
                "pad_shape": (736, 760, 3),
                "batch_input_shape": (736, 760),
            },
            {
                "flip_direction": "horizontal",
                "img_shape": (480, 640, 3),
                "ori_shape": (480, 640, 3),
                "img_norm_cfg": {
                    "mean": np.array([123.675, 116.28, 103.53], dtype=np.float32),
                    "std": np.array([58.395, 57.12, 57.375], dtype=np.float32),
                    "to_rgb": False,
                },
                "scale_factor": np.array([1.0, 1.0, 1.0, 1.0], dtype=np.float32),
                "flip": True,
                "pad_shape": (480, 640, 3),
                "batch_input_shape": (736, 760),
            },
        ]
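        # The DINO head returns (cls, bbox, iou) loss triplets for the 6 decoder
        # layers, their 6 denoising counterparts, and the encoder proposal head:
        # 3 * (6 + 6 + 1) = 39 entries.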
        losses = self.bbox_head.forward_train(inputs, img_metas, gt_bboxes, gt_labels)
        assert len(losses) == 39

    @e2e_pytest_unit
    def test_simple_test_bboxes(self):
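        # A 4-level feature pyramid for two images padded to 800x1067
        # (roughly strides 8, 16, 32, and 64).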
        feats = [
            torch.zeros([2, 256, 100, 134]),
            torch.zeros([2, 256, 50, 67]),
            torch.zeros([2, 256, 25, 34]),
            torch.zeros([2, 256, 13, 17]),
        ]
        img_metas = [
            {
                "ori_shape": (480, 640, 3),
                "img_shape": (800, 1067, 3),
                "pad_shape": (800, 1067, 3),
                "scale_factor": np.array([1.6671875, 1.6666666, 1.6671875, 1.6666666], dtype=np.float32),
                "flip": False,
                "flip_direction": None,
                "img_norm_cfg": {
                    "mean": np.array([123.675, 116.28, 103.53], dtype=np.float32),
                    "std": np.array([58.395, 57.12, 57.375], dtype=np.float32),
                    "to_rgb": False,
                },
                "batch_input_shape": (800, 1067),
            },
            {
                "ori_shape": (480, 640, 3),
                "img_shape": (800, 1067, 3),
                "pad_shape": (800, 1067, 3),
                "scale_factor": np.array([1.6671875, 1.6666666, 1.6671875, 1.6666666], dtype=np.float32),
                "flip": False,
                "flip_direction": None,
                "img_norm_cfg": {
                    "mean": np.array([123.675, 116.28, 103.53], dtype=np.float32),
                    "std": np.array([58.395, 57.12, 57.375], dtype=np.float32),
                    "to_rgb": False,
                },
                "batch_input_shape": (800, 1067),
            },
        ]
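        # Query denoising is training-only, so run inference in eval mode.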
        self.bbox_head.eval()
        results = self.bbox_head.simple_test_bboxes(feats, img_metas)
        assert len(results) == 2  # one (det_bboxes, det_labels) pair per image
        assert results[0][0].shape == torch.Size([300, 5])  # max_per_img boxes as (x1, y1, x2, y2, score)
        assert results[0][1].shape == torch.Size([300])  # one class label per detection