diff --git a/.gitmodules b/.gitmodules
index 3705ddf..484a9fd 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,9 @@
 [submodule "examples/quantization_aware_training/imagenet1k/deit/deit"]
 	path = examples/quantization_aware_training/imagenet1k/deit/deit
 	url = https://github.com/facebookresearch/deit.git
+[submodule "examples/post_training_quantization/coco2017/DETR/detr"]
+	path = examples/post_training_quantization/coco2017/DETR/detr
+	url = https://github.com/facebookresearch/detr.git
+[submodule "examples/quantization_aware_training/coco2017/DETR/detr"]
+	path = examples/quantization_aware_training/coco2017/DETR/detr
+	url = https://github.com/facebookresearch/detr.git
diff --git a/examples/post_training_quantization/coco2017/DETR/DETR_8w8f_visualization_mAP0399.svg b/examples/post_training_quantization/coco2017/DETR/DETR_8w8f_visualization_mAP0399.svg
new file mode 100644
index 0000000..5e92200
--- /dev/null
+++ b/examples/post_training_quantization/coco2017/DETR/DETR_8w8f_visualization_mAP0399.svg
@@ -0,0 +1,4001 @@
[SVG figure, 4001 added lines omitted: Graphviz rendering of the TensorRT engine graph for the quantized (8w8f) DETR model. The input x.1 is [1, 3, 800, 1200] FP32 NCHW; the ResNet-50 backbone runs as fused INT8 convolutions (Int8 NC/32HW32) whose residual adds produce FP16 tensors; the sine/cosine position-embedding pointwise ops and the transformer are compiled into Myelin foreign nodes; the engine ends in two FP32 outputs of shape [1, 100, 92] and [1, 100, 4].]
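The graph above is the rendering of the built TensorRT engine; the scripts in this PR stop at exporting `qDETR.onnx`, so the engine-building step itself is not included here. A minimal sketch of how such a fixed-shape INT8+FP16 engine could be produced from the exported ONNX with the TensorRT Python API (the exact builder settings behind the visualization are an assumption) is:

```python
# Hypothetical engine build (not part of this PR). Assumes TensorRT 8.x and the
# qDETR.onnx exported by main.py (fixed 1x3x800x1200 input, QDQ nodes embedded).
import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
network = builder.create_network(
    1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)

with open("qDETR.onnx", "rb") as f:
    if not parser.parse(f.read()):
        raise RuntimeError(parser.get_error(0))

config = builder.create_builder_config()
config.set_flag(trt.BuilderFlag.INT8)  # scales come from the QDQ nodes, no calibrator needed
config.set_flag(trt.BuilderFlag.FP16)  # allow FP16 where INT8 kernels are not profitable

serialized_engine = builder.build_serialized_network(network, config)
with open("qDETR.engine", "wb") as f:
    f.write(serialized_engine)
```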
diff --git a/examples/post_training_quantization/coco2017/DETR/README.md b/examples/post_training_quantization/coco2017/DETR/README.md
new file mode 100644
index 0000000..c3a5196
--- /dev/null
+++ b/examples/post_training_quantization/coco2017/DETR/README.md
@@ -0,0 +1,29 @@
+# DETR PTQ example
+
+## Preparation
+
+The `DETR` pretrained model is the checkpoint from https://github.com/facebookresearch/detr; the example downloads it automatically via `torch.hub.load`.
+
+The datasets used in this example are the train and validation splits of COCO2017, which can be downloaded from http://cocodataset.org. The corresponding cocoapi (pycocotools) should also be installed.
+
+## Usage
+
+```shell
+python3 main.py qconfig.yaml --coco_path /path/to/coco
+```
+Since masks are not well supported by ONNX, the mask-related code has been removed and the batch size is fixed to 1. Dynamic axes for the ONNX export are not supported yet.
+
+## Metrics
+
+|DETR-R50|mAP|AP50|AP75|remarks|
+|-|-|-|-|-|
+|float|0.421|0.623|0.443|baseline|
+|8w8f|0.332|0.588|0.320|minmax observer|
+|8w8f|0.404|0.612|0.421|minmax observer, float w&f for the last 2 bbox_embed layers|
+|8w8f|0.384|0.598|0.402|minmax observer, ACIQ (laplace) observer for the last bbox_embed layer|
+|8w8f|0.398|0.609|0.420|minmax observer, ACIQ (laplace) observer for the last 2 bbox_embed layers|
+
+With a fixed input shape and both INT8 and FP16 enabled, the TensorRT DETR engine reaches 118.334 QPS on an NVIDIA 2080 Ti. For a detailed visualization of the deployed engine, see
+```shell
+examples/post_training_quantization/coco2017/DETR/DETR_8w8f_visualization_mAP0399.svg
+```
\ No newline at end of file
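The 8w8f rows come from the PTQ flow in `main.py` (calibration, `calc_qparams`, evaluation, then ONNX export) with different observer settings in the qconfig. Before handing the exported model to TensorRT it can be useful to confirm that the Q/DQ structure actually made it into the graph; a small sanity check (not part of this PR, assuming the default `qDETR.onnx` output name) could be:

```python
# Hypothetical sanity check (not part of this PR): count Q/DQ nodes in the exported model.
from collections import Counter

import onnx

model = onnx.load("qDETR.onnx")  # produced by qmodel.export_onnx(...) in main.py
ops = Counter(node.op_type for node in model.graph.node)
print("QuantizeLinear nodes:", ops["QuantizeLinear"])
print("DequantizeLinear nodes:", ops["DequantizeLinear"])
print("Most common ops:", ops.most_common(10))
```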
diff --git a/examples/post_training_quantization/coco2017/DETR/detr b/examples/post_training_quantization/coco2017/DETR/detr new file mode 160000 index 0000000..8a144f8 --- /dev/null +++ b/examples/post_training_quantization/coco2017/DETR/detr @@ -0,0 +1 @@ +Subproject commit 8a144f83a287f4d3fece4acdf073f387c5af387d
diff --git a/examples/post_training_quantization/coco2017/DETR/evaluation.py b/examples/post_training_quantization/coco2017/DETR/evaluation.py new file mode 100644 index 0000000..ca812f7 --- /dev/null +++ b/examples/post_training_quantization/coco2017/DETR/evaluation.py @@ -0,0 +1,95 @@ +import torch +import os + +import util.misc as utils +from datasets.coco_eval import CocoEvaluator +from datasets.panoptic_eval import PanopticEvaluator + + + + +@torch.no_grad() +def evaluate(model, criterion, postprocessors, data_loader, base_ds, device, output_dir): + model.eval() + criterion.eval() + + metric_logger = utils.MetricLogger(delimiter=" ") + metric_logger.add_meter('class_error', utils.SmoothedValue(window_size=1, fmt='{value:.2f}')) + header = 'Test:' + + iou_types = tuple(k for k in ('segm', 'bbox') if k in postprocessors.keys()) + coco_evaluator = CocoEvaluator(base_ds, iou_types) + # coco_evaluator.coco_eval[iou_types[0]].params.iouThrs = [0, 0.1, 0.5, 0.75] + + panoptic_evaluator = None + if 'panoptic' in postprocessors.keys(): + panoptic_evaluator = PanopticEvaluator( + data_loader.dataset.ann_file, + data_loader.dataset.ann_folder, + output_dir=os.path.join(output_dir, "panoptic_eval"), + ) + + for samples, targets in metric_logger.log_every(data_loader, 10, header): + sample = samples.tensors.to(device) + targets = [{k: v.to(device) for k, v in t.items()} for t in targets] + + outputs = model(sample) + loss_dict = criterion(outputs, targets) + weight_dict = criterion.weight_dict + + # reduce losses over all GPUs for logging purposes + loss_dict_reduced = utils.reduce_dict(loss_dict) + loss_dict_reduced_scaled = {k: v * weight_dict[k] + for k, v in loss_dict_reduced.items() if k in weight_dict} + loss_dict_reduced_unscaled = {f'{k}_unscaled': v + for k, v in loss_dict_reduced.items()} + metric_logger.update(loss=sum(loss_dict_reduced_scaled.values()), + **loss_dict_reduced_scaled, + **loss_dict_reduced_unscaled) + metric_logger.update(class_error=loss_dict_reduced['class_error']) + + orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0) + results = postprocessors['bbox'](outputs, orig_target_sizes) + if 'segm' in postprocessors.keys(): + target_sizes = torch.stack([t["size"] for t in targets], dim=0) + results = postprocessors['segm'](results, outputs, orig_target_sizes, target_sizes) + res = {target['image_id'].item(): output for target, output in zip(targets, results)} + if coco_evaluator is not None: + coco_evaluator.update(res) + + if panoptic_evaluator is not None: + res_pano = postprocessors["panoptic"](outputs, target_sizes, orig_target_sizes) + for i, target in enumerate(targets): + image_id = target["image_id"].item() + file_name = f"{image_id:012d}.png" + res_pano[i]["image_id"] = image_id + res_pano[i]["file_name"] = file_name + + panoptic_evaluator.update(res_pano) + + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + if coco_evaluator is not None: +
coco_evaluator.synchronize_between_processes() + if panoptic_evaluator is not None: + panoptic_evaluator.synchronize_between_processes() + + # accumulate predictions from all images + if coco_evaluator is not None: + coco_evaluator.accumulate() + coco_evaluator.summarize() + panoptic_res = None + if panoptic_evaluator is not None: + panoptic_res = panoptic_evaluator.summarize() + stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()} + if coco_evaluator is not None: + if 'bbox' in postprocessors.keys(): + stats['coco_eval_bbox'] = coco_evaluator.coco_eval['bbox'].stats.tolist() + if 'segm' in postprocessors.keys(): + stats['coco_eval_masks'] = coco_evaluator.coco_eval['segm'].stats.tolist() + if panoptic_res is not None: + stats['PQ_all'] = panoptic_res["All"] + stats['PQ_th'] = panoptic_res["Things"] + stats['PQ_st'] = panoptic_res["Stuff"] + return stats, coco_evaluator diff --git a/examples/post_training_quantization/coco2017/DETR/main.py b/examples/post_training_quantization/coco2017/DETR/main.py new file mode 100644 index 0000000..c851f43 --- /dev/null +++ b/examples/post_training_quantization/coco2017/DETR/main.py @@ -0,0 +1,177 @@ +import argparse +import torch +import torch.nn as nn +import torch.nn.parallel +import torch.backends.cudnn as cudnn +import torch.optim +import torch.utils.data +import torch.utils.data.distributed +import detr.util.misc as utils +import sys +sys.path.append("./detr") +from detr.datasets import get_coco_api_from_dataset +from val_transform_datasets import build_dataset +from model import build +import onnx +import onnx_graphsurgeon as gs + +from sparsebit.quantization import QuantModel, parse_qconfig + +from evaluation import evaluate + +parser = argparse.ArgumentParser(description="PyTorch ImageNet Training") +parser.add_argument("qconfig", help="the path of quant config") +parser.add_argument( + "-a", + "--arch", + metavar="ARCH", + default="deit_tiny_patch16_224", + help="ViT model architecture. 
(default: deit_tiny)", +) +parser.add_argument( + "-j", + "--num_workers", + default=2, + type=int, + metavar="N", + help="number of data loading workers (default: 4)", +) +parser.add_argument( + "-b", + "--batch-size", + default=1, + type=int, + metavar="N", + help="mini-batch size (default: 64), this is the total " + "batch size of all GPUs on the current node when " + "using Data Parallel or Distributed Data Parallel", +) +parser.add_argument( + "-p", + "--print-freq", + default=10, + type=int, + metavar="N", + help="print frequency (default: 10)", +) + +# * Backbone +parser.add_argument('--backbone', default='resnet50', type=str, + help="Name of the convolutional backbone to use") +parser.add_argument('--dilation', action='store_true', + help="If true, we replace stride with dilation in the last convolutional block (DC5)") +parser.add_argument('--position_embedding', default='sine', type=str, choices=('sine', 'learned'), + help="Type of positional embedding to use on top of the image features") + + +# * Transformer +parser.add_argument('--enc_layers', default=6, type=int, + help="Number of encoding layers in the transformer") +parser.add_argument('--dec_layers', default=6, type=int, + help="Number of decoding layers in the transformer") +parser.add_argument('--dim_feedforward', default=2048, type=int, + help="Intermediate size of the feedforward layers in the transformer blocks") +parser.add_argument('--hidden_dim', default=256, type=int, + help="Size of the embeddings (dimension of the transformer)") +parser.add_argument('--dropout', default=0.1, type=float, + help="Dropout applied in the transformer") +parser.add_argument('--nheads', default=8, type=int, + help="Number of attention heads inside the transformer's attentions") +parser.add_argument('--num_queries', default=100, type=int, + help="Number of query slots") +parser.add_argument('--pre_norm', action='store_true') + +# Loss +parser.add_argument('--aux_loss', dest='aux_loss', action='store_true', + help="Enables auxiliary decoding losses (loss at each layer)") +# * Matcher +parser.add_argument('--set_cost_class', default=1, type=float, + help="Class coefficient in the matching cost") +parser.add_argument('--set_cost_bbox', default=5, type=float, + help="L1 box coefficient in the matching cost") +parser.add_argument('--set_cost_giou', default=2, type=float, + help="giou box coefficient in the matching cost") +# * Loss coefficients +parser.add_argument('--mask_loss_coef', default=1, type=float) +parser.add_argument('--dice_loss_coef', default=1, type=float) +parser.add_argument('--bbox_loss_coef', default=5, type=float) +parser.add_argument('--giou_loss_coef', default=2, type=float) +parser.add_argument('--eos_coef', default=0.1, type=float, + help="Relative classification weight of the no-object class") + +#configs for coco dataset +parser.add_argument('--dataset_file', default='coco') +parser.add_argument('--coco_path', type=str) +parser.add_argument('--masks', action='store_true', + help="Train segmentation head if the flag is provided") +parser.add_argument('--output_dir', default='', + help='path where to save, empty for no saving') + +parser.add_argument('--device', default='cuda', + help='device to use for training / testing') + +def main(): + args = parser.parse_args() + device = args.device + + # get pretrained model from https://github.com/facebookresearch/detr + model = torch.hub.load('facebookresearch/detr:main', 'detr_resnet50', pretrained=True) + model, criterion, postprocessors = build(args, model) + + qconfig = 
parse_qconfig(args.qconfig) + qmodel = QuantModel(model, config=qconfig).to(device) + + cudnn.benchmark = True + + dataset_val = build_dataset(image_set='val', args=args) + sampler_val = torch.utils.data.SequentialSampler(dataset_val) + data_loader_val = torch.utils.data.DataLoader(dataset_val, args.batch_size, sampler=sampler_val, + drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers) + base_ds = get_coco_api_from_dataset(dataset_val) + + dataset_calib = build_dataset(image_set='train', args=args) + sampler_calib = torch.utils.data.RandomSampler(dataset_calib) + data_loader_calib = torch.utils.data.DataLoader(dataset_calib, args.batch_size, sampler=sampler_calib, + drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers) + + + qmodel.eval() + with torch.no_grad(): + qmodel.prepare_calibration() + # forward calibration-set + calibration_size = 16 + cur_size = 0 + for samples, _ in data_loader_calib: + sample = samples.tensors.to(device) + qmodel(sample) + cur_size += args.batch_size + if cur_size >= calibration_size: + break + qmodel.calc_qparams() + qmodel.set_quant(w_quant=True, a_quant=True) + + test_stats, coco_evaluator = evaluate(qmodel, criterion, postprocessors, + data_loader_val, base_ds, device, args.output_dir) + + qmodel.export_onnx(torch.randn(1, 3, 800, 1200), name="qDETR.onnx") + + # graph = gs.import_onnx(onnx.load("qDETR.onnx")) + # Reshapes = [node for node in graph.nodes if node.op == "Reshape"] + # for node in Reshapes: + # if isinstance(node.inputs[1], gs.Constant): + # if node.inputs[1].values[1]==7600: + # node.inputs[1].values[1] = 8 + # elif node.inputs[1].values[1]==950: + # node.inputs[1].values[1] = 1 + # elif node.inputs[1].values[1]==100: + # node.inputs[1].values[1] = 1 + # elif node.inputs[1].values[1]==800: + # node.inputs[1].values[1] = 8 + + # onnx.save(gs.export_onnx(graph), "qDETR.onnx") + + + + +if __name__ == "__main__": + main() diff --git a/examples/post_training_quantization/coco2017/DETR/model.py b/examples/post_training_quantization/coco2017/DETR/model.py new file mode 100644 index 0000000..f9eff0a --- /dev/null +++ b/examples/post_training_quantization/coco2017/DETR/model.py @@ -0,0 +1,378 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from detr.util import box_ops +from detr.util.misc import nested_tensor_from_tensor_list, accuracy, get_world_size, interpolate, is_dist_avail_and_initialized +from detr.models.matcher import build_matcher + + +class PositionEmbeddingSine(nn.Module): + def __init__(self, model): + super().__init__() + self.model = model + dim_t =torch.arange(model.num_pos_feats, dtype=torch.float32) + self.dim_t = nn.Parameter(model.temperature ** (2 * (dim_t // 2) / model.num_pos_feats)) + + def forward(self, x): + not_mask = (x*0).sum(1,keepdims=False)+1 + y_embed = not_mask.cumsum(1) + x_embed = not_mask.cumsum(2) + if self.model.normalize: + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.model.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.model.scale + + pos_x = x_embed[:, :, :, None] / self.dim_t + pos_y = y_embed[:, :, :, None] / self.dim_t + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + +class Backbone(nn.Module): + def __init__(self, model): + super().__init__() + self.backbone = model[0] + 
self.pos_embed = PositionEmbeddingSine(model[1]) + new_bns = {} + for n, m in self.backbone.body.named_modules(): + if "bn" in n or "downsample.1" in n: + new_bn = nn.BatchNorm2d(m.weight.shape[0]) + new_bn.weight.data = m.weight + new_bn.bias.data = m.bias + new_bn.running_mean.data = m.running_mean + new_bn.running_var.data = m.running_var + new_bns[n] = new_bn + for n, m in new_bns.items(): + splited_name = n.split(".") + if len(splited_name) == 1: + setattr(self.backbone.body, n, m) + elif len(splited_name) == 3:#layera.b.bn + layer = splited_name[0] + idx = int(splited_name[1]) + name = splited_name[2] + setattr(getattr(self.backbone.body, layer)[idx], name, m) + elif len(splited_name) == 5: #downsample + layer = splited_name[0] + idx = int(splited_name[1]) + name = splited_name[2] + setattr(getattr(self.backbone.body, layer)[idx], name, nn.Sequential(getattr(getattr(self.backbone.body, layer)[idx], name)[0], m)) + + + + def forward(self, samples): + xs = self.backbone.body(samples) + out = [] + pos = [] + for name, x in xs.items(): + out.append(x) + pos.append(self.pos_embed(x).float()) + return out, pos + +class Transformer(nn.Module): + + def __init__(self, model): + super().__init__() + self.model = model + + def forward(self, src, query_embed, pos_embed): + # flatten NxCxHxW to HWxNxC + bs, c, h, w = src.shape + src = src.flatten(2).permute(2, 0, 1) + pos_embed = pos_embed.flatten(2).permute(2, 0, 1) + query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) + + tgt = torch.zeros_like(query_embed) + memory = self.model.encoder(src, pos=pos_embed) + hs = self.model.decoder(tgt, memory, + pos=pos_embed, query_pos=query_embed) + return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w) + +class DETR_Impl(nn.Module): + """ This is the DETR module that performs object detection """ + def __init__(self, model): + """ Initializes the model. + Parameters: + backbone: torch module of the backbone to be used. See backbone.py + transformer: torch module of the transformer architecture. See transformer.py + num_classes: number of object classes + num_queries: number of object queries, ie detection slot. This is the maximal number of objects + DETR can detect in a single image. For COCO, we recommend 100 queries. + aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used. + """ + super().__init__() + self.input_proj = model.input_proj + self.query_embed = model.query_embed + self.class_embed = model.class_embed + self.bbox_embed = model.bbox_embed + self.aux_loss = model.aux_loss + self.backbone = Backbone(model.backbone) + self.transformer = Transformer(model.transformer) + + def forward(self, samples): + features, pos = self.backbone(samples) + + src = features[-1] + hs = self.transformer(self.input_proj(src), self.query_embed.weight, pos[-1])[0] + + outputs_class = self.class_embed(hs) + outputs_coord = self.bbox_embed(hs).sigmoid() + out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]} + if self.aux_loss: + out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord) + return out + + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_coord): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. + return [{'pred_logits': a, 'pred_boxes': b} + for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] + + +class SetCriterion(nn.Module): + """ This class computes the loss for DETR. 
+ The process happens in two steps: + 1) we compute hungarian assignment between ground truth boxes and the outputs of the model + 2) we supervise each pair of matched ground-truth / prediction (supervise class and box) + """ + def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses): + """ Create the criterion. + Parameters: + num_classes: number of object categories, omitting the special no-object category + matcher: module able to compute a matching between targets and proposals + weight_dict: dict containing as key the names of the losses and as values their relative weight. + eos_coef: relative classification weight applied to the no-object category + losses: list of all the losses to be applied. See get_loss for list of available losses. + """ + super().__init__() + self.num_classes = num_classes + self.matcher = matcher + self.weight_dict = weight_dict + self.eos_coef = eos_coef + self.losses = losses + empty_weight = torch.ones(self.num_classes + 1) + empty_weight[-1] = self.eos_coef + self.register_buffer('empty_weight', empty_weight) + + def loss_labels(self, outputs, targets, indices, num_boxes, log=True): + """Classification loss (NLL) + targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes] + """ + assert 'pred_logits' in outputs + src_logits = outputs['pred_logits'] + + idx = self._get_src_permutation_idx(indices) + target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full(src_logits.shape[:2], self.num_classes, + dtype=torch.int64, device=src_logits.device) + target_classes[idx] = target_classes_o + + loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight) + losses = {'loss_ce': loss_ce} + + if log: + # TODO this should probably be a separate loss, not hacked in this one here + losses['class_error'] = 100 - accuracy(src_logits[idx], target_classes_o)[0] + return losses + + @torch.no_grad() + def loss_cardinality(self, outputs, targets, indices, num_boxes): + """ Compute the cardinality error, ie the absolute error in the number of predicted non-empty boxes + This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients + """ + pred_logits = outputs['pred_logits'] + device = pred_logits.device + tgt_lengths = torch.as_tensor([len(v["labels"]) for v in targets], device=device) + # Count the number of predictions that are NOT "no-object" (which is the last class) + card_pred = (pred_logits.argmax(-1) != pred_logits.shape[-1] - 1).sum(1) + card_err = F.l1_loss(card_pred.float(), tgt_lengths.float()) + losses = {'cardinality_error': card_err} + return losses + + def loss_boxes(self, outputs, targets, indices, num_boxes): + """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss + targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4] + The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size. 
+ """ + assert 'pred_boxes' in outputs + idx = self._get_src_permutation_idx(indices) + src_boxes = outputs['pred_boxes'][idx] + target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0) + + loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none') + + losses = {} + losses['loss_bbox'] = loss_bbox.sum() / num_boxes + + loss_giou = 1 - torch.diag(box_ops.generalized_box_iou( + box_ops.box_cxcywh_to_xyxy(src_boxes), + box_ops.box_cxcywh_to_xyxy(target_boxes))) + losses['loss_giou'] = loss_giou.sum() / num_boxes + return losses + + def loss_masks(self, outputs, targets, indices, num_boxes): + """Compute the losses related to the masks: the focal loss and the dice loss. + targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w] + """ + assert "pred_masks" in outputs + + src_idx = self._get_src_permutation_idx(indices) + tgt_idx = self._get_tgt_permutation_idx(indices) + src_masks = outputs["pred_masks"] + src_masks = src_masks[src_idx] + masks = [t["masks"] for t in targets] + # TODO use valid to mask invalid areas due to padding in loss + target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() + target_masks = target_masks.to(src_masks) + target_masks = target_masks[tgt_idx] + + # upsample predictions to the target size + src_masks = interpolate(src_masks[:, None], size=target_masks.shape[-2:], + mode="bilinear", align_corners=False) + src_masks = src_masks[:, 0].flatten(1) + + target_masks = target_masks.flatten(1) + target_masks = target_masks.view(src_masks.shape) + losses = { + "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_boxes), + "loss_dice": dice_loss(src_masks, target_masks, num_boxes), + } + return losses + + def _get_src_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) + src_idx = torch.cat([src for (src, _) in indices]) + return batch_idx, src_idx + + def _get_tgt_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) + tgt_idx = torch.cat([tgt for (_, tgt) in indices]) + return batch_idx, tgt_idx + + def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs): + loss_map = { + 'labels': self.loss_labels, + 'cardinality': self.loss_cardinality, + 'boxes': self.loss_boxes, + 'masks': self.loss_masks + } + assert loss in loss_map, f'do you really want to compute {loss} loss?' + return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs) + + def forward(self, outputs, targets): + """ This performs the loss computation. + Parameters: + outputs: dict of tensors, see the output specification of the model for the format + targets: list of dicts, such that len(targets) == batch_size. 
+ The expected keys in each dict depends on the losses applied, see each loss' doc + """ + outputs_without_aux = {k: v for k, v in outputs.items() if k != 'aux_outputs'} + + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets) + + # Compute the average number of target boxes accross all nodes, for normalization purposes + num_boxes = sum(len(t["labels"]) for t in targets) + num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) + if is_dist_avail_and_initialized(): + torch.distributed.all_reduce(num_boxes) + num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item() + + # Compute all the requested losses + losses = {} + for loss in self.losses: + losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes)) + + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. + if 'aux_outputs' in outputs: + for i, aux_outputs in enumerate(outputs['aux_outputs']): + indices = self.matcher(aux_outputs, targets) + for loss in self.losses: + if loss == 'masks': + # Intermediate masks losses are too costly to compute, we ignore them. + continue + kwargs = {} + if loss == 'labels': + # Logging is enabled only for the last layer + kwargs = {'log': False} + l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **kwargs) + l_dict = {k + f'_{i}': v for k, v in l_dict.items()} + losses.update(l_dict) + + return losses + + +class PostProcess(nn.Module): + """ This module converts the model's output into the format expected by the coco api""" + @torch.no_grad() + def forward(self, outputs, target_sizes): + """ Perform the computation + Parameters: + outputs: raw outputs of the model + target_sizes: tensor of dimension [batch_size x 2] containing the size of each images of the batch + For evaluation, this must be the original image size (before any data augmentation) + For visualization, this should be the image size after data augment, but before padding + """ + out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes'] + + assert len(out_logits) == len(target_sizes) + assert target_sizes.shape[1] == 2 + + prob = F.softmax(out_logits, -1) + scores, labels = prob[..., :-1].max(-1) + + # convert to [x0, y0, x1, y1] format + boxes = box_ops.box_cxcywh_to_xyxy(out_bbox) + # and from relative [0, 1] to absolute [0, height] coordinates + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) + boxes = boxes * scale_fct[:, None, :] + + results = [{'scores': s, 'labels': l, 'boxes': b} for s, l, b in zip(scores, labels, boxes)] + + return results + + +def build(args, model): + # the `num_classes` naming here is somewhat misleading. + # it indeed corresponds to `max_obj_id + 1`, where max_obj_id + # is the maximum id for a class in your dataset. For example, + # COCO has a max_obj_id of 90, so we pass `num_classes` to be 91. + # As another example, for a dataset that has a single class with id 1, + # you should pass `num_classes` to be 2 (max_obj_id + 1). 
+ # For more details on this, check the following discussion + # https://github.com/facebookresearch/detr/issues/108#issuecomment-650269223 + num_classes = 20 if args.dataset_file != 'coco' else 91 + if args.dataset_file == "coco_panoptic": + # for panoptic, we just add a num_classes that is large enough to hold + # max_obj_id + 1, but the exact value doesn't really matter + num_classes = 250 + device = torch.device(args.device) + + model = DETR_Impl(model) + + matcher = build_matcher(args) + weight_dict = {'loss_ce': 1, 'loss_bbox': args.bbox_loss_coef} + weight_dict['loss_giou'] = args.giou_loss_coef + if args.masks: + weight_dict["loss_mask"] = args.mask_loss_coef + weight_dict["loss_dice"] = args.dice_loss_coef + # TODO this is a hack + if args.aux_loss: + aux_weight_dict = {} + for i in range(args.dec_layers - 1): + aux_weight_dict.update({k + f'_{i}': v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + + losses = ['labels', 'boxes', 'cardinality'] + criterion = SetCriterion(num_classes, matcher=matcher, weight_dict=weight_dict, + eos_coef=args.eos_coef, losses=losses) + criterion.to(device) + postprocessors = {'bbox': PostProcess()} + + return model, criterion, postprocessors diff --git a/examples/post_training_quantization/coco2017/DETR/qconfig.yaml b/examples/post_training_quantization/coco2017/DETR/qconfig.yaml new file mode 100644 index 0000000..dcbf046 --- /dev/null +++ b/examples/post_training_quantization/coco2017/DETR/qconfig.yaml @@ -0,0 +1,22 @@ +BACKEND: tensorrt +SCHEDULE: + FUSE_BN: True +W: + QSCHEME: per-channel-symmetric + QUANTIZER: + TYPE: uniform + BIT: 8 + OBSERVER: + TYPE: MINMAX + SPECIFIC: [{ + "bbox_embed_layers_1": ["OBSERVER.TYPE", "aciq", "OBSERVER.ACIQ.DISTRIBUTION", "laplace"], + "bbox_embed_layers_2": ["OBSERVER.TYPE", "aciq", "OBSERVER.ACIQ.DISTRIBUTION", "laplace"] + }] +A: + QSCHEME: per-tensor-symmetric + QUANTIZER: + TYPE: uniform + BIT: 8 + OBSERVER: + TYPE: MINMAX + LAYOUT: NCHW diff --git a/examples/post_training_quantization/coco2017/DETR/val_transform_datasets.py b/examples/post_training_quantization/coco2017/DETR/val_transform_datasets.py new file mode 100644 index 0000000..77287c9 --- /dev/null +++ b/examples/post_training_quantization/coco2017/DETR/val_transform_datasets.py @@ -0,0 +1,20 @@ +from pathlib import Path +from detr.datasets.coco import CocoDetection, make_coco_transforms + +def build(image_set, args): + root = Path(args.coco_path) + assert root.exists(), f'provided COCO path {root} does not exist' + mode = 'instances' + PATHS = { + "train": (root / "train2017", root / "annotations" / f'{mode}_train2017.json'), + "val": (root / "val2017", root / "annotations" / f'{mode}_val2017.json'), + } + + img_folder, ann_file = PATHS[image_set] + dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms("val"), return_masks=args.masks) + return dataset + +def build_dataset(image_set, args): + if args.dataset_file == 'coco': + return build(image_set, args) + raise ValueError(f'dataset {args.dataset_file} not supported') \ No newline at end of file diff --git a/examples/quantization_aware_training/coco2017/DETR/README.md b/examples/quantization_aware_training/coco2017/DETR/README.md new file mode 100644 index 0000000..efb1f2b --- /dev/null +++ b/examples/quantization_aware_training/coco2017/DETR/README.md @@ -0,0 +1,20 @@ +# DETR QAT example + +## preparation + +The `DETR` pretrained model is the checkpoint from https://github.com/facebookresearch/detr . 
The example will automatically download the checkpoint using `torch.hub.load`. + +The datasets used in this example are train dataset and validation dataset of COCO2017. They can be downloaded from http://cocodataset.org. also the relative cocoapi should be installed. + +## Usage + +```shell +python -m torch.distributed.launch --nproc_per_node=8 --use_env main.py qconfig_lsq_8w8f.yaml --coco_path /path/to/coco +``` + +## Metrics + +|DETR-R50|mAPc|AP50|AP75| remarks| +|-|-|-|-|-| +|Float|0.421|0.623|0.443|baseline| +|8w8f| diff --git a/examples/quantization_aware_training/coco2017/DETR/detr b/examples/quantization_aware_training/coco2017/DETR/detr new file mode 160000 index 0000000..8a144f8 --- /dev/null +++ b/examples/quantization_aware_training/coco2017/DETR/detr @@ -0,0 +1 @@ +Subproject commit 8a144f83a287f4d3fece4acdf073f387c5af387d diff --git a/examples/quantization_aware_training/coco2017/DETR/main.py b/examples/quantization_aware_training/coco2017/DETR/main.py new file mode 100644 index 0000000..1cf2851 --- /dev/null +++ b/examples/quantization_aware_training/coco2017/DETR/main.py @@ -0,0 +1,278 @@ +import argparse +import datetime +import json +import random +import time +from pathlib import Path + +import numpy as np +from sparsebit.quantization.modules.conv import QConv2d +import torch +from torch.utils.data import DataLoader, DistributedSampler + +import sys +sys.path.append("./detr") +import detr.util.misc as utils +from detr.datasets import build_dataset, get_coco_api_from_dataset, coco +from detr.engine import evaluate, train_one_epoch +from model import build + +from sparsebit.quantization import QuantModel, parse_qconfig + +def get_args_parser(): + parser = argparse.ArgumentParser('Set transformer detector', add_help=False) + parser.add_argument("qconfig", help="the path of quant config") + parser.add_argument('--lr', default=1e-4, type=float) + parser.add_argument('--lr_backbone', default=1e-5, type=float) + parser.add_argument('--batch_size', default=1, type=int) + parser.add_argument('--weight_decay', default=1e-4, type=float) + parser.add_argument('--epochs', default=300, type=int) + parser.add_argument('--lr_drop', default=200, type=int) + parser.add_argument('--clip_max_norm', default=0.1, type=float, + help='gradient clipping max norm') + + # Model parameters + parser.add_argument('--frozen_weights', type=str, default=None, + help="Path to the pretrained model. 
If set, only the mask head will be trained") + # * Backbone + parser.add_argument('--backbone', default='resnet50', type=str, + help="Name of the convolutional backbone to use") + parser.add_argument('--dilation', action='store_true', + help="If true, we replace stride with dilation in the last convolutional block (DC5)") + parser.add_argument('--position_embedding', default='sine', type=str, choices=('sine', 'learned'), + help="Type of positional embedding to use on top of the image features") + + # * Transformer + parser.add_argument('--enc_layers', default=6, type=int, + help="Number of encoding layers in the transformer") + parser.add_argument('--dec_layers', default=6, type=int, + help="Number of decoding layers in the transformer") + parser.add_argument('--dim_feedforward', default=2048, type=int, + help="Intermediate size of the feedforward layers in the transformer blocks") + parser.add_argument('--hidden_dim', default=256, type=int, + help="Size of the embeddings (dimension of the transformer)") + parser.add_argument('--dropout', default=0.1, type=float, + help="Dropout applied in the transformer") + parser.add_argument('--nheads', default=8, type=int, + help="Number of attention heads inside the transformer's attentions") + parser.add_argument('--num_queries', default=100, type=int, + help="Number of query slots") + parser.add_argument('--pre_norm', action='store_true') + + # * Segmentation + parser.add_argument('--masks', action='store_true', + help="Train segmentation head if the flag is provided") + + # Loss + parser.add_argument('--no_aux_loss', dest='aux_loss', action='store_false', + help="Disables auxiliary decoding losses (loss at each layer)") + # * Matcher + parser.add_argument('--set_cost_class', default=1, type=float, + help="Class coefficient in the matching cost") + parser.add_argument('--set_cost_bbox', default=5, type=float, + help="L1 box coefficient in the matching cost") + parser.add_argument('--set_cost_giou', default=2, type=float, + help="giou box coefficient in the matching cost") + # * Loss coefficients + parser.add_argument('--mask_loss_coef', default=1, type=float) + parser.add_argument('--dice_loss_coef', default=1, type=float) + parser.add_argument('--bbox_loss_coef', default=5, type=float) + parser.add_argument('--giou_loss_coef', default=2, type=float) + parser.add_argument('--eos_coef', default=0.1, type=float, + help="Relative classification weight of the no-object class") + + # dataset parameters + parser.add_argument('--dataset_file', default='coco') + parser.add_argument('--coco_path', type=str) + parser.add_argument('--coco_panoptic_path', type=str) + parser.add_argument('--remove_difficult', action='store_true') + + parser.add_argument('--output_dir', default='', + help='path where to save, empty for no saving') + parser.add_argument('--device', default='cuda', + help='device to use for training / testing') + parser.add_argument('--seed', default=42, type=int) + parser.add_argument('--resume', default='', help='resume from checkpoint') + parser.add_argument('--start_epoch', default=0, type=int, metavar='N', + help='start epoch') + parser.add_argument('--eval', action='store_true') + parser.add_argument('--num_workers', default=2, type=int) + + # distributed training parameters + parser.add_argument('--world_size', default=1, type=int, + help='number of distributed processes') + parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training') + return parser + + +def main(args): + 
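+    # Overall flow of this example: build the FP32 DETR from torch.hub, wrap it in a
+    # sparsebit QuantModel, run a short calibration pass over ~16 training images
+    # (calib_size), initialise QAT via init_QAT(), then fine-tune with the standard
+    # DETR training loop (train_one_epoch / evaluate) on COCO val2017.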
utils.init_distributed_mode(args) + print("git:\n {}\n".format(utils.get_sha())) + + if args.frozen_weights is not None: + assert args.masks, "Frozen training is meant for segmentation only" + print(args) + + device = torch.device(args.device) + + # fix the seed for reproducibility + seed = args.seed + utils.get_rank() + torch.manual_seed(seed) + np.random.seed(seed) + random.seed(seed) + + model = torch.hub.load('facebookresearch/detr:main', 'detr_resnet50', pretrained=True) + model, criterion, postprocessors = build(args, model) + + qconfig = parse_qconfig(args.qconfig) + qmodel = QuantModel(model, config=qconfig).cuda() + + dataset_train = build_dataset(image_set='train', args=args) + dataset_val = build_dataset(image_set='val', args=args) + dataset_calib = build_dataset(image_set='train', args=args) + + if args.distributed: + sampler_train = DistributedSampler(dataset_train) + sampler_val = DistributedSampler(dataset_val, shuffle=False) + else: + sampler_train = torch.utils.data.RandomSampler(dataset_train) + sampler_val = torch.utils.data.SequentialSampler(dataset_val) + + batch_sampler_train = torch.utils.data.BatchSampler( + sampler_train, args.batch_size, drop_last=True) + + data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train, + collate_fn=utils.collate_fn, num_workers=args.num_workers) + data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val, + drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers) + data_loader_calib = DataLoader(dataset_calib, args.batch_size, sampler=None, + drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers) + if args.dataset_file == "coco_panoptic": + # We also evaluate AP during panoptic training, on original coco DS + coco_val = coco.build("val", args) + base_ds = get_coco_api_from_dataset(coco_val) + else: + base_ds = get_coco_api_from_dataset(dataset_val) + + # for n, m in model.model.named_modules(): + # if isinstance(m, QConv2d) and "backbone" in n: + # m.input_quantizer.set_bit(bit=4) + # m.weight_quantizer.set_bit(bit=4) + # model.model.backbone_0_body_conv1.input_quantizer.set_bit(bit=8) + # model.model.backbone_0_body_conv1.weight_quantizer.set_bit(bit=8) + + qmodel.prepare_calibration() + calib_size, cur_size = 16, 0 + qmodel.eval() + # model = qmodel._replace_complicated_operators(model).cuda() + with torch.no_grad(): + for samples, _ in data_loader_calib: + # out = model(samples.to(device)) + qmodel(samples.to(device)) + cur_size += args.batch_size + if cur_size >= calib_size: + break + qmodel.init_QAT() + + process_group = torch.distributed.new_group([i for i in range(args.world_size)]) + qmodel_without_ddp = torch.nn.SyncBatchNorm.convert_sync_batchnorm(qmodel, process_group) + + if args.distributed: + qmodel = torch.nn.parallel.DistributedDataParallel(qmodel, device_ids=[args.gpu]) + qmodel_without_ddp = qmodel.module + n_parameters = sum(p.numel() for p in qmodel.parameters() if p.requires_grad) + print('number of params:', n_parameters) + + param_dicts = [ + {"params": [p for n, p in qmodel_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad]}, + { + "params": [p for n, p in qmodel_without_ddp.named_parameters() if "backbone" in n and p.requires_grad], + "lr": args.lr_backbone, + }, + ] + optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, + weight_decay=args.weight_decay) + lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop) + + if args.frozen_weights is not None: + checkpoint = 
torch.load(args.frozen_weights, map_location='cpu') + qmodel_without_ddp.detr.load_state_dict(checkpoint['model']) + + output_dir = Path(args.output_dir) + if args.resume: + if args.resume.startswith('https'): + checkpoint = torch.hub.load_state_dict_from_url( + args.resume, map_location='cpu', check_hash=True) + else: + checkpoint = torch.load(args.resume, map_location='cpu') + qmodel_without_ddp.load_state_dict(checkpoint['model']) + if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint: + optimizer.load_state_dict(checkpoint['optimizer']) + lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) + args.start_epoch = checkpoint['epoch'] + 1 + + if args.eval: + test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, + data_loader_val, base_ds, device, args.output_dir) + if args.output_dir: + utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth") + return + + print("Start training") + start_time = time.time() + for epoch in range(args.start_epoch, args.epochs): + if args.distributed: + sampler_train.set_epoch(epoch) + train_stats = train_one_epoch( + qmodel, criterion, data_loader_train, optimizer, device, epoch, + args.clip_max_norm) + lr_scheduler.step() + if args.output_dir: + checkpoint_paths = [output_dir / 'checkpoint.pth'] + # extra checkpoint before LR drop and every 100 epochs + if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0: + checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth') + for checkpoint_path in checkpoint_paths: + utils.save_on_master({ + 'model': qmodel_without_ddp.state_dict(), + 'optimizer': optimizer.state_dict(), + 'lr_scheduler': lr_scheduler.state_dict(), + 'epoch': epoch, + 'args': args, + }, checkpoint_path) + + test_stats, coco_evaluator = evaluate( + qmodel, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir + ) + + log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, + **{f'test_{k}': v for k, v in test_stats.items()}, + 'epoch': epoch, + 'n_parameters': n_parameters} + + if args.output_dir and utils.is_main_process(): + with (output_dir / "log.txt").open("a") as f: + f.write(json.dumps(log_stats) + "\n") + + # for evaluation logs + if coco_evaluator is not None: + (output_dir / 'eval').mkdir(exist_ok=True) + if "bbox" in coco_evaluator.coco_eval: + filenames = ['latest.pth'] + if epoch % 50 == 0: + filenames.append(f'{epoch:03}.pth') + for name in filenames: + torch.save(coco_evaluator.coco_eval["bbox"].eval, + output_dir / "eval" / name) + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time {}'.format(total_time_str)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('DETR training and evaluation script', parents=[get_args_parser()]) + args = parser.parse_args() + if args.output_dir: + Path(args.output_dir).mkdir(parents=True, exist_ok=True) + main(args) diff --git a/examples/quantization_aware_training/coco2017/DETR/model.py b/examples/quantization_aware_training/coco2017/DETR/model.py new file mode 100644 index 0000000..5951de3 --- /dev/null +++ b/examples/quantization_aware_training/coco2017/DETR/model.py @@ -0,0 +1,373 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from typing import Dict, List + +from detr.util import box_ops +from detr.util.misc import NestedTensor, nested_tensor_from_tensor_list, accuracy, get_world_size, interpolate, is_dist_avail_and_initialized 
+from detr.models.matcher import build_matcher + + +class PositionEmbeddingSine(nn.Module): + def __init__(self, model): + super().__init__() + self.model = model + dim_t =torch.arange(model.num_pos_feats, dtype=torch.float32) + self.dim_t = nn.Parameter(model.temperature ** (2 * (dim_t // 2) / model.num_pos_feats)) + + def forward(self, x): + not_mask = (x*0).sum(1,keepdims=False)+1 + y_embed = not_mask.cumsum(1) + x_embed = not_mask.cumsum(2) + if self.model.normalize: + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.model.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.model.scale + + pos_x = x_embed[:, :, :, None] / self.dim_t + pos_y = y_embed[:, :, :, None] / self.dim_t + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + +class Backbone(nn.Module): + def __init__(self, model): + super().__init__() + self.backbone = model[0] + self.pos_embed = PositionEmbeddingSine(model[1]) + new_bns = {} + for n, m in self.backbone.body.named_modules(): + if "bn" in n or "downsample.1" in n: + new_bn = nn.BatchNorm2d(m.weight.shape[0]) + new_bn.weight.data = m.weight + new_bn.bias.data = m.bias + new_bn.running_mean.data = m.running_mean + new_bn.running_var.data = m.running_var + new_bns[n] = new_bn + for n, m in new_bns.items(): + splited_name = n.split(".") + if len(splited_name) == 1: + setattr(self.backbone.body, n, m) + elif len(splited_name) == 3:#layera.b.bn + layer = splited_name[0] + idx = int(splited_name[1]) + name = splited_name[2] + setattr(getattr(self.backbone.body, layer)[idx], name, m) + elif len(splited_name) == 5: #downsample + layer = splited_name[0] + idx = int(splited_name[1]) + name = splited_name[2] + setattr(getattr(self.backbone.body, layer)[idx], name, nn.Sequential(getattr(getattr(self.backbone.body, layer)[idx], name)[0], m)) + + def forward(self, tensor_list): + xs = self.backbone.body(tensor_list.tensors) + out = [] + pos = [] + masks = [] + for name, x in xs.items(): + m = tensor_list.mask + mask = F.interpolate(m.unsqueeze(0).float(), size=x.shape[-2:]).bool()[0] + masks.append(mask) + out.append(x) + pos.append(self.pos_embed(x).float()) + return out, masks, pos + +class DETR_Impl(nn.Module): + """ This is the DETR module that performs object detection """ + def __init__(self, model): + """ Initializes the model. + Parameters: + backbone: torch module of the backbone to be used. See backbone.py + transformer: torch module of the transformer architecture. See transformer.py + num_classes: number of object classes + num_queries: number of object queries, ie detection slot. This is the maximal number of objects + DETR can detect in a single image. For COCO, we recommend 100 queries. + aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used. 
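+
+        Note: this wrapper re-uses the modules of an already-built DETR checkpoint
+        (input_proj, query_embed, class_embed, bbox_embed, transformer) and swaps the
+        original backbone for the trace-friendly Backbone wrapper defined above, so
+        that QuantModel can trace the whole forward pass.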
+ """ + super().__init__() + self.input_proj = model.input_proj + self.query_embed = model.query_embed + self.class_embed = model.class_embed + self.bbox_embed = model.bbox_embed + self.aux_loss = model.aux_loss + self.backbone = Backbone(model.backbone) + self.transformer = model.transformer + + def forward(self, samples): + # if isinstance(samples, (NestedTensor)): + # samples = [samples.data, samples.mask] + features, masks, poss = self.backbone(samples) + + src = self.input_proj(features[-1]) + mask = masks[-1] + pos = poss[-1] + + hs = self.transformer(src, mask, self.query_embed.weight, pos)[0] + + outputs_class = self.class_embed(hs) + outputs_coord = self.bbox_embed(hs).sigmoid() + out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]} + if self.aux_loss: + out['aux_outputs'] = [ + {'pred_logits': outputs_class[0], 'pred_boxes': outputs_coord[0]}, + {'pred_logits': outputs_class[1], 'pred_boxes': outputs_coord[1]}, + {'pred_logits': outputs_class[2], 'pred_boxes': outputs_coord[2]}, + {'pred_logits': outputs_class[3], 'pred_boxes': outputs_coord[3]}, + {'pred_logits': outputs_class[4], 'pred_boxes': outputs_coord[4]} + ] + return out + + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_coord): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. + return [{'pred_logits': a, 'pred_boxes': b} + for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] + + +class SetCriterion(nn.Module): + """ This class computes the loss for DETR. + The process happens in two steps: + 1) we compute hungarian assignment between ground truth boxes and the outputs of the model + 2) we supervise each pair of matched ground-truth / prediction (supervise class and box) + """ + def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses): + """ Create the criterion. + Parameters: + num_classes: number of object categories, omitting the special no-object category + matcher: module able to compute a matching between targets and proposals + weight_dict: dict containing as key the names of the losses and as values their relative weight. + eos_coef: relative classification weight applied to the no-object category + losses: list of all the losses to be applied. See get_loss for list of available losses. 
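+
+        With the defaults used in build() below this is losses = ['labels', 'boxes',
+        'cardinality'] and weight_dict = {'loss_ce': 1, 'loss_bbox': 5, 'loss_giou': 2},
+        extended with per-decoder-layer copies (suffix _0 .. _4) when aux_loss is enabled.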
+ """ + super().__init__() + self.num_classes = num_classes + self.matcher = matcher + self.weight_dict = weight_dict + self.eos_coef = eos_coef + self.losses = losses + empty_weight = torch.ones(self.num_classes + 1) + empty_weight[-1] = self.eos_coef + self.register_buffer('empty_weight', empty_weight) + + def loss_labels(self, outputs, targets, indices, num_boxes, log=True): + """Classification loss (NLL) + targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes] + """ + assert 'pred_logits' in outputs + src_logits = outputs['pred_logits'] + + idx = self._get_src_permutation_idx(indices) + target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full(src_logits.shape[:2], self.num_classes, + dtype=torch.int64, device=src_logits.device) + target_classes[idx] = target_classes_o + + loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight) + losses = {'loss_ce': loss_ce} + + if log: + # TODO this should probably be a separate loss, not hacked in this one here + losses['class_error'] = 100 - accuracy(src_logits[idx], target_classes_o)[0] + return losses + + @torch.no_grad() + def loss_cardinality(self, outputs, targets, indices, num_boxes): + """ Compute the cardinality error, ie the absolute error in the number of predicted non-empty boxes + This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients + """ + pred_logits = outputs['pred_logits'] + device = pred_logits.device + tgt_lengths = torch.as_tensor([len(v["labels"]) for v in targets], device=device) + # Count the number of predictions that are NOT "no-object" (which is the last class) + card_pred = (pred_logits.argmax(-1) != pred_logits.shape[-1] - 1).sum(1) + card_err = F.l1_loss(card_pred.float(), tgt_lengths.float()) + losses = {'cardinality_error': card_err} + return losses + + def loss_boxes(self, outputs, targets, indices, num_boxes): + """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss + targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4] + The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size. + """ + assert 'pred_boxes' in outputs + idx = self._get_src_permutation_idx(indices) + src_boxes = outputs['pred_boxes'][idx] + target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0) + + loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none') + + losses = {} + losses['loss_bbox'] = loss_bbox.sum() / num_boxes + + loss_giou = 1 - torch.diag(box_ops.generalized_box_iou( + box_ops.box_cxcywh_to_xyxy(src_boxes), + box_ops.box_cxcywh_to_xyxy(target_boxes))) + losses['loss_giou'] = loss_giou.sum() / num_boxes + return losses + + def loss_masks(self, outputs, targets, indices, num_boxes): + """Compute the losses related to the masks: the focal loss and the dice loss. 
+ targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w] + """ + assert "pred_masks" in outputs + + src_idx = self._get_src_permutation_idx(indices) + tgt_idx = self._get_tgt_permutation_idx(indices) + src_masks = outputs["pred_masks"] + src_masks = src_masks[src_idx] + masks = [t["masks"] for t in targets] + # TODO use valid to mask invalid areas due to padding in loss + target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() + target_masks = target_masks.to(src_masks) + target_masks = target_masks[tgt_idx] + + # upsample predictions to the target size + src_masks = interpolate(src_masks[:, None], size=target_masks.shape[-2:], + mode="bilinear", align_corners=False) + src_masks = src_masks[:, 0].flatten(1) + + target_masks = target_masks.flatten(1) + target_masks = target_masks.view(src_masks.shape) + losses = { + "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_boxes), + "loss_dice": dice_loss(src_masks, target_masks, num_boxes), + } + return losses + + def _get_src_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) + src_idx = torch.cat([src for (src, _) in indices]) + return batch_idx, src_idx + + def _get_tgt_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) + tgt_idx = torch.cat([tgt for (_, tgt) in indices]) + return batch_idx, tgt_idx + + def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs): + loss_map = { + 'labels': self.loss_labels, + 'cardinality': self.loss_cardinality, + 'boxes': self.loss_boxes, + 'masks': self.loss_masks + } + assert loss in loss_map, f'do you really want to compute {loss} loss?' + return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs) + + def forward(self, outputs, targets): + """ This performs the loss computation. + Parameters: + outputs: dict of tensors, see the output specification of the model for the format + targets: list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc + """ + outputs_without_aux = {k: v for k, v in outputs.items() if k != 'aux_outputs'} + + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets) + + # Compute the average number of target boxes accross all nodes, for normalization purposes + num_boxes = sum(len(t["labels"]) for t in targets) + num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) + if is_dist_avail_and_initialized(): + torch.distributed.all_reduce(num_boxes) + num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item() + + # Compute all the requested losses + losses = {} + for loss in self.losses: + losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes)) + + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. + if 'aux_outputs' in outputs: + for i, aux_outputs in enumerate(outputs['aux_outputs']): + indices = self.matcher(aux_outputs, targets) + for loss in self.losses: + if loss == 'masks': + # Intermediate masks losses are too costly to compute, we ignore them. 
+ continue + kwargs = {} + if loss == 'labels': + # Logging is enabled only for the last layer + kwargs = {'log': False} + l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **kwargs) + l_dict = {k + f'_{i}': v for k, v in l_dict.items()} + losses.update(l_dict) + + return losses + + +class PostProcess(nn.Module): + """ This module converts the model's output into the format expected by the coco api""" + @torch.no_grad() + def forward(self, outputs, target_sizes): + """ Perform the computation + Parameters: + outputs: raw outputs of the model + target_sizes: tensor of dimension [batch_size x 2] containing the size of each images of the batch + For evaluation, this must be the original image size (before any data augmentation) + For visualization, this should be the image size after data augment, but before padding + """ + out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes'] + + assert len(out_logits) == len(target_sizes) + assert target_sizes.shape[1] == 2 + + prob = F.softmax(out_logits, -1) + scores, labels = prob[..., :-1].max(-1) + + # convert to [x0, y0, x1, y1] format + boxes = box_ops.box_cxcywh_to_xyxy(out_bbox) + # and from relative [0, 1] to absolute [0, height] coordinates + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) + boxes = boxes * scale_fct[:, None, :] + + results = [{'scores': s, 'labels': l, 'boxes': b} for s, l, b in zip(scores, labels, boxes)] + + return results + + +def build(args, model): + # the `num_classes` naming here is somewhat misleading. + # it indeed corresponds to `max_obj_id + 1`, where max_obj_id + # is the maximum id for a class in your dataset. For example, + # COCO has a max_obj_id of 90, so we pass `num_classes` to be 91. + # As another example, for a dataset that has a single class with id 1, + # you should pass `num_classes` to be 2 (max_obj_id + 1). 
+ # For more details on this, check the following discussion + # https://github.com/facebookresearch/detr/issues/108#issuecomment-650269223 + num_classes = 20 if args.dataset_file != 'coco' else 91 + if args.dataset_file == "coco_panoptic": + # for panoptic, we just add a num_classes that is large enough to hold + # max_obj_id + 1, but the exact value doesn't really matter + num_classes = 250 + device = torch.device(args.device) + + model = DETR_Impl(model) + + matcher = build_matcher(args) + weight_dict = {'loss_ce': 1, 'loss_bbox': args.bbox_loss_coef} + weight_dict['loss_giou'] = args.giou_loss_coef + if args.masks: + weight_dict["loss_mask"] = args.mask_loss_coef + weight_dict["loss_dice"] = args.dice_loss_coef + # TODO this is a hack + if args.aux_loss: + aux_weight_dict = {} + for i in range(args.dec_layers - 1): + aux_weight_dict.update({k + f'_{i}': v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + + losses = ['labels', 'boxes', 'cardinality'] + criterion = SetCriterion(num_classes, matcher=matcher, weight_dict=weight_dict, + eos_coef=args.eos_coef, losses=losses) + criterion.to(device) + postprocessors = {'bbox': PostProcess()} + + return model, criterion, postprocessors diff --git a/examples/quantization_aware_training/coco2017/DETR/qconfig_lsq_8w8f.yaml b/examples/quantization_aware_training/coco2017/DETR/qconfig_lsq_8w8f.yaml new file mode 100644 index 0000000..e1fdf00 --- /dev/null +++ b/examples/quantization_aware_training/coco2017/DETR/qconfig_lsq_8w8f.yaml @@ -0,0 +1,11 @@ +BACKEND: virtual +W: + QSCHEME: per-channel-symmetric + QUANTIZER: + TYPE: lsq + BIT: 8 +A: + QSCHEME: per-tensor-affine + QUANTIZER: + TYPE: lsq + BIT: 8 diff --git a/sparsebit/quantization/complicated_modules/__init__.py b/sparsebit/quantization/complicated_modules/__init__.py new file mode 100644 index 0000000..3134623 --- /dev/null +++ b/sparsebit/quantization/complicated_modules/__init__.py @@ -0,0 +1,15 @@ +import torch.nn as nn + +COMPLICATED_MODULE_MAP = {} + +def register_complicated_module(sources: [nn.Module, str, ...]): + def real_register(complicated_module): + for src in sources: + COMPLICATED_MODULE_MAP[src] = complicated_module + return complicated_module + + return real_register + + +# 将需要注册的module文件填写至此 +from .multihead_attention import * \ No newline at end of file diff --git a/sparsebit/quantization/complicated_modules/multihead_attention.py b/sparsebit/quantization/complicated_modules/multihead_attention.py new file mode 100644 index 0000000..83cfb2e --- /dev/null +++ b/sparsebit/quantization/complicated_modules/multihead_attention.py @@ -0,0 +1,81 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import math +from sparsebit.quantization.complicated_modules import register_complicated_module + +@register_complicated_module(sources=[nn.MultiheadAttention]) +class MultiheadAttention(nn.Module): + """MultiheadAttention层。 + 量化输入在build_quantizer中处理, 通过在输入上增加QIdentity层来解决。 + 是MultipleModulesQuantOpr的子类。 + """ + + def __init__(self, org_module=None): + super().__init__() + self.embed_dim=org_module.embed_dim + self.num_heads=org_module.num_heads + self.batch_first=org_module.batch_first + self._qkv_same_embed_dim=org_module._qkv_same_embed_dim + self.in_proj_weight=org_module.in_proj_weight + self.bias_k=org_module.bias_k + self.bias_v=org_module.bias_v + self.in_proj_bias=org_module.in_proj_bias + self.add_zero_attn=org_module.add_zero_attn + self.out_proj=org_module.out_proj + self.q_proj_weight=org_module.q_proj_weight + 
self.k_proj_weight=org_module.k_proj_weight + self.v_proj_weight=org_module.v_proj_weight + self.dropout = org_module.dropout + + self.q_in_proj = nn.Linear(org_module.embed_dim, org_module.embed_dim) + self.q_in_proj.weight.data = org_module.in_proj_weight[:org_module.embed_dim] + self.q_in_proj.bias.data = org_module.in_proj_bias[:org_module.embed_dim] + self.k_in_proj = nn.Linear(org_module.embed_dim, org_module.embed_dim) + self.k_in_proj.weight.data = org_module.in_proj_weight[org_module.embed_dim:2*org_module.embed_dim] + self.k_in_proj.bias.data = org_module.in_proj_bias[org_module.embed_dim:2*org_module.embed_dim] + self.v_in_proj = nn.Linear(org_module.embed_dim, org_module.embed_dim) + self.v_in_proj.weight.data = org_module.in_proj_weight[2*org_module.embed_dim:] + self.v_in_proj.bias.data = org_module.in_proj_bias[2*org_module.embed_dim:] + self.out_proj = org_module.out_proj + + def forward(self, query, key, value, key_padding_mask = None, + need_weights = True, attn_mask = None): + if self.batch_first: + query, key, value = [x.transpose(1, 0) for x in (query, key, value)] + + _, b, _ = query.shape + + query_proj = self.q_in_proj(query).reshape(-1, b*self.num_heads, self.embed_dim//self.num_heads).permute(1,0,2) + key_proj = self.k_in_proj(key).reshape(-1, b*self.num_heads, self.embed_dim//self.num_heads).permute(1,2,0)/math.sqrt(self.embed_dim//self.num_heads) + value_proj = self.v_in_proj(value).reshape(-1, b*self.num_heads, self.embed_dim//self.num_heads).permute(1,0,2) + qk = torch.matmul(query_proj, key_proj) + if key_padding_mask is not None: + qk = qk.masked_fill(key_padding_mask.repeat(self.num_heads, 1).unsqueeze(1), float('-inf')) + qk = torch.softmax(qk, dim=-1) + qkv = torch.matmul(qk, value_proj).permute(1,0,2).reshape(-1,b,self.embed_dim) + output = self.out_proj(qkv) + + + # if not self._qkv_same_embed_dim: + # attn_output, attn_output_weights = F.multi_head_attention_forward( + # query, key, value, self.embed_dim, self.num_heads, + # self.in_proj_weight, self.in_proj_bias, + # self.bias_k, self.bias_v, self.add_zero_attn, + # self.dropout, self.out_proj.weight, self.out_proj.bias, + # training=self.training, + # key_padding_mask=key_padding_mask, need_weights=need_weights, + # attn_mask=attn_mask, use_separate_proj_weight=True, + # q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight, + # v_proj_weight=self.v_proj_weight) + # else: + # attn_output, attn_output_weights = F.multi_head_attention_forward( + # query, key, value, self.embed_dim, self.num_heads, + # self.in_proj_weight, self.in_proj_bias, + # self.bias_k, self.bias_v, self.add_zero_attn, + # self.dropout, self.out_proj.weight, self.out_proj.bias, + # training=self.training, + # key_padding_mask=key_padding_mask, need_weights=need_weights, + # attn_mask=attn_mask) + + return output, None \ No newline at end of file diff --git a/sparsebit/quantization/modules/__init__.py b/sparsebit/quantization/modules/__init__.py index e1f81fa..5d34c48 100644 --- a/sparsebit/quantization/modules/__init__.py +++ b/sparsebit/quantization/modules/__init__.py @@ -44,6 +44,8 @@ def real_register(qmodule): QGetAttr, QGetItem, QEqual, + Float, + Bool, Size, Transpose, Reshape, diff --git a/sparsebit/quantization/modules/activations.py b/sparsebit/quantization/modules/activations.py index a4a3b1d..1e61844 100644 --- a/sparsebit/quantization/modules/activations.py +++ b/sparsebit/quantization/modules/activations.py @@ -1,7 +1,6 @@ import torch import torch.nn as nn import torch.nn.functional as F -from functools 
import partial from sparsebit.quantization.modules import QuantOpr, register_qmodule @@ -20,15 +19,11 @@ class QReLU(QuantOpr): def __init__(self, org_module, config=None): super().__init__() self._repr_info = "Q" + org_module.__repr__() - if isinstance(org_module, nn.Module): - self.inplace = org_module.inplace - else: - self.inplace = org_module.args[1] - def forward(self, x_in): + def forward(self, x_in, *args, **kwargs): """ReLU层的前向传播,但加入了input量化。""" x_in = self.input_quantizer(x_in) - out = F.relu(x_in, inplace=self.inplace) + out = F.relu(x_in, *args, **kwargs) return out @@ -91,7 +86,7 @@ def forward(self, x_in): return out -@register_qmodule(sources=[nn.Sigmoid, torch.sigmoid, F.sigmoid]) +@register_qmodule(sources=[nn.Sigmoid, torch.sigmoid, torch.Tensor.sigmoid, F.sigmoid]) class QSigmoid(QuantOpr): """量化Sigmoid层,拥有 ``input_quantizer`` 。 diff --git a/sparsebit/quantization/modules/linear.py b/sparsebit/quantization/modules/linear.py index 3fe0d6a..d015b2e 100644 --- a/sparsebit/quantization/modules/linear.py +++ b/sparsebit/quantization/modules/linear.py @@ -4,7 +4,7 @@ from sparsebit.quantization.modules import QuantOpr, register_qmodule -@register_qmodule(sources=[nn.Linear]) +@register_qmodule(sources=[nn.Linear, nn.modules.linear.NonDynamicallyQuantizableLinear]) class QLinear(QuantOpr): """量化全连接层,拥有 ``input_quantizer`` 和 ``weight_quantizer`` 。 diff --git a/sparsebit/quantization/modules/math.py b/sparsebit/quantization/modules/math.py index 4e3fb99..71b101d 100644 --- a/sparsebit/quantization/modules/math.py +++ b/sparsebit/quantization/modules/math.py @@ -74,3 +74,41 @@ def forward(self, x_in, *args, **kwargs): x_in = self.input_quantizer(x_in) out = torch.mean(x_in, dim=self.dim, keepdim=self.keepdim) return out + +@register_qmodule(sources=[torch.sum,torch.Tensor.sum]) +class QSum(QuantOpr): + def __init__(self, org_module=None, config=None): + super(QSum, self).__init__() + self._repr_info = "QSum " + + def forward(self, x_in, *args, **kwargs): + x_in = self.input_quantizer(x_in) + return x_in.sum(*args, **kwargs) + +@register_qmodule(sources=[torch.cumsum,torch.Tensor.cumsum]) +class QCumsum(QuantOpr): + def __init__(self, org_module=None, config=None): + super(QCumsum, self).__init__() + self._repr_info = "QCumsum " + + def forward(self, x_in, *args, **kwargs): + x_in = self.input_quantizer(x_in) + return x_in.cumsum(*args, **kwargs) + +@register_qmodule(sources=[torch.sin,torch.Tensor.sin]) +class Sin(nn.Module): + def __init__(self, org_module=None, config=None): + super(Sin, self).__init__() + self._repr_info = "Sin " + + def forward(self, x_in): + return x_in.sin() + +@register_qmodule(sources=[torch.cos,torch.Tensor.cos]) +class Cos(nn.Module): + def __init__(self, org_module=None, config=None): + super(Cos, self).__init__() + self._repr_info = "Cos " + + def forward(self, x_in): + return x_in.cos() \ No newline at end of file diff --git a/sparsebit/quantization/modules/matmul.py b/sparsebit/quantization/modules/matmul.py index b945a9f..718f1d6 100644 --- a/sparsebit/quantization/modules/matmul.py +++ b/sparsebit/quantization/modules/matmul.py @@ -13,7 +13,7 @@ class MatMul(MultipleInputsQuantOpr): def __init__(self, org_module=None, config=None): super().__init__() - self._repr_info = "QMatmul " + self._repr_info = "QMatmul" def forward(self, x_left: torch.Tensor, x_right: torch.Tensor): out = torch.matmul(x_left, x_right) diff --git a/sparsebit/quantization/modules/python_builtins.py b/sparsebit/quantization/modules/python_builtins.py index b130366..8d5fcff 
100644 --- a/sparsebit/quantization/modules/python_builtins.py +++ b/sparsebit/quantization/modules/python_builtins.py @@ -11,15 +11,10 @@ def __init__(self, org_module=None, config=None): super(QGetAttr, self).__init__() assert isinstance(org_module, torch.fx.Node) self.target_attr = org_module.args[1] - if self.target_attr != "shape": # dynamic shape needs forward - self.output = getattr(org_module.args[0], org_module.args[1]) self._repr_info = "QGetAttr " def forward(self, x_in, *args): - if self.target_attr == "shape": - return x_in.shape() - else: - return self.output + return getattr(x_in, self.target_attr) @register_qmodule(sources=[operator.getitem]) @@ -41,3 +36,13 @@ def __init__(self, org_module=None, config=None): def forward(self, x_left, x_right): return x_left == x_right + + +@register_qmodule(sources=[operator.invert]) +class Invert(nn.Module): + def __init__(self, org_module=None, config=None): + super(Invert, self).__init__() + self._repr_info = "Invert " + + def forward(self, x_in): + return ~x_in diff --git a/sparsebit/quantization/modules/shape.py b/sparsebit/quantization/modules/shape.py index 3579316..3e169ed 100644 --- a/sparsebit/quantization/modules/shape.py +++ b/sparsebit/quantization/modules/shape.py @@ -127,3 +127,28 @@ def __init__(self, org_module=None, config=None): def forward(self, x_in, *args): out = torch.permute(x_in, dims=self.dims) return out + +@register_qmodule(sources=[torch.stack]) +class Stack(nn.Module): + def __init__(self, org_module=None, config=None): + super(Stack, self).__init__() + + def forward(self, x_in, *args, **kwargs): + return torch.stack(x_in, *args, **kwargs) + +@register_qmodule(sources=[torch.unsqueeze, torch.Tensor.unsqueeze]) +class Unsqueeze(nn.Module): + def __init__(self, org_module=None, config=None): + super(Unsqueeze, self).__init__() + self.dim = org_module.args[1] + + def forward(self, x_in, *args): + return x_in.unsqueeze(self.dim) + +@register_qmodule(sources=[torch.Tensor.repeat]) +class Repeat(nn.Module): + def __init__(self, org_module=None, config=None): + super(Repeat, self).__init__() + + def forward(self, x_in, *args): + return x_in.repeat(*args) \ No newline at end of file diff --git a/sparsebit/quantization/modules/unary.py b/sparsebit/quantization/modules/unary.py index 8217c30..b3e236a 100644 --- a/sparsebit/quantization/modules/unary.py +++ b/sparsebit/quantization/modules/unary.py @@ -26,23 +26,42 @@ def forward(self, x_in): x_in = self.input_quantizer(x_in) return x_in +@register_qmodule(sources=[torch.Tensor.float]) +class Float(nn.Module): + def __init__(self, org_module=None, config=None): + super(Float, self).__init__() + self._repr_info = "Float" + + def forward(self, x_in): + return x_in.float() + +@register_qmodule(sources=[torch.Tensor.bool]) +class Bool(nn.Module): + def __init__(self, org_module=None, config=None): + super(Bool, self).__init__() + self._repr_info = "Bool" + + def forward(self, x_in): + return x_in.bool() -@register_qmodule(sources=[nn.Softmax, torch.Tensor.softmax, F.softmax]) +@register_qmodule(sources=[nn.Softmax, torch.Tensor.softmax, torch.softmax, F.softmax]) class QSoftmax(QuantOpr): def __init__(self, org_module=None, config=None): super().__init__() - assert isinstance(org_module, torch.fx.Node) - if "dim" in org_module.kwargs: - self.dim = org_module.kwargs["dim"] - else: - self.dim = org_module.args[1] + if isinstance(org_module, torch.fx.Node): + if "dim" in org_module.kwargs: + self.dim = org_module.kwargs["dim"] + else: + self.dim = org_module.args[1] + elif 
isinstance(org_module, nn.Softmax): + self.dim = org_module.dim self._repr_info = "QSoftmax " def forward(self, x_in, *args, **kwargs): if "dim" in kwargs: assert self.dim == kwargs["dim"], "parameter mismatch in softmax" - else: + elif len(args)>0: assert self.dim == args[0], "parameter mismatch in softmax" x_in = self.input_quantizer(x_in) out = F.softmax(x_in, dim=self.dim) @@ -58,3 +77,22 @@ def __init__(self, org_module=None, config=None): def forward(self, x): return x.clone() + +@register_qmodule(sources=[torch.zeros_like]) +class Zeros_like(nn.Module): + def __init__(self, org_module=None, config=None): + super(Zeros_like, self).__init__() + self._repr_info = "Zeros_like " + + def forward(self, x_in, *args): + return torch.zeros_like(x_in, *args) + +@register_qmodule(sources=[torch.Tensor.masked_fill]) +class Masked_fill(nn.Module): + def __init__(self, org_module=None, config=None): + super().__init__() + self._repr_info = "Masked_fill " + + def forward(self, x_in, *args, **kwargs): + out = x_in.masked_fill(args[0], args[1]) + return out \ No newline at end of file diff --git a/sparsebit/quantization/quant_model.py b/sparsebit/quantization/quant_model.py index a60865a..ef19a65 100644 --- a/sparsebit/quantization/quant_model.py +++ b/sparsebit/quantization/quant_model.py @@ -15,6 +15,7 @@ import onnx from sparsebit.utils import update_config +from sparsebit.quantization.complicated_modules import * from sparsebit.quantization.modules import * from sparsebit.quantization.observers import Observer from sparsebit.quantization.quantizers import Quantizer @@ -31,6 +32,7 @@ def __init__(self, model: nn.Module, config): super().__init__() self.cfg = config self.device = torch.device(config.DEVICE) + model = self._replace_complicated_operators(model) self.model = self._trace(model) self._run_simplifiers() self._convert2quantmodule() @@ -128,10 +130,30 @@ def _sub_build(src, module_name): ): module.prepare_input_quantizer(node, self.model) for input_node in node.all_input_nodes: - identity_module = getattr(self.model, input_node.target) - _config = self.cfg.clone() # init - update_config(_config, "A", _sub_build(self.cfg.A, node.target)) - identity_module.build_quantizer(_config) + input_module = getattr(self.model, input_node.target) + if isinstance(input_module, QIdentity): + _config = self.cfg.clone() # init + update_config(_config, "A", _sub_build(self.cfg.A, node.target)) + input_module.build_quantizer(_config) + + def _replace_complicated_operators(self, model): + finished = False + while(not finished): + finished = self._recurrency_replace_complicated_operators(model) + return model + + def _recurrency_replace_complicated_operators(self, module): + finished = True + for n, m in module.named_children(): + if m.__class__ in COMPLICATED_MODULE_MAP: + setattr(module, n, COMPLICATED_MODULE_MAP[m.__class__](m)) + finished = False + break + else: + finished = self._recurrency_replace_complicated_operators(m) + if not finished: + break + return finished def _trace(self, model): skipped_modules = self.cfg.SKIP_TRACE_MODULES diff --git a/sparsebit/quantization/quantizers/lsq.py b/sparsebit/quantization/quantizers/lsq.py index c94b02b..f8a245a 100644 --- a/sparsebit/quantization/quantizers/lsq.py +++ b/sparsebit/quantization/quantizers/lsq.py @@ -33,17 +33,25 @@ def calc_qparams(self): if self.fake_fused: return self.scale, self.zero_point if not self.init_params: - x_oc = self.observer.data_cache.get_data_for_calibration( - Granularity.CHANNELWISE - ) - if x_oc.min() < 0 and not 
self.qdesc.is_symmetric: - warnings.warn( - "Found data less than 0, reset quantizer scheme as symmetric" - ) - self.qdesc.set_symmetric(True) if self.is_perchannel: + x_oc = self.observer.data_cache.get_data_for_calibration( + Granularity.CHANNELWISE + ) + if x_oc.min() < 0 and not self.qdesc.is_symmetric: + warnings.warn( + "Found data less than 0, reset quantizer scheme as symmetric" + ) + self.qdesc.set_symmetric(True) scale = 2 * x_oc.abs().mean(axis=1) / math.sqrt(self.qdesc.qmax) else: + x_oc = self.observer.data_cache.get_data_for_calibration( + Granularity.LAYERWISE + ) + if x_oc.min() < 0 and not self.qdesc.is_symmetric: + warnings.warn( + "Found data less than 0, reset quantizer scheme as symmetric" + ) + self.qdesc.set_symmetric(True) scale = 2 * x_oc.abs().mean() / math.sqrt(self.qdesc.qmax) self.scale = nn.Parameter(self._broadcast_qparams(scale.to(self.device))) self.zero_point = self._broadcast_qparams(torch.zeros_like(self.scale)) diff --git a/sparsebit/quantization/tools/calibration.py b/sparsebit/quantization/tools/calibration.py index 6d5ea48..51f621d 100644 --- a/sparsebit/quantization/tools/calibration.py +++ b/sparsebit/quantization/tools/calibration.py @@ -73,7 +73,9 @@ def feature_layerwise_calibration(self, device): assert batch_num is not None - module = getattr(self.model, node.target) + module = self.model + for n in node.target.split("."): + module = getattr(module, n) if isinstance(module, QuantOpr) and getattr( module, "input_quantizer", None ): diff --git a/sparsebit/quantization/tools/graph_wrapper.py b/sparsebit/quantization/tools/graph_wrapper.py index 7ebf14a..c23c902 100644 --- a/sparsebit/quantization/tools/graph_wrapper.py +++ b/sparsebit/quantization/tools/graph_wrapper.py @@ -130,7 +130,9 @@ def build(self, model: fx.GraphModule, hook_wrapper: Callable): if node.op in ["placeholder", "output"]: # skip IO empty node continue if node.op == "get_attr": # use model.xxx to get constant nn.Parameter - module = getattr(model, node.target) + module = model + for n in node.target.split("."): + module = getattr(module, n) else: module = named_modules[node.target]
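
Note (not part of the diff): the calibration.py and graph_wrapper.py changes above replace a single `getattr(model, node.target)` with a loop because fx node targets can be dotted paths into nested submodules. The sketch below, with hypothetical names, shows the equivalent resolution logic for illustration only.

```python
from functools import reduce

import torch.nn as nn


def resolve_target(root: nn.Module, target: str) -> nn.Module:
    # Walk the dotted path one attribute at a time, like the loops added above;
    # getattr also reaches numeric children of nn.Sequential / nn.ModuleList.
    return reduce(getattr, target.split("."), root)


if __name__ == "__main__":
    model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.Sequential(nn.ReLU(), nn.Conv2d(8, 8, 3)))
    print(resolve_target(model, "1.1"))  # -> the inner Conv2d(8, 8, ...)
```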