From 9af66eaf4d937722bfefd8f995e421e25779434f Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sat, 9 Jul 2022 22:12:48 +0800 Subject: [PATCH 01/59] Add files via upload --- .../__pycache__/nano.cpython-38.pyc | Bin 0 -> 1752 bytes .../yolox_pedestrian/coco_format/nano.py | 48 +++++ .../yolox_pedestrian/coco_format/yolox_s.py | 25 +++ .../__pycache__/yolox_voc_nano.cpython-38.pyc | Bin 0 -> 3782 bytes .../voc_format/yolox_voc_nano.py | 145 +++++++++++++++ .../voc_format/yolox_voc_nano_adam.py | 176 ++++++++++++++++++ .../voc_format/yolox_voc_s.py | 123 ++++++++++++ 7 files changed, 517 insertions(+) create mode 100644 exps/example/yolox_pedestrian/coco_format/__pycache__/nano.cpython-38.pyc create mode 100644 exps/example/yolox_pedestrian/coco_format/nano.py create mode 100644 exps/example/yolox_pedestrian/coco_format/yolox_s.py create mode 100644 exps/example/yolox_pedestrian/voc_format/__pycache__/yolox_voc_nano.cpython-38.pyc create mode 100644 exps/example/yolox_pedestrian/voc_format/yolox_voc_nano.py create mode 100644 exps/example/yolox_pedestrian/voc_format/yolox_voc_nano_adam.py create mode 100644 exps/example/yolox_pedestrian/voc_format/yolox_voc_s.py diff --git a/exps/example/yolox_pedestrian/coco_format/__pycache__/nano.cpython-38.pyc b/exps/example/yolox_pedestrian/coco_format/__pycache__/nano.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c5a2714b6d8e3b14aa9c82ade2dce7a62be5bc18 GIT binary patch literal 1752 zcmZWq&2Jk;6rY*>@UFdXX^ASrM+*|7EC?GA$Ex9@q#TGFRFKL_Xf@uM*z4|BXVx?^ z_9;P?koXr+a?FKu{|to0WlkVY5hrd4ap1jmQk*i@{PxYv?wj}C{C?vvy4^N`Mz4Q2 zeA^}DcO0xX8wT$~*Y5!cB4|dU<&)+tViawbS-BlKl-wr567~TRwsiKb$QJZ237r%8 zk_tdDoA1P80lIvwcOz>p-Of<){Q*`b08cHl*paD^?L0~*M^^=$nwT*qTLSwp~74DBg=ZqA+)yyjZgx?f2lb*)lf#7UWyvn#dYX%X|H zC~ID)Wubc$T^6$|U-K*#|C{R5X&viy zCZixPHBXaRCp?p;qj(|8yje1S4Y4duR%&Bc99`%tOKamP$+KnxHrYut37dFww~7m% zOB2X~4`D7(cc+!{Fuho$3Ot^rV_J{_#&0^sG>?;vYb`Yp$ksC3Ii|*smy>aPO#a^L zkIP*4C(|tD{aa-+&1C`A`tr`FckcCJR}H}PDwF-aGAnoE|H|0MGR8Zl%6Z){cv1GM zJ>^5yyU_LX0EAwk?#k~~>RQd1%>p2E@2rNK2@>+Gp$pIw`3Sl`0BER877TedA@^HK zAf+=<(89{Cg`L|AP!ABjCA^WfaOc3y2{3cw&RyY)_V@OpHOF!5+?+SJ=B>H6kF54! z#XI!p?1)le>cm4Xi_X~yco_`N2?hg3ao@!t-vF4CLvloJk*{7{P>>F-*~Hps$^+?8 z5Df!#9(WDKt->ifY1PbE}gxMr60*UhW%vL zyOEW^PkpoZq?4OY`0*l8Y0#97sx_8Y!0JbBLz{qDsa< zKbh&sZ^ZF1)pGRB+K;z?n;mR~0g6YY35q9ZNY!ih(<+206&6#yiGZ3`9fWm+*8z;f zt18SOD~%bS(kRNIl@@vOfWjIyLHI5%ldA3|> zgre|Y!E{gMn8+-Pe_2z| hZu3^;Z0$jeD}I|y8=81GmeJkByI-T|JO+7L{{j`~!sP$} literal 0 HcmV?d00001 diff --git a/exps/example/yolox_pedestrian/coco_format/nano.py b/exps/example/yolox_pedestrian/coco_format/nano.py new file mode 100644 index 000000000..90cc639ad --- /dev/null +++ b/exps/example/yolox_pedestrian/coco_format/nano.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. 
+ +import os + +import torch.nn as nn + +from yolox.exp import Exp as MyExp + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.depth = 0.33 + self.width = 0.25 + self.input_size = (416, 416) + self.mosaic_scale = (0.5, 1.5) + self.random_size = (10, 20) + self.test_size = (416, 416) + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + self.enable_mixup = False + + # Define yourself dataset path + self.data_dir = "datasets/pedestrian_coco" + self.train_ann = "train_annotations.json" + self.val_ann = "valid_annotations.json" + + self.num_classes = 1 + + def get_model(self, sublinear=False): + + def init_yolo(M): + for m in M.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eps = 1e-3 + m.momentum = 0.03 + if "model" not in self.__dict__: + from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead + in_channels = [256, 512, 1024] + # NANO model use depthwise = True, which is main difference. + backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels, depthwise=True) + head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels, depthwise=True) + self.model = YOLOX(backbone, head) + + self.model.apply(init_yolo) + self.model.head.initialize_biases(1e-2) + return self.model diff --git a/exps/example/yolox_pedestrian/coco_format/yolox_s.py b/exps/example/yolox_pedestrian/coco_format/yolox_s.py new file mode 100644 index 000000000..97291a30d --- /dev/null +++ b/exps/example/yolox_pedestrian/coco_format/yolox_s.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. +import os + +from yolox.exp import Exp as MyExp + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.depth = 0.33 + self.width = 0.50 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + + # Define yourself dataset path + self.data_dir = "datasets/pedestrian_coco" + self.train_ann = "train_annotations.json" + self.val_ann = "valid_annotations.json" + + self.num_classes = 1 + + self.max_epoch = 10 + self.data_num_workers = 4 + self.eval_interval = 1 diff --git a/exps/example/yolox_pedestrian/voc_format/__pycache__/yolox_voc_nano.cpython-38.pyc b/exps/example/yolox_pedestrian/voc_format/__pycache__/yolox_voc_nano.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4472ab2e8aabe86e166b380acd0373655427d5e7 GIT binary patch literal 3782 zcmai1ON<;x8SeLM9{aEp;|OB&6oX)w1c87!9ulwDiDa)0c1%1xn%=3}*&cWIjH`R> z-KFQWh8)mxjB#$jMxl;;1)|Ujz0|giU1Lwf22l@9WUc$BCH%wLNWj`<$n7=~Gta#`nDw2HPg-WUzxZ!71?+P*IRu+krf``W*6FRl+# zzStinakRKJ=#EmJ75UsPP8x(HP{M3fE_zE~aKkTaeaB95}f`-5({HV|o4O!}xH zi?YGOa8tP01r#u^XO5lH!Z;hnDIMmsV8K*TpC#>YcEDc zcYOs;ewIm-4|D0IIIgT1r7c71*QB0`CtyM)oB1 z>+xjaLNe$^Nq%wRXo|E|kAi5+l(vF_^y%lu3(GfFs_LsR@rWHDPPABqH)3+fE!b0w zv#`4!Wf@O$umeH&L7em6k5B*R%3rr0wp(?=Ig7i5^SZ**f?6*ZpM;g-QzSl3f`A~1 z#RS%IY%bPQ4(+Qa5at)D=lAu?6i)bk5-d7jc%q-hg>K6d-k!21*lU24xmyj@5Po%nAVY`O*-t1D~gMPw#0@X=AvDaBaJm$ku`VhF|Bb1Bs=@2F>aLgvccvqXb2`~S<}nL&-HP$ zY$9mpS1J%WvVQyer6peQZV?YM*}Ns9ID^P?;tn}`2S`;Yzo}?O$Y2RK#MME>xR5h1 zXKQrQ{IGN8+8~eO?v%b`LJGk@(k@8Cr6)B*WS`je0!Hd4brB(_WYZlF}UtJ`{uQ!3JsbrIRYN#YKFIHiUE;?+K2O z_hE3BCsDz{>Ig^r0mOx@;d01_o4T5wR&x2Vzq;CWJQ>f5JSQ-$^vqSc##HBEQlo%mq{#>pl!hwSON}|7K>qelfn-c zCal_(nea?pp@Fl?>r)$O&8V0cFH>(b&aq#*n^s2!XVS?AVKnOfAKSk}X1qj#(f~O} zOC$<>;oS$3=;gAm5EWL9k8{Ng8LP~y`sqH%I}nQdcOgNsjgRJ>}JiXb$6z8_He79D5}=hBYo|lbLg+1D5c>5`O7}C7KhKFw`l5Dx&0w8pKy%xwP+tf&n< zEkk^r>fa#o9KbOQiohm`2H7~AHgF-PSy~l_ZH|SFV)-RCQ 
zD98!A+Qlsh(8)k_k#CKPI6?A79+$Ogl+7<4L++5I=5F$PBV5wBpC(P>Hff}=X$8lS z72l;LTw0g$`^V`R;&w9Y?kI!Q5izZm9rIH zk=aT~F8Y`tViF-utOO{@A4!G!znvY(Kt#9)&CexRB}ZPdff|71RqU|d6E7N%gHv%Q9ZmQ H>*jv}g~8Jr literal 0 HcmV?d00001 diff --git a/exps/example/yolox_pedestrian/voc_format/yolox_voc_nano.py b/exps/example/yolox_pedestrian/voc_format/yolox_voc_nano.py new file mode 100644 index 000000000..a3d02e62a --- /dev/null +++ b/exps/example/yolox_pedestrian/voc_format/yolox_voc_nano.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +from yolox.data import get_yolox_datadir +from yolox.exp import Exp as MyExp +import os +import random +import torch.nn as nn +import torch +import torch.distributed as dist + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.num_classes = 1 + self.depth = 0.33 + self.width = 0.25 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + self.enable_mixup = False + + def get_model(self, sublinear=False): + + def init_yolo(M): + for m in M.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eps = 1e-3 + m.momentum = 0.03 + if "model" not in self.__dict__: + from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead + in_channels = [256, 512, 1024] + # NANO model use depthwise = True, which is main difference. + backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels, depthwise=True) + head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels, depthwise=True) + self.model = YOLOX(backbone, head) + + self.model.apply(init_yolo) + self.model.head.initialize_biases(1e-2) + return self.model + + def get_data_loader(self, batch_size, is_distributed, no_aug=False): + from yolox.data import ( + VOCDetection, + TrainTransform, + YoloBatchSampler, + DataLoader, + InfiniteSampler, + MosaicDetection, + ) + + dataset = VOCDetection( + data_dir=os.path.join(get_yolox_datadir(), "pedestrian_voc"), + image_sets=[('train')], + img_size=self.input_size, + preproc=TrainTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + max_labels=50, + ), + ) + + dataset = MosaicDetection( + dataset, + mosaic=not no_aug, + img_size=self.input_size, + preproc=TrainTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + max_labels=120, + ), + degrees=self.degrees, + translate=self.translate, + scale=self.scale, + shear=self.shear, + perspective=self.perspective, + enable_mixup=self.enable_mixup, + ) + + self.dataset = dataset + + if is_distributed: + batch_size = batch_size // dist.get_world_size() + + sampler = InfiniteSampler( + len(self.dataset), seed=self.seed if self.seed else 0 + ) + + batch_sampler = YoloBatchSampler( + sampler=sampler, + batch_size=batch_size, + drop_last=False, + input_dimension=self.input_size, + mosaic=not no_aug, + ) + + dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} + dataloader_kwargs["batch_sampler"] = batch_sampler + train_loader = DataLoader(self.dataset, **dataloader_kwargs) + + return train_loader + + def get_eval_loader(self, batch_size, is_distributed, testdev=False): + from yolox.data import VOCDetection, ValTransform + + valdataset = VOCDetection( + data_dir=os.path.join(get_yolox_datadir(), "pedestrian_voc"), + image_sets=[('valid')], + img_size=self.test_size, + preproc=ValTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + ), + ) + + if is_distributed: + batch_size = batch_size // dist.get_world_size() + sampler = 
torch.utils.data.distributed.DistributedSampler( + valdataset, shuffle=False + ) + else: + sampler = torch.utils.data.SequentialSampler(valdataset) + + dataloader_kwargs = { + "num_workers": self.data_num_workers, + "pin_memory": True, + "sampler": sampler, + } + dataloader_kwargs["batch_size"] = batch_size + val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs) + + return val_loader + + def get_evaluator(self, batch_size, is_distributed, testdev=False): + from yolox.evaluators import VOCEvaluator + + val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) + evaluator = VOCEvaluator( + dataloader=val_loader, + img_size=self.test_size, + confthre=self.test_conf, + nmsthre=self.nmsthre, + num_classes=self.num_classes, + ) + return evaluator diff --git a/exps/example/yolox_pedestrian/voc_format/yolox_voc_nano_adam.py b/exps/example/yolox_pedestrian/voc_format/yolox_voc_nano_adam.py new file mode 100644 index 000000000..8e05e67ff --- /dev/null +++ b/exps/example/yolox_pedestrian/voc_format/yolox_voc_nano_adam.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +import os +import random +import torch.nn as nn +import torch +import torch.distributed as dist +from yolox.exp import Exp as MyExp +from yolox.data import get_yolox_datadir + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.num_classes = 1 + self.depth = 0.33 + self.width = 0.25 + self.scale = (0.5, 1.5) + self.random_size = (10, 20) + self.eps = 1e-8 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + self.enable_mixup = False + + def get_model(self, sublinear=False): + + def init_yolo(M): + for m in M.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eps = 1e-3 + m.momentum = 0.03 + if "model" not in self.__dict__: + from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead + in_channels = [256, 512, 1024] + # NANO model use depthwise = True, which is main difference. 
+ backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels, depthwise=True) + head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels, depthwise=True) + self.model = YOLOX(backbone, head) + + self.model.apply(init_yolo) + self.model.head.initialize_biases(1e-2) + return self.model + + def get_optimizer(self, batch_size): + if "optimizer" not in self.__dict__: + if self.warmup_epochs > 0: + lr = self.warmup_lr + else: + lr = self.basic_lr_per_img * batch_size + + pg0, pg1, pg2 = [], [], [] # optimizer parameter groups + + for k, v in self.model.named_modules(): + if hasattr(v, "bias") and isinstance(v.bias, nn.Parameter): + pg2.append(v.bias) # biases + if isinstance(v, nn.BatchNorm2d) or "bn" in k: + pg0.append(v.weight) # no decay + elif hasattr(v, "weight") and isinstance(v.weight, nn.Parameter): + pg1.append(v.weight) # apply decay + + optimizer = torch.optim.Adam( + pg0, lr=lr, eps=self.eps, amsgrad=False + ) + optimizer.add_param_group( + {"params": pg1, "weight_decay": self.weight_decay} + ) # add pg1 with weight_decay + optimizer.add_param_group({"params": pg2}) + self.optimizer = optimizer + + return self.optimizer + + def get_data_loader(self, batch_size, is_distributed, no_aug=False): + from yolox.data import ( + VOCDetection, + TrainTransform, + YoloBatchSampler, + DataLoader, + InfiniteSampler, + MosaicDetection, + ) + + dataset = VOCDetection( + data_dir=os.path.join(get_yolox_datadir(), "pedestrian_voc"), + image_sets=[('train')], + img_size=self.input_size, + preproc=TrainTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + max_labels=50, + ), + ) + + dataset = MosaicDetection( + dataset, + mosaic=not no_aug, + img_size=self.input_size, + preproc=TrainTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + max_labels=120, + ), + degrees=self.degrees, + translate=self.translate, + scale=self.scale, + shear=self.shear, + perspective=self.perspective, + enable_mixup=self.enable_mixup, + ) + + self.dataset = dataset + + if is_distributed: + batch_size = batch_size // dist.get_world_size() + + sampler = InfiniteSampler( + len(self.dataset), seed=self.seed if self.seed else 0 + ) + + batch_sampler = YoloBatchSampler( + sampler=sampler, + batch_size=batch_size, + drop_last=False, + input_dimension=self.input_size, + mosaic=not no_aug, + ) + + dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} + dataloader_kwargs["batch_sampler"] = batch_sampler + train_loader = DataLoader(self.dataset, **dataloader_kwargs) + + return train_loader + + def get_eval_loader(self, batch_size, is_distributed, testdev=False): + from yolox.data import VOCDetection, ValTransform + + valdataset = VOCDetection( + data_dir=os.path.join(get_yolox_datadir(), "pedestrian_voc"), + image_sets=[('valid')], + img_size=self.test_size, + preproc=ValTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + ), + ) + + if is_distributed: + batch_size = batch_size // dist.get_world_size() + sampler = torch.utils.data.distributed.DistributedSampler( + valdataset, shuffle=False + ) + else: + sampler = torch.utils.data.SequentialSampler(valdataset) + + dataloader_kwargs = { + "num_workers": self.data_num_workers, + "pin_memory": True, + "sampler": sampler, + } + dataloader_kwargs["batch_size"] = batch_size + val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs) + + return val_loader + + def get_evaluator(self, batch_size, is_distributed, testdev=False): + from yolox.evaluators import 
VOCEvaluator + + val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) + evaluator = VOCEvaluator( + dataloader=val_loader, + img_size=self.test_size, + confthre=self.test_conf, + nmsthre=self.nmsthre, + num_classes=self.num_classes, + ) + return evaluator diff --git a/exps/example/yolox_pedestrian/voc_format/yolox_voc_s.py b/exps/example/yolox_pedestrian/voc_format/yolox_voc_s.py new file mode 100644 index 000000000..fa27310ab --- /dev/null +++ b/exps/example/yolox_pedestrian/voc_format/yolox_voc_s.py @@ -0,0 +1,123 @@ +# encoding: utf-8 +import os +import random +import torch +import torch.nn as nn +import torch.distributed as dist + +from yolox.exp import Exp as MyExp +from yolox.data import get_yolox_datadir + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.num_classes = 1 + self.depth = 0.33 + self.width = 0.50 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + + def get_data_loader(self, batch_size, is_distributed, no_aug=False): + from yolox.data import ( + VOCDetection, + TrainTransform, + YoloBatchSampler, + DataLoader, + InfiniteSampler, + MosaicDetection, + ) + + dataset = VOCDetection( + data_dir=os.path.join(get_yolox_datadir(), "pedestrian_voc"), + image_sets=[('train')], + img_size=self.input_size, + preproc=TrainTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + max_labels=50, + ), + ) + + dataset = MosaicDetection( + dataset, + mosaic=not no_aug, + img_size=self.input_size, + preproc=TrainTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + max_labels=120, + ), + degrees=self.degrees, + translate=self.translate, + scale=self.scale, + shear=self.shear, + perspective=self.perspective, + enable_mixup=self.enable_mixup, + ) + + self.dataset = dataset + + if is_distributed: + batch_size = batch_size // dist.get_world_size() + + sampler = InfiniteSampler( + len(self.dataset), seed=self.seed if self.seed else 0 + ) + + batch_sampler = YoloBatchSampler( + sampler=sampler, + batch_size=batch_size, + drop_last=False, + input_dimension=self.input_size, + mosaic=not no_aug, + ) + + dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} + dataloader_kwargs["batch_sampler"] = batch_sampler + train_loader = DataLoader(self.dataset, **dataloader_kwargs) + + return train_loader + + def get_eval_loader(self, batch_size, is_distributed, testdev=False): + from yolox.data import VOCDetection, ValTransform + + valdataset = VOCDetection( + data_dir=os.path.join(get_yolox_datadir(), "pedestrian_voc"), + image_sets=[('valid')], + img_size=self.test_size, + preproc=ValTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + ), + ) + + if is_distributed: + batch_size = batch_size // dist.get_world_size() + sampler = torch.utils.data.distributed.DistributedSampler( + valdataset, shuffle=False + ) + else: + sampler = torch.utils.data.SequentialSampler(valdataset) + + dataloader_kwargs = { + "num_workers": self.data_num_workers, + "pin_memory": True, + "sampler": sampler, + } + dataloader_kwargs["batch_size"] = batch_size + val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs) + + return val_loader + + def get_evaluator(self, batch_size, is_distributed, testdev=False): + from yolox.evaluators import VOCEvaluator + + val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) + evaluator = VOCEvaluator( + dataloader=val_loader, + img_size=self.test_size, + confthre=self.test_conf, + 
nmsthre=self.nmsthre, + num_classes=self.num_classes, + ) + return evaluator From 6b262d246ca94cd81c3167c9b38cdc70e0617b08 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sat, 9 Jul 2022 22:15:07 +0800 Subject: [PATCH 02/59] Add files via upload --- .../__pycache__/yolox_voc_nano.cpython-38.pyc | Bin 0 -> 3838 bytes exps/example/yolox_voc_nano/yolox_voc_nano.py | 147 +++++++++++++++ .../yolox_voc_nano_adam.py | 178 ++++++++++++++++++ exps/example/yolox_voc_tiny/yolox_voc_tiny.py | 146 ++++++++++++++ 4 files changed, 471 insertions(+) create mode 100644 exps/example/yolox_voc_nano/__pycache__/yolox_voc_nano.cpython-38.pyc create mode 100644 exps/example/yolox_voc_nano/yolox_voc_nano.py create mode 100644 exps/example/yolox_voc_nano_adam/yolox_voc_nano_adam.py create mode 100644 exps/example/yolox_voc_tiny/yolox_voc_tiny.py diff --git a/exps/example/yolox_voc_nano/__pycache__/yolox_voc_nano.cpython-38.pyc b/exps/example/yolox_voc_nano/__pycache__/yolox_voc_nano.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0b3dc19292760e1ae41ca89f792124e0826856ce GIT binary patch literal 3838 zcmai1O^h5z74HA(?)lqYI{^oX%}g{Xwy078sJQH>QIj3oVqsrmOLgf-lHiBRp(_^huzV(i->PBLAOucNgI+kj) z6Q@&|wz(aby8~~>XU49s+RV6`)H-#gTX7?4cAC0&Rbw`D9%#(r^)0hgWBN_4Re6L} zwdGBB@wtUN*I&7QyQNF>%K8Ye**?#MjbS`o4_KIoED}8$d-yS@m243T*T~+E@s_rt zGo2X^+>Xg=%wqNf9ah(w!z$2h)?hC4pgF9`d{%{4c}ugjjthHcgn_Ag`k_@ww^3-Z zY#Acga=p;Dj2&ZF!xo_jCNm%{We9}}BQBJ2(hY(rjq)In)pVQ$y*SJ= zo=KbWQNAke`w_#-7-rHMh18IZ;wYD{;9;y9@cl{@bA0de^-++9370jVhF!EJ(fXJU z-^zHr@=%vvP|hC&54C@{7gmP}Ul@$zC@j6TfC(~)Fd4;sp>)>0VGnbr!(-L?(T1pC zH?t^N4MNi!x}$zy(>H7I`TX9#mU`rby~l&%pj<}D9zZDPY7>J_d!VgV1S2QMdSXn> z#GF`(HL>9rxSP3sbK(>h#@Q%N;V_T++t#E~P`z@b%`MELQn*{>Nf$ltj#nr^lRofw zs;o-zrW(9fTRs93GMFOiCK>q+h(g=ZcC|(AjW14gz(voEfw^S}7jV%56a&(}>nZ@4 z%l%It{_79dx;NVW_n#&dyxVR$vXTthIEG{WD2vi857Qo(Mw-g%_rtumx;zxgc_vLh z%A}jXFKIqbT87lGNj(uyz=Tpf2~BzmFyx;vkby3Qrq}Ru^vyGL^b~7YkH_;D;$bh0 zvlr&~KasZTQCMu5(pGUQJ$l`Eap}f#sos8xhwKpoLW?w$;tPw{&n*~yHz7Vv#3Xat|1OrXn6vhr**t++>N7@heDjDZ;#FV3{B;B!`@nVm~zpg zA;(W}|AbDVH#-TT8LsW?Cw1RA30X600Q6@4$Z7Tn6MIh+3ISgM`C}*qHC=HY*$_&aZt8M!eAKz%fTw#c}fzT5>CwH}{w25B0`4bp3w*_h&JSpBInHsRR zzHLmZg;P`uozZMnT9Gxk?Fp^%IX1&)-!UfjqE^(|+yxEcgjrVfqW%kg(kL1Tn-j|= zj2u~e{rcrap7UNF4O7{;CBi6$NV63HTh818Ru#@~DsBE^V=y)Z6Y&Yc(Eg%2K@eg99t9$oZ5YWIJ4 z@3+5tYT+;K{x2{8JpTLRKey5Or4D=kqw62G`x}qGe$M^P2kricAE>AO=sZ40Jb0qt z4H6E^rJ3bS`boGR#9O^8;Fvy&x4HT2-qm;2N`fGD2V%U4MPP2K^ zNz_inB~-X#PKafn44aD~JYiK`@DB0<~2 zQD6xiptM*7(;F0%urOxjuFROH;$<2*qfY(M23k!;1mcI(+lVskH_JuT9p{`$Cmjai zxc`4_zeQ&JfW!@&WR{kQ6&{1N`=RJ(vZfFgl#Ry;q)Emqv&#N&Vv?)_q3LHlL*0dz zZj`@ItxudP=@x$bZ8p`WzmJ_+$FM*;hVE*O&m3zyiuxZrZcTUfhOv2KkBknjdxVmR zO^HkuCA|&hC?zQhG6&WUa+*B~>Jus8{-k7%RkjW0;y!giA#Z|0KvpG%a&J9tSe=kV;aP{sGk=I2O%p2S6nB?Lz$TOF^g#Qavv6qM14>m-zK z2_UUG@hUYaEWb(GK5oU2sH$M=(LxG%;x%e~9YWgqQ1p;(KLD<{LrYX>U2xlhdC~`~(NYMFau+|Qfzqp=w8(5VNhV@^Al68P^l9l|`d$pFk>aFPIpz(u zaQWpxeY8K{@%XP({O|Yv41>!g0x>+JVtg9ekDbcq>|WgNk8kO~?cSe9855=L$e8?q z@;2m&lsyH(-EkO~Ju2B2#4_Ry5^7@^*;t?E^gSNOL2$^<6Eu$UMfHEi43ta}gkE#~ z6aGd2v}(uyNE8mz!EMBb3GP*b{Ct`U%8W#p`ew_vL% 0: + lr = self.warmup_lr + else: + lr = self.basic_lr_per_img * batch_size + + pg0, pg1, pg2 = [], [], [] # optimizer parameter groups + + for k, v in self.model.named_modules(): + if hasattr(v, "bias") and isinstance(v.bias, nn.Parameter): + pg2.append(v.bias) # biases + if isinstance(v, nn.BatchNorm2d) or "bn" in k: + pg0.append(v.weight) # no decay + elif hasattr(v, "weight") and isinstance(v.weight, nn.Parameter): + 
pg1.append(v.weight) # apply decay + + optimizer = torch.optim.Adam( + pg0, lr=lr, eps=self.eps, amsgrad=False + ) + optimizer.add_param_group( + {"params": pg1, "weight_decay": self.weight_decay} + ) # add pg1 with weight_decay + optimizer.add_param_group({"params": pg2}) + self.optimizer = optimizer + + return self.optimizer + + def get_data_loader(self, batch_size, is_distributed, no_aug=False): + from yolox.data import ( + VOCDetection, + TrainTransform, + YoloBatchSampler, + DataLoader, + InfiniteSampler, + MosaicDetection, + ) + + dataset = VOCDetection( + data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"), + image_sets=[('2007', 'trainval'), ('2012', 'trainval')], + img_size=self.input_size, + preproc=TrainTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + max_labels=50, + ), + ) + + dataset = MosaicDetection( + dataset, + mosaic=not no_aug, + img_size=self.input_size, + preproc=TrainTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + max_labels=120, + ), + degrees=self.degrees, + translate=self.translate, + scale=self.scale, + shear=self.shear, + perspective=self.perspective, + enable_mixup=self.enable_mixup, + ) + + self.dataset = dataset + + if is_distributed: + batch_size = batch_size // dist.get_world_size() + + sampler = InfiniteSampler( + len(self.dataset), seed=self.seed if self.seed else 0 + ) + + batch_sampler = YoloBatchSampler( + sampler=sampler, + batch_size=batch_size, + drop_last=False, + input_dimension=self.input_size, + mosaic=not no_aug, + ) + + dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} + dataloader_kwargs["batch_sampler"] = batch_sampler + train_loader = DataLoader(self.dataset, **dataloader_kwargs) + + return train_loader + + def get_eval_loader(self, batch_size, is_distributed, testdev=False): + from yolox.data import VOCDetection, ValTransform + + valdataset = VOCDetection( + data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"), + image_sets=[('2007', 'test')], + img_size=self.test_size, + preproc=ValTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + ), + ) + + if is_distributed: + batch_size = batch_size // dist.get_world_size() + sampler = torch.utils.data.distributed.DistributedSampler( + valdataset, shuffle=False + ) + else: + sampler = torch.utils.data.SequentialSampler(valdataset) + + dataloader_kwargs = { + "num_workers": self.data_num_workers, + "pin_memory": True, + "sampler": sampler, + } + dataloader_kwargs["batch_size"] = batch_size + val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs) + + return val_loader + + def get_evaluator(self, batch_size, is_distributed, testdev=False): + from yolox.evaluators import VOCEvaluator + + val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) + evaluator = VOCEvaluator( + dataloader=val_loader, + img_size=self.test_size, + confthre=self.test_conf, + nmsthre=self.nmsthre, + num_classes=self.num_classes, + ) + return evaluator diff --git a/exps/example/yolox_voc_tiny/yolox_voc_tiny.py b/exps/example/yolox_voc_tiny/yolox_voc_tiny.py new file mode 100644 index 000000000..499b2a59a --- /dev/null +++ b/exps/example/yolox_voc_tiny/yolox_voc_tiny.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. 
+ +import os +import random +import torch.nn as nn +import torch +import torch.distributed as dist +import sys +sys.path.append(r'D:/YOLOX') +from yolox.exp import Exp as MyExp +from yolox.data import get_yolox_datadir + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.depth = 0.33 + self.width = 0.375 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + + def get_model(self, sublinear=False): + + def init_yolo(M): + for m in M.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eps = 1e-3 + m.momentum = 0.03 + + if "model" not in self.__dict__: + from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead + in_channels = [256, 512, 1024] + # NANO model use depthwise = True, which is main difference. + backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels, depthwise=True) + head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels, depthwise=True) + self.model = YOLOX(backbone, head) + + self.model.apply(init_yolo) + self.model.head.initialize_biases(1e-2) + return self.model + + def get_data_loader(self, batch_size, is_distributed, no_aug=False): + from yolox.data import ( + VOCDetection, + TrainTransform, + YoloBatchSampler, + DataLoader, + InfiniteSampler, + MosaicDetection, + ) + + dataset = VOCDetection( + data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"), + image_sets=[('2007', 'trainval'), ('2012', 'trainval')], + img_size=self.input_size, + preproc=TrainTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + max_labels=50, + ), + ) + + dataset = MosaicDetection( + dataset, + mosaic=not no_aug, + img_size=self.input_size, + preproc=TrainTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + max_labels=120, + ), + degrees=self.degrees, + translate=self.translate, + scale=self.scale, + shear=self.shear, + perspective=self.perspective, + enable_mixup=self.enable_mixup, + ) + + self.dataset = dataset + + if is_distributed: + batch_size = batch_size // dist.get_world_size() + + sampler = InfiniteSampler( + len(self.dataset), seed=self.seed if self.seed else 0 + ) + + batch_sampler = YoloBatchSampler( + sampler=sampler, + batch_size=batch_size, + drop_last=False, + input_dimension=self.input_size, + mosaic=not no_aug, + ) + + dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} + dataloader_kwargs["batch_sampler"] = batch_sampler + train_loader = DataLoader(self.dataset, **dataloader_kwargs) + + return train_loader + + def get_eval_loader(self, batch_size, is_distributed, testdev=False): + from yolox.data import VOCDetection, ValTransform + + valdataset = VOCDetection( + data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"), + image_sets=[('2007', 'test')], + img_size=self.test_size, + preproc=ValTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + ), + ) + + if is_distributed: + batch_size = batch_size // dist.get_world_size() + sampler = torch.utils.data.distributed.DistributedSampler( + valdataset, shuffle=False + ) + else: + sampler = torch.utils.data.SequentialSampler(valdataset) + + dataloader_kwargs = { + "num_workers": self.data_num_workers, + "pin_memory": True, + "sampler": sampler, + } + dataloader_kwargs["batch_size"] = batch_size + val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs) + + return val_loader + + def get_evaluator(self, batch_size, is_distributed, testdev=False): + from yolox.evaluators import VOCEvaluator + + val_loader = self.get_eval_loader(batch_size, 
is_distributed, testdev=testdev) + evaluator = VOCEvaluator( + dataloader=val_loader, + img_size=self.test_size, + confthre=self.test_conf, + nmsthre=self.nmsthre, + num_classes=self.num_classes, + ) + return evaluator From 812f20e47e279a02806e360c48183d92a0451ef1 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sat, 9 Jul 2022 22:17:03 +0800 Subject: [PATCH 03/59] Rename nano.py to yolox_nano.py --- exps/example/custom/{nano.py => yolox_nano.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename exps/example/custom/{nano.py => yolox_nano.py} (100%) diff --git a/exps/example/custom/nano.py b/exps/example/custom/yolox_nano.py similarity index 100% rename from exps/example/custom/nano.py rename to exps/example/custom/yolox_nano.py From 9950ca2d88ed246a3773b5d0af8f1ae3a42e2011 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sat, 9 Jul 2022 22:19:37 +0800 Subject: [PATCH 04/59] Rename nano.py to yolox_nano.py --- .../yolox_pedestrian/coco_format/{nano.py => yolox_nano.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename exps/example/yolox_pedestrian/coco_format/{nano.py => yolox_nano.py} (100%) diff --git a/exps/example/yolox_pedestrian/coco_format/nano.py b/exps/example/yolox_pedestrian/coco_format/yolox_nano.py similarity index 100% rename from exps/example/yolox_pedestrian/coco_format/nano.py rename to exps/example/yolox_pedestrian/coco_format/yolox_nano.py From d09ad1d9f0376fab246332456494703ca0a82672 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sat, 9 Jul 2022 22:21:17 +0800 Subject: [PATCH 05/59] Delete exps/example/yolox_voc directory --- exps/example/yolox_voc/yolox_voc_s.py | 138 -------------------------- 1 file changed, 138 deletions(-) delete mode 100644 exps/example/yolox_voc/yolox_voc_s.py diff --git a/exps/example/yolox_voc/yolox_voc_s.py b/exps/example/yolox_voc/yolox_voc_s.py deleted file mode 100644 index e5cdb6103..000000000 --- a/exps/example/yolox_voc/yolox_voc_s.py +++ /dev/null @@ -1,138 +0,0 @@ -# encoding: utf-8 -import os - -import torch -import torch.distributed as dist - -from yolox.data import get_yolox_datadir -from yolox.exp import Exp as MyExp - - -class Exp(MyExp): - def __init__(self): - super(Exp, self).__init__() - self.num_classes = 20 - self.depth = 0.33 - self.width = 0.50 - self.warmup_epochs = 1 - - # ---------- transform config ------------ # - self.mosaic_prob = 1.0 - self.mixup_prob = 1.0 - self.hsv_prob = 1.0 - self.flip_prob = 0.5 - - self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] - - def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img=False): - from yolox.data import ( - VOCDetection, - TrainTransform, - YoloBatchSampler, - DataLoader, - InfiniteSampler, - MosaicDetection, - worker_init_reset_seed, - ) - from yolox.utils import ( - wait_for_the_master, - get_local_rank, - ) - local_rank = get_local_rank() - - with wait_for_the_master(local_rank): - dataset = VOCDetection( - data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"), - image_sets=[('2007', 'trainval'), ('2012', 'trainval')], - img_size=self.input_size, - preproc=TrainTransform( - max_labels=50, - flip_prob=self.flip_prob, - hsv_prob=self.hsv_prob), - cache=cache_img, - ) - - dataset = MosaicDetection( - dataset, - mosaic=not no_aug, - img_size=self.input_size, - preproc=TrainTransform( - max_labels=120, - flip_prob=self.flip_prob, - hsv_prob=self.hsv_prob), - degrees=self.degrees, - 
translate=self.translate, - mosaic_scale=self.mosaic_scale, - mixup_scale=self.mixup_scale, - shear=self.shear, - enable_mixup=self.enable_mixup, - mosaic_prob=self.mosaic_prob, - mixup_prob=self.mixup_prob, - ) - - self.dataset = dataset - - if is_distributed: - batch_size = batch_size // dist.get_world_size() - - sampler = InfiniteSampler( - len(self.dataset), seed=self.seed if self.seed else 0 - ) - - batch_sampler = YoloBatchSampler( - sampler=sampler, - batch_size=batch_size, - drop_last=False, - mosaic=not no_aug, - ) - - dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} - dataloader_kwargs["batch_sampler"] = batch_sampler - - # Make sure each process has different random seed, especially for 'fork' method - dataloader_kwargs["worker_init_fn"] = worker_init_reset_seed - - train_loader = DataLoader(self.dataset, **dataloader_kwargs) - - return train_loader - - def get_eval_loader(self, batch_size, is_distributed, testdev=False, legacy=False): - from yolox.data import VOCDetection, ValTransform - - valdataset = VOCDetection( - data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"), - image_sets=[('2007', 'test')], - img_size=self.test_size, - preproc=ValTransform(legacy=legacy), - ) - - if is_distributed: - batch_size = batch_size // dist.get_world_size() - sampler = torch.utils.data.distributed.DistributedSampler( - valdataset, shuffle=False - ) - else: - sampler = torch.utils.data.SequentialSampler(valdataset) - - dataloader_kwargs = { - "num_workers": self.data_num_workers, - "pin_memory": True, - "sampler": sampler, - } - dataloader_kwargs["batch_size"] = batch_size - val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs) - - return val_loader - - def get_evaluator(self, batch_size, is_distributed, testdev=False, legacy=False): - from yolox.evaluators import VOCEvaluator - - val_loader = self.get_eval_loader(batch_size, is_distributed, testdev, legacy) - evaluator = VOCEvaluator( - dataloader=val_loader, - img_size=self.test_size, - confthre=self.test_conf, - nmsthre=self.nmsthre, - num_classes=self.num_classes, - ) - return evaluator From 66a3ead98275506f26d97f5f1612c542cee20a4b Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sat, 9 Jul 2022 22:21:30 +0800 Subject: [PATCH 06/59] Delete exps/example/yolox_voc_nano directory --- .../__pycache__/yolox_voc_nano.cpython-38.pyc | Bin 3838 -> 0 bytes exps/example/yolox_voc_nano/yolox_voc_nano.py | 147 ------------------ 2 files changed, 147 deletions(-) delete mode 100644 exps/example/yolox_voc_nano/__pycache__/yolox_voc_nano.cpython-38.pyc delete mode 100644 exps/example/yolox_voc_nano/yolox_voc_nano.py diff --git a/exps/example/yolox_voc_nano/__pycache__/yolox_voc_nano.cpython-38.pyc b/exps/example/yolox_voc_nano/__pycache__/yolox_voc_nano.cpython-38.pyc deleted file mode 100644 index 0b3dc19292760e1ae41ca89f792124e0826856ce..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3838 zcmai1O^h5z74HA(?)lqYI{^oX%}g{Xwy078sJQH>QIj3oVqsrmOLgf-lHiBRp(_^huzV(i->PBLAOucNgI+kj) z6Q@&|wz(aby8~~>XU49s+RV6`)H-#gTX7?4cAC0&Rbw`D9%#(r^)0hgWBN_4Re6L} zwdGBB@wtUN*I&7QyQNF>%K8Ye**?#MjbS`o4_KIoED}8$d-yS@m243T*T~+E@s_rt zGo2X^+>Xg=%wqNf9ah(w!z$2h)?hC4pgF9`d{%{4c}ugjjthHcgn_Ag`k_@ww^3-Z zY#Acga=p;Dj2&ZF!xo_jCNm%{We9}}BQBJ2(hY(rjq)In)pVQ$y*SJ= zo=KbWQNAke`w_#-7-rHMh18IZ;wYD{;9;y9@cl{@bA0de^-++9370jVhF!EJ(fXJU z-^zHr@=%vvP|hC&54C@{7gmP}Ul@$zC@j6TfC(~)Fd4;sp>)>0VGnbr!(-L?(T1pC 
zH?t^N4MNi!x}$zy(>H7I`TX9#mU`rby~l&%pj<}D9zZDPY7>J_d!VgV1S2QMdSXn> z#GF`(HL>9rxSP3sbK(>h#@Q%N;V_T++t#E~P`z@b%`MELQn*{>Nf$ltj#nr^lRofw zs;o-zrW(9fTRs93GMFOiCK>q+h(g=ZcC|(AjW14gz(voEfw^S}7jV%56a&(}>nZ@4 z%l%It{_79dx;NVW_n#&dyxVR$vXTthIEG{WD2vi857Qo(Mw-g%_rtumx;zxgc_vLh z%A}jXFKIqbT87lGNj(uyz=Tpf2~BzmFyx;vkby3Qrq}Ru^vyGL^b~7YkH_;D;$bh0 zvlr&~KasZTQCMu5(pGUQJ$l`Eap}f#sos8xhwKpoLW?w$;tPw{&n*~yHz7Vv#3Xat|1OrXn6vhr**t++>N7@heDjDZ;#FV3{B;B!`@nVm~zpg zA;(W}|AbDVH#-TT8LsW?Cw1RA30X600Q6@4$Z7Tn6MIh+3ISgM`C}*qHC=HY*$_&aZt8M!eAKz%fTw#c}fzT5>CwH}{w25B0`4bp3w*_h&JSpBInHsRR zzHLmZg;P`uozZMnT9Gxk?Fp^%IX1&)-!UfjqE^(|+yxEcgjrVfqW%kg(kL1Tn-j|= zj2u~e{rcrap7UNF4O7{;CBi6$NV63HTh818Ru#@~DsBE^V=y)Z6Y&Yc(Eg%2K@eg99t9$oZ5YWIJ4 z@3+5tYT+;K{x2{8JpTLRKey5Or4D=kqw62G`x}qGe$M^P2kricAE>AO=sZ40Jb0qt z4H6E^rJ3bS`boGR#9O^8;Fvy&x4HT2-qm;2N`fGD2V%U4MPP2K^ zNz_inB~-X#PKafn44aD~JYiK`@DB0<~2 zQD6xiptM*7(;F0%urOxjuFROH;$<2*qfY(M23k!;1mcI(+lVskH_JuT9p{`$Cmjai zxc`4_zeQ&JfW!@&WR{kQ6&{1N`=RJ(vZfFgl#Ry;q)Emqv&#N&Vv?)_q3LHlL*0dz zZj`@ItxudP=@x$bZ8p`WzmJ_+$FM*;hVE*O&m3zyiuxZrZcTUfhOv2KkBknjdxVmR zO^HkuCA|&hC?zQhG6&WUa+*B~>Jus8{-k7%RkjW0;y!giA#Z|0KvpG%a&J9tSe=kV;aP{sGk=I2O%p2S6nB?Lz$TOF^g#Qavv6qM14>m-zK z2_UUG@hUYaEWb(GK5oU2sH$M=(LxG%;x%e~9YWgqQ1p;(KLD<{LrYX>U2xlhdC~`~(NYMFau+|Qfzqp=w8(5VNhV@^Al68P^l9l|`d$pFk>aFPIpz(u zaQWpxeY8K{@%XP({O|Yv41>!g0x>+JVtg9ekDbcq>|WgNk8kO~?cSe9855=L$e8?q z@;2m&lsyH(-EkO~Ju2B2#4_Ry5^7@^*;t?E^gSNOL2$^<6Eu$UMfHEi43ta}gkE#~ z6aGd2v}(uyNE8mz!EMBb3GP*b{Ct`U%8W#p`ew_vL% Date: Sat, 9 Jul 2022 22:21:40 +0800 Subject: [PATCH 07/59] Delete exps/example/yolox_voc_nano_adam directory --- .../yolox_voc_nano_adam.py | 178 ------------------ 1 file changed, 178 deletions(-) delete mode 100644 exps/example/yolox_voc_nano_adam/yolox_voc_nano_adam.py diff --git a/exps/example/yolox_voc_nano_adam/yolox_voc_nano_adam.py b/exps/example/yolox_voc_nano_adam/yolox_voc_nano_adam.py deleted file mode 100644 index 1663c88f9..000000000 --- a/exps/example/yolox_voc_nano_adam/yolox_voc_nano_adam.py +++ /dev/null @@ -1,178 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding:utf-8 -*- -# Copyright (c) Megvii, Inc. and its affiliates. - -import os -import random -import torch.nn as nn -import torch -import torch.distributed as dist -import sys -sys.path.append(r'D:/YOLOX') -from yolox.exp import Exp as MyExp -from yolox.data import get_yolox_datadir - - -class Exp(MyExp): - def __init__(self): - super(Exp, self).__init__() - self.num_classes = 1 - self.depth = 0.33 - self.width = 0.25 - self.scale = (0.5, 1.5) - self.random_size = (10, 20) - self.eps = 1e-8 - self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] - self.enable_mixup = False - - def get_model(self, sublinear=False): - - def init_yolo(M): - for m in M.modules(): - if isinstance(m, nn.BatchNorm2d): - m.eps = 1e-3 - m.momentum = 0.03 - if "model" not in self.__dict__: - from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead - in_channels = [256, 512, 1024] - # NANO model use depthwise = True, which is main difference. 
- backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels, depthwise=True) - head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels, depthwise=True) - self.model = YOLOX(backbone, head) - - self.model.apply(init_yolo) - self.model.head.initialize_biases(1e-2) - return self.model - - def get_optimizer(self, batch_size): - if "optimizer" not in self.__dict__: - if self.warmup_epochs > 0: - lr = self.warmup_lr - else: - lr = self.basic_lr_per_img * batch_size - - pg0, pg1, pg2 = [], [], [] # optimizer parameter groups - - for k, v in self.model.named_modules(): - if hasattr(v, "bias") and isinstance(v.bias, nn.Parameter): - pg2.append(v.bias) # biases - if isinstance(v, nn.BatchNorm2d) or "bn" in k: - pg0.append(v.weight) # no decay - elif hasattr(v, "weight") and isinstance(v.weight, nn.Parameter): - pg1.append(v.weight) # apply decay - - optimizer = torch.optim.Adam( - pg0, lr=lr, eps=self.eps, amsgrad=False - ) - optimizer.add_param_group( - {"params": pg1, "weight_decay": self.weight_decay} - ) # add pg1 with weight_decay - optimizer.add_param_group({"params": pg2}) - self.optimizer = optimizer - - return self.optimizer - - def get_data_loader(self, batch_size, is_distributed, no_aug=False): - from yolox.data import ( - VOCDetection, - TrainTransform, - YoloBatchSampler, - DataLoader, - InfiniteSampler, - MosaicDetection, - ) - - dataset = VOCDetection( - data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"), - image_sets=[('2007', 'trainval'), ('2012', 'trainval')], - img_size=self.input_size, - preproc=TrainTransform( - rgb_means=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), - max_labels=50, - ), - ) - - dataset = MosaicDetection( - dataset, - mosaic=not no_aug, - img_size=self.input_size, - preproc=TrainTransform( - rgb_means=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), - max_labels=120, - ), - degrees=self.degrees, - translate=self.translate, - scale=self.scale, - shear=self.shear, - perspective=self.perspective, - enable_mixup=self.enable_mixup, - ) - - self.dataset = dataset - - if is_distributed: - batch_size = batch_size // dist.get_world_size() - - sampler = InfiniteSampler( - len(self.dataset), seed=self.seed if self.seed else 0 - ) - - batch_sampler = YoloBatchSampler( - sampler=sampler, - batch_size=batch_size, - drop_last=False, - input_dimension=self.input_size, - mosaic=not no_aug, - ) - - dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} - dataloader_kwargs["batch_sampler"] = batch_sampler - train_loader = DataLoader(self.dataset, **dataloader_kwargs) - - return train_loader - - def get_eval_loader(self, batch_size, is_distributed, testdev=False): - from yolox.data import VOCDetection, ValTransform - - valdataset = VOCDetection( - data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"), - image_sets=[('2007', 'test')], - img_size=self.test_size, - preproc=ValTransform( - rgb_means=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), - ), - ) - - if is_distributed: - batch_size = batch_size // dist.get_world_size() - sampler = torch.utils.data.distributed.DistributedSampler( - valdataset, shuffle=False - ) - else: - sampler = torch.utils.data.SequentialSampler(valdataset) - - dataloader_kwargs = { - "num_workers": self.data_num_workers, - "pin_memory": True, - "sampler": sampler, - } - dataloader_kwargs["batch_size"] = batch_size - val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs) - - return val_loader - - def get_evaluator(self, batch_size, is_distributed, testdev=False): - from 
yolox.evaluators import VOCEvaluator - - val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) - evaluator = VOCEvaluator( - dataloader=val_loader, - img_size=self.test_size, - confthre=self.test_conf, - nmsthre=self.nmsthre, - num_classes=self.num_classes, - ) - return evaluator From 0827b558cf4f125a9b79c97d2da6c2926e0e27d8 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sat, 9 Jul 2022 22:21:47 +0800 Subject: [PATCH 08/59] Delete exps/example/yolox_voc_tiny directory --- exps/example/yolox_voc_tiny/yolox_voc_tiny.py | 146 ------------------ 1 file changed, 146 deletions(-) delete mode 100644 exps/example/yolox_voc_tiny/yolox_voc_tiny.py diff --git a/exps/example/yolox_voc_tiny/yolox_voc_tiny.py b/exps/example/yolox_voc_tiny/yolox_voc_tiny.py deleted file mode 100644 index 499b2a59a..000000000 --- a/exps/example/yolox_voc_tiny/yolox_voc_tiny.py +++ /dev/null @@ -1,146 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding:utf-8 -*- -# Copyright (c) Megvii, Inc. and its affiliates. - -import os -import random -import torch.nn as nn -import torch -import torch.distributed as dist -import sys -sys.path.append(r'D:/YOLOX') -from yolox.exp import Exp as MyExp -from yolox.data import get_yolox_datadir - - -class Exp(MyExp): - def __init__(self): - super(Exp, self).__init__() - self.depth = 0.33 - self.width = 0.375 - self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] - - def get_model(self, sublinear=False): - - def init_yolo(M): - for m in M.modules(): - if isinstance(m, nn.BatchNorm2d): - m.eps = 1e-3 - m.momentum = 0.03 - - if "model" not in self.__dict__: - from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead - in_channels = [256, 512, 1024] - # NANO model use depthwise = True, which is main difference. 
- backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels, depthwise=True) - head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels, depthwise=True) - self.model = YOLOX(backbone, head) - - self.model.apply(init_yolo) - self.model.head.initialize_biases(1e-2) - return self.model - - def get_data_loader(self, batch_size, is_distributed, no_aug=False): - from yolox.data import ( - VOCDetection, - TrainTransform, - YoloBatchSampler, - DataLoader, - InfiniteSampler, - MosaicDetection, - ) - - dataset = VOCDetection( - data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"), - image_sets=[('2007', 'trainval'), ('2012', 'trainval')], - img_size=self.input_size, - preproc=TrainTransform( - rgb_means=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), - max_labels=50, - ), - ) - - dataset = MosaicDetection( - dataset, - mosaic=not no_aug, - img_size=self.input_size, - preproc=TrainTransform( - rgb_means=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), - max_labels=120, - ), - degrees=self.degrees, - translate=self.translate, - scale=self.scale, - shear=self.shear, - perspective=self.perspective, - enable_mixup=self.enable_mixup, - ) - - self.dataset = dataset - - if is_distributed: - batch_size = batch_size // dist.get_world_size() - - sampler = InfiniteSampler( - len(self.dataset), seed=self.seed if self.seed else 0 - ) - - batch_sampler = YoloBatchSampler( - sampler=sampler, - batch_size=batch_size, - drop_last=False, - input_dimension=self.input_size, - mosaic=not no_aug, - ) - - dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} - dataloader_kwargs["batch_sampler"] = batch_sampler - train_loader = DataLoader(self.dataset, **dataloader_kwargs) - - return train_loader - - def get_eval_loader(self, batch_size, is_distributed, testdev=False): - from yolox.data import VOCDetection, ValTransform - - valdataset = VOCDetection( - data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"), - image_sets=[('2007', 'test')], - img_size=self.test_size, - preproc=ValTransform( - rgb_means=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), - ), - ) - - if is_distributed: - batch_size = batch_size // dist.get_world_size() - sampler = torch.utils.data.distributed.DistributedSampler( - valdataset, shuffle=False - ) - else: - sampler = torch.utils.data.SequentialSampler(valdataset) - - dataloader_kwargs = { - "num_workers": self.data_num_workers, - "pin_memory": True, - "sampler": sampler, - } - dataloader_kwargs["batch_size"] = batch_size - val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs) - - return val_loader - - def get_evaluator(self, batch_size, is_distributed, testdev=False): - from yolox.evaluators import VOCEvaluator - - val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) - evaluator = VOCEvaluator( - dataloader=val_loader, - img_size=self.test_size, - confthre=self.test_conf, - nmsthre=self.nmsthre, - num_classes=self.num_classes, - ) - return evaluator From 4175e9e8b07cf6845d56d62eb8a9636bce672fc0 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sat, 9 Jul 2022 22:24:20 +0800 Subject: [PATCH 09/59] Add files via upload --- .../__pycache__/yolox_voc_nano.cpython-38.pyc | Bin 0 -> 3838 bytes .../yolox_voc_nano/yolox_voc_nano.py | 147 +++++++++++++++ .../yolox_voc_nano_adam.py | 178 ++++++++++++++++++ .../__pycache__/yolox_voc_s.cpython-38.pyc | Bin 0 -> 3045 bytes .../voc_format/yolox_voc_s/yolox_voc_s.py | 123 ++++++++++++ 
.../yolox_voc_tiny/yolox_voc_tiny.py | 146 ++++++++++++++ 6 files changed, 594 insertions(+) create mode 100644 exps/example/custom/voc_format/yolox_voc_nano/__pycache__/yolox_voc_nano.cpython-38.pyc create mode 100644 exps/example/custom/voc_format/yolox_voc_nano/yolox_voc_nano.py create mode 100644 exps/example/custom/voc_format/yolox_voc_nano_adam/yolox_voc_nano_adam.py create mode 100644 exps/example/custom/voc_format/yolox_voc_s/__pycache__/yolox_voc_s.cpython-38.pyc create mode 100644 exps/example/custom/voc_format/yolox_voc_s/yolox_voc_s.py create mode 100644 exps/example/custom/voc_format/yolox_voc_tiny/yolox_voc_tiny.py diff --git a/exps/example/custom/voc_format/yolox_voc_nano/__pycache__/yolox_voc_nano.cpython-38.pyc b/exps/example/custom/voc_format/yolox_voc_nano/__pycache__/yolox_voc_nano.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0b3dc19292760e1ae41ca89f792124e0826856ce GIT binary patch literal 3838 zcmai1O^h5z74HA(?)lqYI{^oX%}g{Xwy078sJQH>QIj3oVqsrmOLgf-lHiBRp(_^huzV(i->PBLAOucNgI+kj) z6Q@&|wz(aby8~~>XU49s+RV6`)H-#gTX7?4cAC0&Rbw`D9%#(r^)0hgWBN_4Re6L} zwdGBB@wtUN*I&7QyQNF>%K8Ye**?#MjbS`o4_KIoED}8$d-yS@m243T*T~+E@s_rt zGo2X^+>Xg=%wqNf9ah(w!z$2h)?hC4pgF9`d{%{4c}ugjjthHcgn_Ag`k_@ww^3-Z zY#Acga=p;Dj2&ZF!xo_jCNm%{We9}}BQBJ2(hY(rjq)In)pVQ$y*SJ= zo=KbWQNAke`w_#-7-rHMh18IZ;wYD{;9;y9@cl{@bA0de^-++9370jVhF!EJ(fXJU z-^zHr@=%vvP|hC&54C@{7gmP}Ul@$zC@j6TfC(~)Fd4;sp>)>0VGnbr!(-L?(T1pC zH?t^N4MNi!x}$zy(>H7I`TX9#mU`rby~l&%pj<}D9zZDPY7>J_d!VgV1S2QMdSXn> z#GF`(HL>9rxSP3sbK(>h#@Q%N;V_T++t#E~P`z@b%`MELQn*{>Nf$ltj#nr^lRofw zs;o-zrW(9fTRs93GMFOiCK>q+h(g=ZcC|(AjW14gz(voEfw^S}7jV%56a&(}>nZ@4 z%l%It{_79dx;NVW_n#&dyxVR$vXTthIEG{WD2vi857Qo(Mw-g%_rtumx;zxgc_vLh z%A}jXFKIqbT87lGNj(uyz=Tpf2~BzmFyx;vkby3Qrq}Ru^vyGL^b~7YkH_;D;$bh0 zvlr&~KasZTQCMu5(pGUQJ$l`Eap}f#sos8xhwKpoLW?w$;tPw{&n*~yHz7Vv#3Xat|1OrXn6vhr**t++>N7@heDjDZ;#FV3{B;B!`@nVm~zpg zA;(W}|AbDVH#-TT8LsW?Cw1RA30X600Q6@4$Z7Tn6MIh+3ISgM`C}*qHC=HY*$_&aZt8M!eAKz%fTw#c}fzT5>CwH}{w25B0`4bp3w*_h&JSpBInHsRR zzHLmZg;P`uozZMnT9Gxk?Fp^%IX1&)-!UfjqE^(|+yxEcgjrVfqW%kg(kL1Tn-j|= zj2u~e{rcrap7UNF4O7{;CBi6$NV63HTh818Ru#@~DsBE^V=y)Z6Y&Yc(Eg%2K@eg99t9$oZ5YWIJ4 z@3+5tYT+;K{x2{8JpTLRKey5Or4D=kqw62G`x}qGe$M^P2kricAE>AO=sZ40Jb0qt z4H6E^rJ3bS`boGR#9O^8;Fvy&x4HT2-qm;2N`fGD2V%U4MPP2K^ zNz_inB~-X#PKafn44aD~JYiK`@DB0<~2 zQD6xiptM*7(;F0%urOxjuFROH;$<2*qfY(M23k!;1mcI(+lVskH_JuT9p{`$Cmjai zxc`4_zeQ&JfW!@&WR{kQ6&{1N`=RJ(vZfFgl#Ry;q)Emqv&#N&Vv?)_q3LHlL*0dz zZj`@ItxudP=@x$bZ8p`WzmJ_+$FM*;hVE*O&m3zyiuxZrZcTUfhOv2KkBknjdxVmR zO^HkuCA|&hC?zQhG6&WUa+*B~>Jus8{-k7%RkjW0;y!giA#Z|0KvpG%a&J9tSe=kV;aP{sGk=I2O%p2S6nB?Lz$TOF^g#Qavv6qM14>m-zK z2_UUG@hUYaEWb(GK5oU2sH$M=(LxG%;x%e~9YWgqQ1p;(KLD<{LrYX>U2xlhdC~`~(NYMFau+|Qfzqp=w8(5VNhV@^Al68P^l9l|`d$pFk>aFPIpz(u zaQWpxeY8K{@%XP({O|Yv41>!g0x>+JVtg9ekDbcq>|WgNk8kO~?cSe9855=L$e8?q z@;2m&lsyH(-EkO~Ju2B2#4_Ry5^7@^*;t?E^gSNOL2$^<6Eu$UMfHEi43ta}gkE#~ z6aGd2v}(uyNE8mz!EMBb3GP*b{Ct`U%8W#p`ew_vL% 0: + lr = self.warmup_lr + else: + lr = self.basic_lr_per_img * batch_size + + pg0, pg1, pg2 = [], [], [] # optimizer parameter groups + + for k, v in self.model.named_modules(): + if hasattr(v, "bias") and isinstance(v.bias, nn.Parameter): + pg2.append(v.bias) # biases + if isinstance(v, nn.BatchNorm2d) or "bn" in k: + pg0.append(v.weight) # no decay + elif hasattr(v, "weight") and isinstance(v.weight, nn.Parameter): + pg1.append(v.weight) # apply decay + + optimizer = torch.optim.Adam( + pg0, lr=lr, eps=self.eps, amsgrad=False + ) + optimizer.add_param_group( + {"params": pg1, "weight_decay": self.weight_decay} + ) # add pg1 
with weight_decay + optimizer.add_param_group({"params": pg2}) + self.optimizer = optimizer + + return self.optimizer + + def get_data_loader(self, batch_size, is_distributed, no_aug=False): + from yolox.data import ( + VOCDetection, + TrainTransform, + YoloBatchSampler, + DataLoader, + InfiniteSampler, + MosaicDetection, + ) + + dataset = VOCDetection( + data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"), + image_sets=[('2007', 'trainval'), ('2012', 'trainval')], + img_size=self.input_size, + preproc=TrainTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + max_labels=50, + ), + ) + + dataset = MosaicDetection( + dataset, + mosaic=not no_aug, + img_size=self.input_size, + preproc=TrainTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + max_labels=120, + ), + degrees=self.degrees, + translate=self.translate, + scale=self.scale, + shear=self.shear, + perspective=self.perspective, + enable_mixup=self.enable_mixup, + ) + + self.dataset = dataset + + if is_distributed: + batch_size = batch_size // dist.get_world_size() + + sampler = InfiniteSampler( + len(self.dataset), seed=self.seed if self.seed else 0 + ) + + batch_sampler = YoloBatchSampler( + sampler=sampler, + batch_size=batch_size, + drop_last=False, + input_dimension=self.input_size, + mosaic=not no_aug, + ) + + dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} + dataloader_kwargs["batch_sampler"] = batch_sampler + train_loader = DataLoader(self.dataset, **dataloader_kwargs) + + return train_loader + + def get_eval_loader(self, batch_size, is_distributed, testdev=False): + from yolox.data import VOCDetection, ValTransform + + valdataset = VOCDetection( + data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"), + image_sets=[('2007', 'test')], + img_size=self.test_size, + preproc=ValTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + ), + ) + + if is_distributed: + batch_size = batch_size // dist.get_world_size() + sampler = torch.utils.data.distributed.DistributedSampler( + valdataset, shuffle=False + ) + else: + sampler = torch.utils.data.SequentialSampler(valdataset) + + dataloader_kwargs = { + "num_workers": self.data_num_workers, + "pin_memory": True, + "sampler": sampler, + } + dataloader_kwargs["batch_size"] = batch_size + val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs) + + return val_loader + + def get_evaluator(self, batch_size, is_distributed, testdev=False): + from yolox.evaluators import VOCEvaluator + + val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) + evaluator = VOCEvaluator( + dataloader=val_loader, + img_size=self.test_size, + confthre=self.test_conf, + nmsthre=self.nmsthre, + num_classes=self.num_classes, + ) + return evaluator diff --git a/exps/example/custom/voc_format/yolox_voc_s/__pycache__/yolox_voc_s.cpython-38.pyc b/exps/example/custom/voc_format/yolox_voc_s/__pycache__/yolox_voc_s.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cea337ed06e250e2ca884a323327deb00f8e4824 GIT binary patch literal 3045 zcmai0ON<;x8Sd(L_w>v@oRlaGN_14DN{VzBLFqyJ_^@16VMw_nER@*_q(AG!k`ux6I0& zm?w3@Q#qs=Cm(CxsD6NoFvz2U`Hr#4IOFC6e_-)YTVey7ceu;F2W;T*i2FQ1&3(tP zje!T-U1>tWke$#?bq9q^R6l{JjhZ#aeRI#;H?SSlV~d-RHn$+1+8uj)zLr59?z4Ty z?bF7d&mGj<(+1{rxB+9{^@FFNZ*=yTAHVj87yi69LIcm=*AB?0Vk?r~{YW|0bRwj7 zP5E)07HJ*Fs#8q!c$g(sB`W1`F{!tdb0_6^nPsKyNkSdfBui`MOOa%{1M4@_Oklk) zb|!I=ZJ2Z-RgwjRnp)8;*qAc-fH^izCF)(Hl1|uZvm?>$EIX@0Lu`0-~rkOPQ<1H<1-TO6uX(O&tiS!gXZG 
z)iU8is)bjJO~Q&e99+0oR!KTM))zoK7u&bfx^F7`!nt$bQ+`b*wv(*wIe$T(MGPKV zqwp`kom>h(S{wao`;Wglv--ES(XTJRoBi|AU)M1B{D8ms(Txw+Mt2Xsdfxxt2Wz7b zKhV$G!38YuTPl#FjW`$ZTv=7kRhTC`ah7a|ta@lGpAtJpW~(sGlaYukQCG@O^HE%- z_k{8$QcPqyJlG*^fpT-b6Zv&CKsaXGnaG#OFy+9!Sa` zWsl3WkiF{E_QN%x69G-YH_GC+t4PO~hhEX_}kfL*;R9(p+SX?LB-0 z4!LMV?BIckG#Blf+y@W2;F7n%C191qXZgBvLo7%&%^V+DPO=#*|k7q zzlf^*QrrDys(*#V*C4JSIlO9Xy1AK&TYXD@p9bhF+s2jv(qEPrsY7G=RjM7~R$ii} zhHXHGH1On0)cG=ma_UkJ@s*m^X@*Z0#anewVr_Z-74(R_mFr}}BfB8EWA{`DdLvUvjdmATXd?CgFEEPZ@z*fFqKq@xA|Tb%Fw>Cw51<$V zXe?x=VSeV@JxfFKQ`hm?@(GBY(DoR_JSe52Ax7tX3^BmV7@G*Z*;w4#wPsFZHxByf z^JSw@-Q_kuG;ZSp1kM};On*jRf&bG42i|A$CKQA0!sDdWRI2=8S!~u@5{z5q6;)(F zQ$4lVk`}qGl~nR)7?5v}xC#N@rTEf%NohJ?_e3KvbE_9NH3w{)1dNAvp(xYd83ZcKE`!Hp-3x`0Z(uwIlkTBI$D1 zZs;x4RCj(C8j387;}ia!qIpD8`bI`ruZR?U7J1>N@T~5|O_Bzbi;od6b9s##zDD~> zQ798-scoG=yZk9;>FiL!{J3`}THV08b{Cs$^NiBzggeKa>0fNO_d~kGRy_zG#0I}` IXeicy0sid~;{X5v literal 0 HcmV?d00001 diff --git a/exps/example/custom/voc_format/yolox_voc_s/yolox_voc_s.py b/exps/example/custom/voc_format/yolox_voc_s/yolox_voc_s.py new file mode 100644 index 000000000..5f108b42b --- /dev/null +++ b/exps/example/custom/voc_format/yolox_voc_s/yolox_voc_s.py @@ -0,0 +1,123 @@ +# encoding: utf-8 +import os +import random +import torch +import torch.nn as nn +import torch.distributed as dist + +from yolox.exp import Exp as MyExp +from yolox.data import get_yolox_datadir + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.num_classes = 20 + self.depth = 0.33 + self.width = 0.50 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + + def get_data_loader(self, batch_size, is_distributed, no_aug=False): + from yolox.data import ( + VOCDetection, + TrainTransform, + YoloBatchSampler, + DataLoader, + InfiniteSampler, + MosaicDetection, + ) + + dataset = VOCDetection( + data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"), + image_sets=[('2007', 'trainval'), ('2012', 'trainval')], + img_size=self.input_size, + preproc=TrainTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + max_labels=50, + ), + ) + + dataset = MosaicDetection( + dataset, + mosaic=not no_aug, + img_size=self.input_size, + preproc=TrainTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + max_labels=120, + ), + degrees=self.degrees, + translate=self.translate, + scale=self.scale, + shear=self.shear, + perspective=self.perspective, + enable_mixup=self.enable_mixup, + ) + + self.dataset = dataset + + if is_distributed: + batch_size = batch_size // dist.get_world_size() + + sampler = InfiniteSampler( + len(self.dataset), seed=self.seed if self.seed else 0 + ) + + batch_sampler = YoloBatchSampler( + sampler=sampler, + batch_size=batch_size, + drop_last=False, + input_dimension=self.input_size, + mosaic=not no_aug, + ) + + dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} + dataloader_kwargs["batch_sampler"] = batch_sampler + train_loader = DataLoader(self.dataset, **dataloader_kwargs) + + return train_loader + + def get_eval_loader(self, batch_size, is_distributed, testdev=False): + from yolox.data import VOCDetection, ValTransform + + valdataset = VOCDetection( + data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"), + image_sets=[('2007', 'test')], + img_size=self.test_size, + preproc=ValTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + ), + ) + + if is_distributed: + batch_size = batch_size // dist.get_world_size() + sampler = 
torch.utils.data.distributed.DistributedSampler( + valdataset, shuffle=False + ) + else: + sampler = torch.utils.data.SequentialSampler(valdataset) + + dataloader_kwargs = { + "num_workers": self.data_num_workers, + "pin_memory": True, + "sampler": sampler, + } + dataloader_kwargs["batch_size"] = batch_size + val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs) + + return val_loader + + def get_evaluator(self, batch_size, is_distributed, testdev=False): + from yolox.evaluators import VOCEvaluator + + val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) + evaluator = VOCEvaluator( + dataloader=val_loader, + img_size=self.test_size, + confthre=self.test_conf, + nmsthre=self.nmsthre, + num_classes=self.num_classes, + ) + return evaluator diff --git a/exps/example/custom/voc_format/yolox_voc_tiny/yolox_voc_tiny.py b/exps/example/custom/voc_format/yolox_voc_tiny/yolox_voc_tiny.py new file mode 100644 index 000000000..499b2a59a --- /dev/null +++ b/exps/example/custom/voc_format/yolox_voc_tiny/yolox_voc_tiny.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +import os +import random +import torch.nn as nn +import torch +import torch.distributed as dist +import sys +sys.path.append(r'D:/YOLOX') +from yolox.exp import Exp as MyExp +from yolox.data import get_yolox_datadir + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.depth = 0.33 + self.width = 0.375 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + + def get_model(self, sublinear=False): + + def init_yolo(M): + for m in M.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eps = 1e-3 + m.momentum = 0.03 + + if "model" not in self.__dict__: + from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead + in_channels = [256, 512, 1024] + # NANO model use depthwise = True, which is main difference. 
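The `depthwise=True` flag passed to `YOLOPAFPN` and `YOLOXHead` just below swaps the standard convolutions for depthwise-separable ones, which is what keeps the nano and tiny variants small. A minimal sketch of the idea in plain PyTorch; the `DepthwiseSeparableConv` class and the channel sizes here are illustrative and are not YOLOX's own `DWConv` implementation:

```python
import torch
import torch.nn as nn


class DepthwiseSeparableConv(nn.Module):
    """Illustrative depthwise-separable conv: a per-channel (grouped) 3x3 conv
    followed by a 1x1 pointwise conv. Parameter count is roughly
    k*k*C_in + C_in*C_out instead of k*k*C_in*C_out for a dense conv."""

    def __init__(self, in_ch, out_ch, ksize=3, stride=1):
        super().__init__()
        self.depthwise = nn.Conv2d(
            in_ch, in_ch, ksize, stride, padding=ksize // 2, groups=in_ch, bias=False
        )
        self.pointwise = nn.Conv2d(in_ch, out_ch, 1, bias=False)
        self.bn = nn.BatchNorm2d(out_ch)
        self.act = nn.SiLU()

    def forward(self, x):
        return self.act(self.bn(self.pointwise(self.depthwise(x))))


if __name__ == "__main__":
    x = torch.randn(1, 64, 52, 52)
    print(DepthwiseSeparableConv(64, 128)(x).shape)  # torch.Size([1, 128, 52, 52])
```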
+ backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels, depthwise=True) + head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels, depthwise=True) + self.model = YOLOX(backbone, head) + + self.model.apply(init_yolo) + self.model.head.initialize_biases(1e-2) + return self.model + + def get_data_loader(self, batch_size, is_distributed, no_aug=False): + from yolox.data import ( + VOCDetection, + TrainTransform, + YoloBatchSampler, + DataLoader, + InfiniteSampler, + MosaicDetection, + ) + + dataset = VOCDetection( + data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"), + image_sets=[('2007', 'trainval'), ('2012', 'trainval')], + img_size=self.input_size, + preproc=TrainTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + max_labels=50, + ), + ) + + dataset = MosaicDetection( + dataset, + mosaic=not no_aug, + img_size=self.input_size, + preproc=TrainTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + max_labels=120, + ), + degrees=self.degrees, + translate=self.translate, + scale=self.scale, + shear=self.shear, + perspective=self.perspective, + enable_mixup=self.enable_mixup, + ) + + self.dataset = dataset + + if is_distributed: + batch_size = batch_size // dist.get_world_size() + + sampler = InfiniteSampler( + len(self.dataset), seed=self.seed if self.seed else 0 + ) + + batch_sampler = YoloBatchSampler( + sampler=sampler, + batch_size=batch_size, + drop_last=False, + input_dimension=self.input_size, + mosaic=not no_aug, + ) + + dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} + dataloader_kwargs["batch_sampler"] = batch_sampler + train_loader = DataLoader(self.dataset, **dataloader_kwargs) + + return train_loader + + def get_eval_loader(self, batch_size, is_distributed, testdev=False): + from yolox.data import VOCDetection, ValTransform + + valdataset = VOCDetection( + data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"), + image_sets=[('2007', 'test')], + img_size=self.test_size, + preproc=ValTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + ), + ) + + if is_distributed: + batch_size = batch_size // dist.get_world_size() + sampler = torch.utils.data.distributed.DistributedSampler( + valdataset, shuffle=False + ) + else: + sampler = torch.utils.data.SequentialSampler(valdataset) + + dataloader_kwargs = { + "num_workers": self.data_num_workers, + "pin_memory": True, + "sampler": sampler, + } + dataloader_kwargs["batch_size"] = batch_size + val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs) + + return val_loader + + def get_evaluator(self, batch_size, is_distributed, testdev=False): + from yolox.evaluators import VOCEvaluator + + val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) + evaluator = VOCEvaluator( + dataloader=val_loader, + img_size=self.test_size, + confthre=self.test_conf, + nmsthre=self.nmsthre, + num_classes=self.num_classes, + ) + return evaluator From a0499dfb8c04a0f2a280aeca6f0a4450cb856c6b Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sat, 9 Jul 2022 22:25:48 +0800 Subject: [PATCH 10/59] Add files via upload --- exps/example/custom/coco_format/nano.py | 48 ++++++++++++++++++++++ exps/example/custom/coco_format/yolox_s.py | 25 +++++++++++ 2 files changed, 73 insertions(+) create mode 100644 exps/example/custom/coco_format/nano.py create mode 100644 exps/example/custom/coco_format/yolox_s.py diff --git a/exps/example/custom/coco_format/nano.py 
b/exps/example/custom/coco_format/nano.py new file mode 100644 index 000000000..fb10626db --- /dev/null +++ b/exps/example/custom/coco_format/nano.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. + +import os + +import torch.nn as nn + +from yolox.exp import Exp as MyExp + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.depth = 0.33 + self.width = 0.25 + self.input_size = (416, 416) + self.mosaic_scale = (0.5, 1.5) + self.random_size = (10, 20) + self.test_size = (416, 416) + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + self.enable_mixup = False + + # Define yourself dataset path + self.data_dir = "datasets/coco128" + self.train_ann = "instances_train2017.json" + self.val_ann = "instances_val2017.json" + + self.num_classes = 71 + + def get_model(self, sublinear=False): + + def init_yolo(M): + for m in M.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eps = 1e-3 + m.momentum = 0.03 + if "model" not in self.__dict__: + from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead + in_channels = [256, 512, 1024] + # NANO model use depthwise = True, which is main difference. + backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels, depthwise=True) + head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels, depthwise=True) + self.model = YOLOX(backbone, head) + + self.model.apply(init_yolo) + self.model.head.initialize_biases(1e-2) + return self.model diff --git a/exps/example/custom/coco_format/yolox_s.py b/exps/example/custom/coco_format/yolox_s.py new file mode 100644 index 000000000..2f0b0a5f7 --- /dev/null +++ b/exps/example/custom/coco_format/yolox_s.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. +import os + +from yolox.exp import Exp as MyExp + + +class Exp(MyExp): + def __init__(self): + super(Exp, self).__init__() + self.depth = 0.33 + self.width = 0.50 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + + # Define yourself dataset path + self.data_dir = "datasets/coco128" + self.train_ann = "instances_train2017.json" + self.val_ann = "instances_val2017.json" + + self.num_classes = 71 + + self.max_epoch = 300 + self.data_num_workers = 4 + self.eval_interval = 1 From fae9df388617672602c4fc8ee9fdcd2eb2f03c64 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sat, 9 Jul 2022 22:26:28 +0800 Subject: [PATCH 11/59] Delete yolox_nano.py --- exps/example/custom/yolox_nano.py | 48 ------------------------------- 1 file changed, 48 deletions(-) delete mode 100644 exps/example/custom/yolox_nano.py diff --git a/exps/example/custom/yolox_nano.py b/exps/example/custom/yolox_nano.py deleted file mode 100644 index fb10626db..000000000 --- a/exps/example/custom/yolox_nano.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding:utf-8 -*- -# Copyright (c) Megvii, Inc. and its affiliates. 
- -import os - -import torch.nn as nn - -from yolox.exp import Exp as MyExp - - -class Exp(MyExp): - def __init__(self): - super(Exp, self).__init__() - self.depth = 0.33 - self.width = 0.25 - self.input_size = (416, 416) - self.mosaic_scale = (0.5, 1.5) - self.random_size = (10, 20) - self.test_size = (416, 416) - self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] - self.enable_mixup = False - - # Define yourself dataset path - self.data_dir = "datasets/coco128" - self.train_ann = "instances_train2017.json" - self.val_ann = "instances_val2017.json" - - self.num_classes = 71 - - def get_model(self, sublinear=False): - - def init_yolo(M): - for m in M.modules(): - if isinstance(m, nn.BatchNorm2d): - m.eps = 1e-3 - m.momentum = 0.03 - if "model" not in self.__dict__: - from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead - in_channels = [256, 512, 1024] - # NANO model use depthwise = True, which is main difference. - backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels, depthwise=True) - head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels, depthwise=True) - self.model = YOLOX(backbone, head) - - self.model.apply(init_yolo) - self.model.head.initialize_biases(1e-2) - return self.model From 1af2ef84888c9b293eb3576298cda7f060d11187 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sat, 9 Jul 2022 22:26:39 +0800 Subject: [PATCH 12/59] Delete yolox_s.py --- exps/example/custom/yolox_s.py | 25 ------------------------- 1 file changed, 25 deletions(-) delete mode 100644 exps/example/custom/yolox_s.py diff --git a/exps/example/custom/yolox_s.py b/exps/example/custom/yolox_s.py deleted file mode 100644 index 2f0b0a5f7..000000000 --- a/exps/example/custom/yolox_s.py +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding:utf-8 -*- -# Copyright (c) Megvii, Inc. and its affiliates. 
-import os - -from yolox.exp import Exp as MyExp - - -class Exp(MyExp): - def __init__(self): - super(Exp, self).__init__() - self.depth = 0.33 - self.width = 0.50 - self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] - - # Define yourself dataset path - self.data_dir = "datasets/coco128" - self.train_ann = "instances_train2017.json" - self.val_ann = "instances_val2017.json" - - self.num_classes = 71 - - self.max_epoch = 300 - self.data_num_workers = 4 - self.eval_interval = 1 From 6ed55a457d47d7283a2502c8b5c9812c299c22e0 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sat, 9 Jul 2022 22:28:41 +0800 Subject: [PATCH 13/59] Rename nano.py to yolox_nano.py --- exps/example/custom/coco_format/{nano.py => yolox_nano.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename exps/example/custom/coco_format/{nano.py => yolox_nano.py} (100%) diff --git a/exps/example/custom/coco_format/nano.py b/exps/example/custom/coco_format/yolox_nano.py similarity index 100% rename from exps/example/custom/coco_format/nano.py rename to exps/example/custom/coco_format/yolox_nano.py From d162dc8f0ca41dc8cec8b6b86a450a0e85a37fbd Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sat, 9 Jul 2022 22:30:21 +0800 Subject: [PATCH 14/59] Update demo.py --- tools/demo.py | 96 +++++++++++++++++++++++---------------------------- 1 file changed, 43 insertions(+), 53 deletions(-) diff --git a/tools/demo.py b/tools/demo.py index b16598d5f..1e505a3aa 100644 --- a/tools/demo.py +++ b/tools/demo.py @@ -2,20 +2,21 @@ # -*- coding:utf-8 -*- # Copyright (c) Megvii, Inc. and its affiliates. -import argparse -import os -import time from loguru import logger import cv2 import torch -from yolox.data.data_augment import ValTransform -from yolox.data.datasets import COCO_CLASSES +from yolox.data.data_augment import preproc +from yolox.data.datasets import COCO_CLASSES, VOC_CLASSES from yolox.exp import get_exp from yolox.utils import fuse_model, get_model_info, postprocess, vis +import argparse +import os +import time + IMAGE_EXT = [".jpg", ".jpeg", ".webp", ".bmp", ".png"] @@ -43,7 +44,7 @@ def make_parser(): "--exp_file", default=None, type=str, - help="please input your experiment description file", + help="pls input your expriment description file", ) parser.add_argument("-c", "--ckpt", default=None, type=str, help="ckpt for eval") parser.add_argument( @@ -52,8 +53,8 @@ def make_parser(): type=str, help="device to run our model, can either be cpu or gpu", ) - parser.add_argument("--conf", default=0.3, type=float, help="test conf") - parser.add_argument("--nms", default=0.3, type=float, help="test nms threshold") + parser.add_argument("--conf", default=None, type=float, help="test conf") + parser.add_argument("--nms", default=None, type=float, help="test nms threshold") parser.add_argument("--tsize", default=None, type=int, help="test img size") parser.add_argument( "--fp16", @@ -62,13 +63,6 @@ def make_parser(): action="store_true", help="Adopting mix precision evaluating.", ) - parser.add_argument( - "--legacy", - dest="legacy", - default=False, - action="store_true", - help="To be compatible with older versions", - ) parser.add_argument( "--fuse", dest="fuse", @@ -102,12 +96,10 @@ def __init__( self, model, exp, - cls_names=COCO_CLASSES, + cls_names=VOC_CLASSES, trt_file=None, decoder=None, device="cpu", - fp16=False, - legacy=False, ): self.model = model self.cls_names = cls_names @@ -117,8 +109,6 @@ def __init__( 
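The `preproc` helper restored at the top of this diff resizes the frame onto a fixed-size canvas while keeping its aspect ratio, normalizes it with the ImageNet mean and std set a few lines below, and returns the resize ratio that `visual()` later uses to map boxes back onto the original image. The following is only a rough, self-contained approximation of that legacy routine (the real one lives in `yolox.data.data_augment`; the 114 padding value and the exact ordering are assumptions here):

```python
import cv2
import numpy as np


def letterbox_preproc(image, input_size, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
    """Keep-aspect-ratio resize onto a gray canvas, BGR->RGB, scale to [0, 1],
    normalize, HWC->CHW. Returns the network input and the resize ratio."""
    padded = np.full((input_size[0], input_size[1], 3), 114.0, dtype=np.float32)
    r = min(input_size[0] / image.shape[0], input_size[1] / image.shape[1])
    resized = cv2.resize(
        image, (int(image.shape[1] * r), int(image.shape[0] * r)),
        interpolation=cv2.INTER_LINEAR,
    ).astype(np.float32)
    padded[: resized.shape[0], : resized.shape[1]] = resized
    padded = padded[:, :, ::-1] / 255.0  # BGR -> RGB, scale to [0, 1]
    padded = (padded - np.array(mean)) / np.array(std)
    return np.ascontiguousarray(padded.transpose(2, 0, 1), dtype=np.float32), r


if __name__ == "__main__":
    dummy = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)
    chw, ratio = letterbox_preproc(dummy, (640, 640))
    print(chw.shape, ratio)  # (3, 640, 640) 1.0
```

Dividing the predicted boxes by the returned ratio, as `visual()` does with `img_info["ratio"]`, puts them back into original-image coordinates.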
self.nmsthre = exp.nmsthre self.test_size = exp.test_size self.device = device - self.fp16 = fp16 - self.preproc = ValTransform(legacy=legacy) if trt_file is not None: from torch2trt import TRTModule @@ -128,6 +118,8 @@ def __init__( x = torch.ones(1, 3, exp.test_size[0], exp.test_size[1]).cuda() self.model(x) self.model = model_trt + self.rgb_means = (0.485, 0.456, 0.406) + self.std = (0.229, 0.224, 0.225) def inference(self, img): img_info = {"id": 0} @@ -142,25 +134,23 @@ def inference(self, img): img_info["width"] = width img_info["raw_img"] = img - ratio = min(self.test_size[0] / img.shape[0], self.test_size[1] / img.shape[1]) + img, ratio = preproc(img, self.test_size, self.rgb_means, self.std) img_info["ratio"] = ratio - - img, _ = self.preproc(img, None, self.test_size) img = torch.from_numpy(img).unsqueeze(0) - img = img.float() if self.device == "gpu": img = img.cuda() - if self.fp16: - img = img.half() # to FP16 with torch.no_grad(): t0 = time.time() outputs = self.model(img) + #print(type(outputs)) # torch.Tensor + print(len(outputs)) # 1 + print(outputs.shape) # (1,8400,6) + print(outputs.tolist()) # print complete list. if self.decoder is not None: outputs = self.decoder(outputs, dtype=outputs.type()) outputs = postprocess( - outputs, self.num_classes, self.confthre, - self.nmsthre, class_agnostic=True + outputs, self.num_classes, self.confthre, self.nmsthre ) logger.info("Infer time: {:.4f}s".format(time.time() - t0)) return outputs, img_info @@ -169,6 +159,17 @@ def visual(self, output, img_info, cls_conf=0.35): ratio = img_info["ratio"] img = img_info["raw_img"] if output is None: + font = cv2.FONT_HERSHEY_SIMPLEX + class_count = {} + class_AP = {} + for i in self.cls_names: + class_count[i] = 0 + class_AP[i] = 0.0 + line = 0 + for k in class_count: + cv2.putText(img, str(k)+": "+str(class_count[k]), (15,25+line), font, 0.8, (0, 255, 255), thickness=2) + cv2.putText(img, "AP"+": "+'{:.1f}%'.format(class_AP[k]), (15,50+line), font, 0.8, (0, 255, 255), thickness=2) + line = line+50 return img output = output.cpu() @@ -179,7 +180,6 @@ def visual(self, output, img_info, cls_conf=0.35): cls = output[:, 6] scores = output[:, 4] * output[:, 5] - vis_res = vis(img, bboxes, scores, cls, cls_conf, self.cls_names) return vis_res @@ -211,19 +211,18 @@ def imageflow_demo(predictor, vis_folder, current_time, args): width = cap.get(cv2.CAP_PROP_FRAME_WIDTH) # float height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT) # float fps = cap.get(cv2.CAP_PROP_FPS) - if args.save_result: - save_folder = os.path.join( - vis_folder, time.strftime("%Y_%m_%d_%H_%M_%S", current_time) - ) - os.makedirs(save_folder, exist_ok=True) - if args.demo == "video": - save_path = os.path.join(save_folder, os.path.basename(args.path)) - else: - save_path = os.path.join(save_folder, "camera.mp4") - logger.info(f"video save_path is {save_path}") - vid_writer = cv2.VideoWriter( - save_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (int(width), int(height)) - ) + save_folder = os.path.join( + vis_folder, time.strftime("%Y_%m_%d_%H_%M_%S", current_time) + ) + os.makedirs(save_folder, exist_ok=True) + if args.demo == "video": + save_path = os.path.join(save_folder, args.path.split("/")[-1]) + else: + save_path = os.path.join(save_folder, "camera.mp4") + logger.info(f"video save_path is {save_path}") + vid_writer = cv2.VideoWriter( + save_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (int(width), int(height)) + ) while True: ret_val, frame = cap.read() if ret_val: @@ -231,9 +230,6 @@ def imageflow_demo(predictor, vis_folder, 
current_time, args): result_frame = predictor.visual(outputs[0], img_info, predictor.confthre) if args.save_result: vid_writer.write(result_frame) - else: - cv2.namedWindow("yolox", cv2.WINDOW_NORMAL) - cv2.imshow("yolox", result_frame) ch = cv2.waitKey(1) if ch == 27 or ch == ord("q") or ch == ord("Q"): break @@ -248,7 +244,6 @@ def main(exp, args): file_name = os.path.join(exp.output_dir, args.experiment_name) os.makedirs(file_name, exist_ok=True) - vis_folder = None if args.save_result: vis_folder = os.path.join(file_name, "vis_res") os.makedirs(vis_folder, exist_ok=True) @@ -270,13 +265,11 @@ def main(exp, args): if args.device == "gpu": model.cuda() - if args.fp16: - model.half() # to FP16 model.eval() if not args.trt: if args.ckpt is None: - ckpt_file = os.path.join(file_name, "best_ckpt.pth") + ckpt_file = os.path.join(file_name, "best_ckpt.pth.tar") else: ckpt_file = args.ckpt logger.info("loading checkpoint") @@ -302,10 +295,7 @@ def main(exp, args): trt_file = None decoder = None - predictor = Predictor( - model, exp, COCO_CLASSES, trt_file, decoder, - args.device, args.fp16, args.legacy, - ) + predictor = Predictor(model, exp, VOC_CLASSES, trt_file, decoder, args.device) current_time = time.localtime() if args.demo == "image": image_demo(predictor, vis_folder, args.path, current_time, args.save_result) From 0ae6ad6dc94e38ae11816efd524c9ca290500edd Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sat, 9 Jul 2022 22:34:06 +0800 Subject: [PATCH 15/59] Update __init__.py --- yolox/data/datasets/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/yolox/data/datasets/__init__.py b/yolox/data/datasets/__init__.py index dee2c9f48..6ea2be14f 100644 --- a/yolox/data/datasets/__init__.py +++ b/yolox/data/datasets/__init__.py @@ -6,4 +6,5 @@ from .coco_classes import COCO_CLASSES from .datasets_wrapper import ConcatDataset, Dataset, MixConcatDataset from .mosaicdetection import MosaicDetection +from .voc_classes import VOC_CLASSES from .voc import VOCDetection From 2d74703b9ec377d46ca4ea7c86f1b02eaaa1fe08 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sat, 9 Jul 2022 22:35:06 +0800 Subject: [PATCH 16/59] Update coco_classes.py --- yolox/data/datasets/coco_classes.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/yolox/data/datasets/coco_classes.py b/yolox/data/datasets/coco_classes.py index 17f5cbe6e..760945eaf 100644 --- a/yolox/data/datasets/coco_classes.py +++ b/yolox/data/datasets/coco_classes.py @@ -2,6 +2,10 @@ # -*- coding:utf-8 -*- # Copyright (c) Megvii, Inc. and its affiliates. +COCO_CLASSES = ( + "pedestrian", +) +""" COCO_CLASSES = ( "person", "bicycle", @@ -84,3 +88,4 @@ "hair drier", "toothbrush", ) +""" From 2f882f36024c1170a88be2549039cd11ac552504 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sat, 9 Jul 2022 22:36:17 +0800 Subject: [PATCH 17/59] Update coco.py --- yolox/data/datasets/coco.py | 153 +++++++----------------------------- 1 file changed, 27 insertions(+), 126 deletions(-) diff --git a/yolox/data/datasets/coco.py b/yolox/data/datasets/coco.py index 4fbdf8836..c3381724a 100644 --- a/yolox/data/datasets/coco.py +++ b/yolox/data/datasets/coco.py @@ -2,36 +2,16 @@ # -*- coding:utf-8 -*- # Copyright (c) Megvii, Inc. and its affiliates. 
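The changes below re-point `COCODataset` at the single-class pedestrian set declared by the coco_format exp files earlier in this series (`datasets/pedestrian_coco` with `train_annotations.json` and `valid_annotations.json`, images under `train/` and `valid/`). A quick sanity check of such an annotation file with `pycocotools`; the path mirrors those exp settings, but the script itself is a hypothetical add-on, not part of this patch:

```python
from pycocotools.coco import COCO

# Point this at a populated annotation file for the pedestrian data.
ann_file = "datasets/pedestrian_coco/annotations/train_annotations.json"

coco = COCO(ann_file)
print("images     :", len(coco.getImgIds()))
print("categories :", [c["name"] for c in coco.loadCats(coco.getCatIds())])
print("annotations:", len(coco.getAnnIds()))
```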
-import os -from loguru import logger - import cv2 import numpy as np from pycocotools.coco import COCO +import os + from ..dataloading import get_yolox_datadir from .datasets_wrapper import Dataset -def remove_useless_info(coco): - """ - Remove useless info in coco dataset. COCO object is modified inplace. - This function is mainly used for saving memory (save about 30% mem). - """ - if isinstance(coco, COCO): - dataset = coco.dataset - dataset.pop("info", None) - dataset.pop("licenses", None) - for img in dataset["images"]: - img.pop("license", None) - img.pop("coco_url", None) - img.pop("date_captured", None) - img.pop("flickr_url", None) - if "annotations" in coco.dataset: - for anno in coco.dataset["annotations"]: - anno.pop("segmentation", None) - - class COCODataset(Dataset): """ COCO dataset class. @@ -40,11 +20,10 @@ class COCODataset(Dataset): def __init__( self, data_dir=None, - json_file="instances_train2017.json", - name="train2017", + json_file="train_annotations.json", + name="train", img_size=(416, 416), preproc=None, - cache=False, ): """ COCO dataset initialization. Annotation data are read into memory by COCO API. @@ -57,82 +36,26 @@ def __init__( """ super().__init__(img_size) if data_dir is None: - data_dir = os.path.join(get_yolox_datadir(), "COCO") + data_dir = os.path.join(get_yolox_datadir(), "pedestrian_coco") self.data_dir = data_dir self.json_file = json_file self.coco = COCO(os.path.join(self.data_dir, "annotations", self.json_file)) - remove_useless_info(self.coco) self.ids = self.coco.getImgIds() self.class_ids = sorted(self.coco.getCatIds()) - self.cats = self.coco.loadCats(self.coco.getCatIds()) - self._classes = tuple([c["name"] for c in self.cats]) - self.imgs = None + cats = self.coco.loadCats(self.coco.getCatIds()) + self._classes = tuple([c["name"] for c in cats]) + self.annotations = self._load_coco_annotations() self.name = name self.img_size = img_size self.preproc = preproc - self.annotations = self._load_coco_annotations() - if cache: - self._cache_images() def __len__(self): return len(self.ids) - def __del__(self): - del self.imgs - def _load_coco_annotations(self): return [self.load_anno_from_ids(_ids) for _ids in self.ids] - def _cache_images(self): - logger.warning( - "\n********************************************************************************\n" - "You are using cached images in RAM to accelerate training.\n" - "This requires large system RAM.\n" - "Make sure you have 200G+ RAM and 136G available disk space for training COCO.\n" - "********************************************************************************\n" - ) - max_h = self.img_size[0] - max_w = self.img_size[1] - cache_file = os.path.join(self.data_dir, f"img_resized_cache_{self.name}.array") - if not os.path.exists(cache_file): - logger.info( - "Caching images for the first time. This might take about 20 minutes for COCO" - ) - self.imgs = np.memmap( - cache_file, - shape=(len(self.ids), max_h, max_w, 3), - dtype=np.uint8, - mode="w+", - ) - from tqdm import tqdm - from multiprocessing.pool import ThreadPool - - NUM_THREADs = min(8, os.cpu_count()) - loaded_images = ThreadPool(NUM_THREADs).imap( - lambda x: self.load_resized_img(x), - range(len(self.annotations)), - ) - pbar = tqdm(enumerate(loaded_images), total=len(self.annotations)) - for k, out in pbar: - self.imgs[k][: out.shape[0], : out.shape[1], :] = out.copy() - self.imgs.flush() - pbar.close() - else: - logger.warning( - "You are using cached imgs! 
Make sure your dataset is not changed!!\n" - "Everytime the self.input_size is changed in your exp file, you need to delete\n" - "the cached data and re-generate them.\n" - ) - - logger.info("Loading cached imgs...") - self.imgs = np.memmap( - cache_file, - shape=(len(self.ids), max_h, max_w, 3), - dtype=np.uint8, - mode="r+", - ) - def load_anno_from_ids(self, id_): im_ann = self.coco.loadImgs(id_)[0] width = im_ann["width"] @@ -143,8 +66,8 @@ def load_anno_from_ids(self, id_): for obj in annotations: x1 = np.max((0, obj["bbox"][0])) y1 = np.max((0, obj["bbox"][1])) - x2 = np.min((width, x1 + np.max((0, obj["bbox"][2])))) - y2 = np.min((height, y1 + np.max((0, obj["bbox"][3])))) + x2 = np.min((width - 1, x1 + np.max((0, obj["bbox"][2] - 1)))) + y2 = np.min((height - 1, y1 + np.max((0, obj["bbox"][3] - 1)))) if obj["area"] > 0 and x2 >= x1 and y2 >= y1: obj["clean_bbox"] = [x1, y1, x2, y2] objs.append(obj) @@ -158,56 +81,32 @@ def load_anno_from_ids(self, id_): res[ix, 0:4] = obj["clean_bbox"] res[ix, 4] = cls - r = min(self.img_size[0] / height, self.img_size[1] / width) - res[:, :4] *= r - img_info = (height, width) - resized_info = (int(height * r), int(width * r)) - file_name = ( - im_ann["file_name"] - if "file_name" in im_ann - else "{:012}".format(id_) + ".jpg" - ) + file_name = im_ann["file_name"] if "file_name" in im_ann else "{:012}".format(id_) + ".jpg" + + del im_ann, annotations - return (res, img_info, resized_info, file_name) + return (res, img_info, file_name) def load_anno(self, index): return self.annotations[index][0] - def load_resized_img(self, index): - img = self.load_image(index) - r = min(self.img_size[0] / img.shape[0], self.img_size[1] / img.shape[1]) - resized_img = cv2.resize( - img, - (int(img.shape[1] * r), int(img.shape[0] * r)), - interpolation=cv2.INTER_LINEAR, - ).astype(np.uint8) - return resized_img - - def load_image(self, index): - file_name = self.annotations[index][3] - - img_file = os.path.join(self.data_dir, self.name, file_name) - - img = cv2.imread(img_file) - assert img is not None, f"file named {img_file} not found" - - return img - def pull_item(self, index): id_ = self.ids[index] - res, img_info, resized_info, _ = self.annotations[index] - if self.imgs is not None: - pad_img = self.imgs[index] - img = pad_img[: resized_info[0], : resized_info[1], :].copy() - else: - img = self.load_resized_img(index) + res, img_info, file_name = self.annotations[index] + # load image and preprocess + img_file = os.path.join( + self.data_dir, self.name, file_name + ) + + img = cv2.imread(img_file) + assert img is not None - return img, res.copy(), img_info, np.array([id_]) + return img, res, img_info, np.array([id_]) - @Dataset.mosaic_getitem + @Dataset.resize_getitem def __getitem__(self, index): """ One image / label pair for the given index is picked up and pre-processed. @@ -223,8 +122,10 @@ def __getitem__(self, index): class (float): class index. xc, yc (float) : center of bbox whose values range from 0 to 1. w, h (float) : size of bbox whose values range from 0 to 1. - info_img : tuple of h, w. + info_img : tuple of h, w, nh, nw, dx, dy. h, w (int): original shape of the image + nh, nw (int): shape of the resized image without padding + dx, dy (int): pad size img_id (int): same as the input index. Used for evaluation. 
""" img, target, img_info, img_id = self.pull_item(index) From 8087d1028c371017660e4564be67530cd7dede90 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sat, 9 Jul 2022 22:36:58 +0800 Subject: [PATCH 18/59] Update voc_classes.py --- yolox/data/datasets/voc_classes.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/yolox/data/datasets/voc_classes.py b/yolox/data/datasets/voc_classes.py index 89354b3fd..438c5b78b 100644 --- a/yolox/data/datasets/voc_classes.py +++ b/yolox/data/datasets/voc_classes.py @@ -3,6 +3,11 @@ # Copyright (c) Megvii, Inc. and its affiliates. # VOC_CLASSES = ( '__background__', # always index 0 + +VOC_CLASSES = ( + "pedestrian", +) +""" VOC_CLASSES = ( "aeroplane", "bicycle", @@ -25,3 +30,4 @@ "train", "tvmonitor", ) +""" From 716a82f7dd0d8649305948c7f855b4166074a1e2 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sat, 9 Jul 2022 22:37:57 +0800 Subject: [PATCH 19/59] Update voc.py --- yolox/data/datasets/voc.py | 170 +++++++++---------------------------- 1 file changed, 42 insertions(+), 128 deletions(-) diff --git a/yolox/data/datasets/voc.py b/yolox/data/datasets/voc.py index 56675a297..465664aad 100644 --- a/yolox/data/datasets/voc.py +++ b/yolox/data/datasets/voc.py @@ -6,17 +6,16 @@ # Copyright (c) Ellis Brown, Max deGroot. # Copyright (c) Megvii, Inc. and its affiliates. -import os -import os.path -import pickle -import xml.etree.ElementTree as ET -from loguru import logger - import cv2 import numpy as np from yolox.evaluators.voc_eval import voc_eval +import os +import os.path +import pickle +import xml.etree.ElementTree as ET + from .datasets_wrapper import Dataset from .voc_classes import VOC_CLASSES @@ -36,9 +35,7 @@ class AnnotationTransform(object): """ def __init__(self, class_to_ind=None, keep_difficult=True): - self.class_to_ind = class_to_ind or dict( - zip(VOC_CLASSES, range(len(VOC_CLASSES))) - ) + self.class_to_ind = class_to_ind or dict(zip(VOC_CLASSES, range(len(VOC_CLASSES)))) self.keep_difficult = keep_difficult def __call__(self, target): @@ -51,20 +48,16 @@ def __call__(self, target): """ res = np.empty((0, 5)) for obj in target.iter("object"): - difficult = obj.find("difficult") - if difficult is not None: - difficult = int(difficult.text) == 1 - else: - difficult = False + difficult = int(obj.find("difficult").text) == 1 if not self.keep_difficult and difficult: continue - name = obj.find("name").text.strip() + name = obj.find("name").text.lower().strip() bbox = obj.find("bndbox") pts = ["xmin", "ymin", "xmax", "ymax"] bndbox = [] for i, pt in enumerate(pts): - cur_pt = int(float(bbox.find(pt).text)) - 1 + cur_pt = int(bbox.find(pt).text) - 1 # scale height or width # cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height bndbox.append(cur_pt) @@ -73,11 +66,7 @@ def __call__(self, target): res = np.vstack((res, bndbox)) # [xmin, ymin, xmax, ymax, label_ind] # img_id = target.find('filename').text[:-4] - width = int(target.find("size").find("width").text) - height = int(target.find("size").find("height").text) - img_info = (height, width) - - return res, img_info + return res # [[xmin, ymin, xmax, ymax, label_ind], ... 
] class VOCDetection(Dataset): @@ -102,12 +91,11 @@ class VOCDetection(Dataset): def __init__( self, data_dir, - image_sets=[("2007", "trainval"), ("2012", "trainval")], + image_sets=[('2007', 'trainval'), ('2012', 'trainval')], img_size=(416, 416), preproc=None, target_transform=AnnotationTransform(), dataset_name="VOC0712", - cache=False, ): super().__init__(img_size) self.root = data_dir @@ -120,6 +108,13 @@ def __init__( self._imgpath = os.path.join("%s", "JPEGImages", "%s.jpg") self._classes = VOC_CLASSES self.ids = list() + for name in image_sets: + rootpath = self.root + for line in open( + os.path.join(rootpath, "ImageSets", "Main", name + ".txt") + ): + self.ids.append((rootpath, line.strip())) + """ for (year, name) in image_sets: self._year = year rootpath = os.path.join(self.root, "VOC" + year) @@ -127,101 +122,18 @@ def __init__( os.path.join(rootpath, "ImageSets", "Main", name + ".txt") ): self.ids.append((rootpath, line.strip())) - - self.annotations = self._load_coco_annotations() - self.imgs = None - if cache: - self._cache_images() + """ def __len__(self): return len(self.ids) - def _load_coco_annotations(self): - return [self.load_anno_from_ids(_ids) for _ids in range(len(self.ids))] - - def _cache_images(self): - logger.warning( - "\n********************************************************************************\n" - "You are using cached images in RAM to accelerate training.\n" - "This requires large system RAM.\n" - "Make sure you have 60G+ RAM and 19G available disk space for training VOC.\n" - "********************************************************************************\n" - ) - max_h = self.img_size[0] - max_w = self.img_size[1] - cache_file = os.path.join(self.root, f"img_resized_cache_{self.name}.array") - if not os.path.exists(cache_file): - logger.info( - "Caching images for the first time. This might take about 3 minutes for VOC" - ) - self.imgs = np.memmap( - cache_file, - shape=(len(self.ids), max_h, max_w, 3), - dtype=np.uint8, - mode="w+", - ) - from tqdm import tqdm - from multiprocessing.pool import ThreadPool - - NUM_THREADs = min(8, os.cpu_count()) - loaded_images = ThreadPool(NUM_THREADs).imap( - lambda x: self.load_resized_img(x), - range(len(self.annotations)), - ) - pbar = tqdm(enumerate(loaded_images), total=len(self.annotations)) - for k, out in pbar: - self.imgs[k][: out.shape[0], : out.shape[1], :] = out.copy() - self.imgs.flush() - pbar.close() - else: - logger.warning( - "You are using cached imgs! 
Make sure your dataset is not changed!!\n" - "Everytime the self.input_size is changed in your exp file, you need to delete\n" - "the cached data and re-generate them.\n" - ) - - logger.info("Loading cached imgs...") - self.imgs = np.memmap( - cache_file, - shape=(len(self.ids), max_h, max_w, 3), - dtype=np.uint8, - mode="r+", - ) - - def load_anno_from_ids(self, index): - img_id = self.ids[index] - target = ET.parse(self._annopath % img_id).getroot() - - assert self.target_transform is not None - res, img_info = self.target_transform(target) - height, width = img_info - - r = min(self.img_size[0] / height, self.img_size[1] / width) - res[:, :4] *= r - resized_info = (int(height * r), int(width * r)) - - return (res, img_info, resized_info) - def load_anno(self, index): - return self.annotations[index][0] - - def load_resized_img(self, index): - img = self.load_image(index) - r = min(self.img_size[0] / img.shape[0], self.img_size[1] / img.shape[1]) - resized_img = cv2.resize( - img, - (int(img.shape[1] * r), int(img.shape[0] * r)), - interpolation=cv2.INTER_LINEAR, - ).astype(np.uint8) - - return resized_img - - def load_image(self, index): img_id = self.ids[index] - img = cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) - assert img is not None, f"file named {self._imgpath % img_id} not found" + target = ET.parse(self._annopath % img_id).getroot() + if self.target_transform is not None: + target = self.target_transform(target) - return img + return target def pull_item(self, index): """Returns the original image and target at an index for mixup @@ -234,17 +146,17 @@ def pull_item(self, index): Return: img, target """ - if self.imgs is not None: - target, img_info, resized_info = self.annotations[index] - pad_img = self.imgs[index] - img = pad_img[: resized_info[0], : resized_info[1], :].copy() - else: - img = self.load_resized_img(index) - target, img_info, _ = self.annotations[index] + img_id = self.ids[index] + img = cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) + height, width, _ = img.shape + + target = self.load_anno(index) + + img_info = (height, width) return img, target, img_info, index - @Dataset.mosaic_getitem + @Dataset.resize_getitem def __getitem__(self, index): img, target, img_info, img_id = self.pull_item(index) @@ -263,9 +175,7 @@ def evaluate_detections(self, all_boxes, output_dir=None): all_boxes[class][image] = [] or np.array of shape #dets x 5 """ self._write_voc_results_file(all_boxes) - IouTh = np.linspace( - 0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1, endpoint=True - ) + IouTh = np.linspace(0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1, endpoint=True) mAPs = [] for iou in IouTh: mAP = self._do_python_eval(output_dir, iou) @@ -279,7 +189,8 @@ def evaluate_detections(self, all_boxes, output_dir=None): def _get_voc_results_file_template(self): filename = "comp4_det_test" + "_{:s}.txt" - filedir = os.path.join(self.root, "results", "VOC" + self._year, "Main") + #filedir = os.path.join(self.root, "results", "VOC" + self._year, "Main") + filedir = os.path.join(self.root, "results") if not os.path.exists(filedir): os.makedirs(filedir) path = os.path.join(filedir, filename) @@ -311,18 +222,21 @@ def _write_voc_results_file(self, all_boxes): ) def _do_python_eval(self, output_dir="output", iou=0.5): - rootpath = os.path.join(self.root, "VOC" + self._year) - name = self.image_set[0][1] - annopath = os.path.join(rootpath, "Annotations", "{:s}.xml") + #rootpath = os.path.join(self.root, "VOC" + self._year) + rootpath = self.root + name = self.image_set[0] 
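With the flattened layout used here, `Annotations/`, `JPEGImages/`, and `ImageSets/Main/` sit directly under the dataset root rather than under per-year `VOC2007`/`VOC2012` folders, and each listed id maps to one Pascal VOC XML file. A minimal standalone reader in the spirit of the `AnnotationTransform` defined earlier in this file; the `read_voc_boxes` helper and the example path are hypothetical:

```python
import xml.etree.ElementTree as ET


def read_voc_boxes(xml_path, class_to_ind, keep_difficult=True):
    """Return [xmin, ymin, xmax, ymax, label_ind] per object from a Pascal VOC
    XML file, with coordinates shifted to 0-based as AnnotationTransform does."""
    boxes = []
    root = ET.parse(xml_path).getroot()
    for obj in root.iter("object"):
        difficult = obj.find("difficult")
        if not keep_difficult and difficult is not None and int(difficult.text) == 1:
            continue
        name = obj.find("name").text.lower().strip()
        bb = obj.find("bndbox")
        coords = [int(float(bb.find(k).text)) - 1 for k in ("xmin", "ymin", "xmax", "ymax")]
        boxes.append(coords + [class_to_ind[name]])
    return boxes


# e.g. read_voc_boxes("datasets/pedestrian_voc/Annotations/0001.xml", {"pedestrian": 0})
```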
+ annopath = os.path.join(rootpath, "Annotations", "{}.xml") imagesetfile = os.path.join(rootpath, "ImageSets", "Main", name + ".txt") cachedir = os.path.join( - self.root, "annotations_cache", "VOC" + self._year, name + #self.root, "annotations_cache", "VOC" + self._year, name + self.root, "annotations_cache" ) if not os.path.exists(cachedir): os.makedirs(cachedir) aps = [] # The PASCAL VOC metric changed in 2010 - use_07_metric = True if int(self._year) < 2010 else False + use_07_metric = True + #use_07_metric = True if int(self._year) < 2010 else False print("Eval IoU : {:.2f}".format(iou)) if output_dir is not None and not os.path.isdir(output_dir): os.mkdir(output_dir) From b274e8b84488444b3c552fcb0210afd6fe0268d1 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sun, 10 Jul 2022 11:49:09 +0800 Subject: [PATCH 20/59] Update yolox_base.py --- yolox/exp/yolox_base.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/yolox/exp/yolox_base.py b/yolox/exp/yolox_base.py index 6e52e6eac..5029f42e7 100644 --- a/yolox/exp/yolox_base.py +++ b/yolox/exp/yolox_base.py @@ -24,8 +24,9 @@ def __init__(self): # factor of model width self.width = 1.00 # activation name. For example, if using "relu", then "silu" will be replaced to "relu". + """ self.act = "silu" - + """ # ---------------- dataloader config ---------------- # # set worker to 4 for shorter dataloader init time # If your training process cost many memory, reduce this value. @@ -33,19 +34,25 @@ def __init__(self): self.input_size = (640, 640) # (height, width) # Actual multiscale ranges: [640 - 5 * 32, 640 + 5 * 32]. # To disable multiscale training, set the value to 0. + """ self.multiscale_range = 5 + """ # You can uncomment this line to specify a multiscale range # self.random_size = (14, 26) # dir of dataset images, if data_dir is None, this project will use `datasets` dir + """ self.data_dir = None + """ # name of annotation file for training self.train_ann = "instances_train2017.json" # name of annotation file for evaluation self.val_ann = "instances_val2017.json" # name of annotation file for testing + """ self.test_ann = "instances_test2017.json" - + """ # --------------- transform config ----------------- # + """ # prob of applying mosaic aug self.mosaic_prob = 1.0 # prob of applying mixup aug @@ -55,6 +62,7 @@ def __init__(self): # prob of applying flip aug self.flip_prob = 0.5 # rotation angle range, for example, if set to 2, the true range is (-2, 2) + """ self.degrees = 10.0 # translate range, for example, if set to 0.1, the true range is (-0.1, 0.1) self.translate = 0.1 @@ -78,7 +86,7 @@ def __init__(self): # name of LRScheduler self.scheduler = "yoloxwarmcos" # last #epoch to close augmention like mosaic - self.no_aug_epochs = 15 + self.no_aug_epochs = 0 # apply EMA during training self.ema = True From 2891d235617e34e4f45ce3f5b0798aabf657d08e Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sun, 10 Jul 2022 11:53:38 +0800 Subject: [PATCH 21/59] Update visualize.py --- yolox/utils/visualize.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/yolox/utils/visualize.py b/yolox/utils/visualize.py index e714a3ee7..b31741f3d 100644 --- a/yolox/utils/visualize.py +++ b/yolox/utils/visualize.py @@ -9,6 +9,11 @@ def vis(img, boxes, scores, cls_ids, conf=0.5, class_names=None): + class_count = {} + class_AP = {} + for j in class_names: + class_count[j] = 0 + class_AP[j] = 0 for i in 
range(len(boxes)): box = boxes[i] @@ -37,8 +42,18 @@ def vis(img, boxes, scores, cls_ids, conf=0.5, class_names=None): txt_bk_color, -1 ) + class_count[class_names[cls_id]] = class_count[class_names[cls_id]]+1 + class_AP[class_names[cls_id]] = class_AP[class_names[cls_id]]+float('{:.1f}'.format(score * 100)) cv2.putText(img, text, (x0, y0 + txt_size[1]), font, 0.4, txt_color, thickness=1) - + line = 0 + for k in class_count: + cv2.putText(img, str(k)+": "+str(class_count[k]), (15,25+line), font, 0.8, (0, 255, 255), thickness=2) + if class_count[k] !=0: + class_AP[k]=class_AP[k]/class_count[k] + else: + class_AP[k]=0.0 + cv2.putText(img, "AP"+": "+'{:.1f}%'.format(class_AP[k]), (15,50+line), font, 0.8, (0, 255, 255), thickness=2) + line = line+50 return img From 1d8f8dfc9ad85c679b6be828d86fedf0791c05fe Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sun, 10 Jul 2022 12:13:04 +0800 Subject: [PATCH 22/59] Add files via upload --- datasets/pedestrian_coco/annotations/train_annotations.json | 0 datasets/pedestrian_coco/annotations/valid_annotations.json | 0 datasets/pedestrian_voc/ImageSets/Main/train.txt | 0 datasets/pedestrian_voc/ImageSets/Main/valid.txt | 0 4 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 datasets/pedestrian_coco/annotations/train_annotations.json create mode 100644 datasets/pedestrian_coco/annotations/valid_annotations.json create mode 100644 datasets/pedestrian_voc/ImageSets/Main/train.txt create mode 100644 datasets/pedestrian_voc/ImageSets/Main/valid.txt diff --git a/datasets/pedestrian_coco/annotations/train_annotations.json b/datasets/pedestrian_coco/annotations/train_annotations.json new file mode 100644 index 000000000..e69de29bb diff --git a/datasets/pedestrian_coco/annotations/valid_annotations.json b/datasets/pedestrian_coco/annotations/valid_annotations.json new file mode 100644 index 000000000..e69de29bb diff --git a/datasets/pedestrian_voc/ImageSets/Main/train.txt b/datasets/pedestrian_voc/ImageSets/Main/train.txt new file mode 100644 index 000000000..e69de29bb diff --git a/datasets/pedestrian_voc/ImageSets/Main/valid.txt b/datasets/pedestrian_voc/ImageSets/Main/valid.txt new file mode 100644 index 000000000..e69de29bb From bf8e247b350e62d0bc2cc2faf62e2b04aaec762d Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sun, 10 Jul 2022 12:13:37 +0800 Subject: [PATCH 23/59] Delete datasets directory --- datasets/README.md | 24 ------------------- .../annotations/train_annotations.json | 0 .../annotations/valid_annotations.json | 0 .../pedestrian_voc/ImageSets/Main/train.txt | 0 .../pedestrian_voc/ImageSets/Main/valid.txt | 0 5 files changed, 24 deletions(-) delete mode 100644 datasets/README.md delete mode 100644 datasets/pedestrian_coco/annotations/train_annotations.json delete mode 100644 datasets/pedestrian_coco/annotations/valid_annotations.json delete mode 100644 datasets/pedestrian_voc/ImageSets/Main/train.txt delete mode 100644 datasets/pedestrian_voc/ImageSets/Main/valid.txt diff --git a/datasets/README.md b/datasets/README.md deleted file mode 100644 index 76f7c310c..000000000 --- a/datasets/README.md +++ /dev/null @@ -1,24 +0,0 @@ -# Prepare datasets - -If you have a dataset directory, you could use os environment variable named `YOLOX_DATADIR`. Under this directory, YOLOX will look for datasets in the structure described below, if needed. 
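The overlay added to `visualize.py` above counts detections per class and averages their confidences; the on-screen label "AP" is therefore a mean score, not a true average precision. A standalone re-expression of that bookkeeping (the `per_class_summary` helper is illustrative, not a function from the repository):

```python
from collections import defaultdict


def per_class_summary(cls_ids, scores, class_names):
    """Count detections per class and average their confidences.
    Note: this is mean confidence, not COCO/VOC average precision."""
    count = defaultdict(int)
    conf_sum = defaultdict(float)
    for cid, score in zip(cls_ids, scores):
        name = class_names[int(cid)]
        count[name] += 1
        conf_sum[name] += float(score)
    return {name: (count[name], conf_sum[name] / count[name]) for name in count}


print(per_class_summary([0, 0, 0], [0.91, 0.85, 0.78], ("pedestrian",)))
# {'pedestrian': (3, 0.8466666666666667)}
```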
-``` -$YOLOX_DATADIR/ - COCO/ -``` -You can set the location for builtin datasets by -```shell -export YOLOX_DATADIR=/path/to/your/datasets -``` -If `YOLOX_DATADIR` is not set, the default value of dataset directory is `./datasets` relative to your current working directory. - -## Expected dataset structure for [COCO detection](https://cocodataset.org/#download): - -``` -COCO/ - annotations/ - instances_{train,val}2017.json - {train,val}2017/ - # image files that are mentioned in the corresponding json -``` - -You can use the 2014 version of the dataset as well. diff --git a/datasets/pedestrian_coco/annotations/train_annotations.json b/datasets/pedestrian_coco/annotations/train_annotations.json deleted file mode 100644 index e69de29bb..000000000 diff --git a/datasets/pedestrian_coco/annotations/valid_annotations.json b/datasets/pedestrian_coco/annotations/valid_annotations.json deleted file mode 100644 index e69de29bb..000000000 diff --git a/datasets/pedestrian_voc/ImageSets/Main/train.txt b/datasets/pedestrian_voc/ImageSets/Main/train.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/datasets/pedestrian_voc/ImageSets/Main/valid.txt b/datasets/pedestrian_voc/ImageSets/Main/valid.txt deleted file mode 100644 index e69de29bb..000000000 From 83a313d0f240f3381c134d2dbf55a71961917999 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sun, 10 Jul 2022 12:19:34 +0800 Subject: [PATCH 24/59] Add files via upload --- datasets/README.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 datasets/README.md diff --git a/datasets/README.md b/datasets/README.md new file mode 100644 index 000000000..76f7c310c --- /dev/null +++ b/datasets/README.md @@ -0,0 +1,24 @@ +# Prepare datasets + +If you have a dataset directory, you could use os environment variable named `YOLOX_DATADIR`. Under this directory, YOLOX will look for datasets in the structure described below, if needed. +``` +$YOLOX_DATADIR/ + COCO/ +``` +You can set the location for builtin datasets by +```shell +export YOLOX_DATADIR=/path/to/your/datasets +``` +If `YOLOX_DATADIR` is not set, the default value of dataset directory is `./datasets` relative to your current working directory. + +## Expected dataset structure for [COCO detection](https://cocodataset.org/#download): + +``` +COCO/ + annotations/ + instances_{train,val}2017.json + {train,val}2017/ + # image files that are mentioned in the corresponding json +``` + +You can use the 2014 version of the dataset as well. 
From 2231daa7a3a5a9b684a9c0cea9f9fedba9d8299c Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sun, 10 Jul 2022 12:23:24 +0800 Subject: [PATCH 25/59] Add files via upload --- datasets/pedestrian_coco/annotations/train_annotations.json | 0 datasets/pedestrian_coco/annotations/valid_annotations.json | 0 datasets/pedestrian_coco/train/README.md | 1 + datasets/pedestrian_coco/valid/README.md | 1 + datasets/pedestrian_voc/Annotations/README.md | 1 + datasets/pedestrian_voc/ImageSets/Main/train.txt | 0 datasets/pedestrian_voc/ImageSets/Main/valid.txt | 0 datasets/pedestrian_voc/JPEGImages/README.md | 1 + 8 files changed, 4 insertions(+) create mode 100644 datasets/pedestrian_coco/annotations/train_annotations.json create mode 100644 datasets/pedestrian_coco/annotations/valid_annotations.json create mode 100644 datasets/pedestrian_coco/train/README.md create mode 100644 datasets/pedestrian_coco/valid/README.md create mode 100644 datasets/pedestrian_voc/Annotations/README.md create mode 100644 datasets/pedestrian_voc/ImageSets/Main/train.txt create mode 100644 datasets/pedestrian_voc/ImageSets/Main/valid.txt create mode 100644 datasets/pedestrian_voc/JPEGImages/README.md diff --git a/datasets/pedestrian_coco/annotations/train_annotations.json b/datasets/pedestrian_coco/annotations/train_annotations.json new file mode 100644 index 000000000..e69de29bb diff --git a/datasets/pedestrian_coco/annotations/valid_annotations.json b/datasets/pedestrian_coco/annotations/valid_annotations.json new file mode 100644 index 000000000..e69de29bb diff --git a/datasets/pedestrian_coco/train/README.md b/datasets/pedestrian_coco/train/README.md new file mode 100644 index 000000000..7b9800885 --- /dev/null +++ b/datasets/pedestrian_coco/train/README.md @@ -0,0 +1 @@ +put the train images \ No newline at end of file diff --git a/datasets/pedestrian_coco/valid/README.md b/datasets/pedestrian_coco/valid/README.md new file mode 100644 index 000000000..657f11b45 --- /dev/null +++ b/datasets/pedestrian_coco/valid/README.md @@ -0,0 +1 @@ +put the valid images \ No newline at end of file diff --git a/datasets/pedestrian_voc/Annotations/README.md b/datasets/pedestrian_voc/Annotations/README.md new file mode 100644 index 000000000..2b9cfa086 --- /dev/null +++ b/datasets/pedestrian_voc/Annotations/README.md @@ -0,0 +1 @@ +put the train and valid annotations \ No newline at end of file diff --git a/datasets/pedestrian_voc/ImageSets/Main/train.txt b/datasets/pedestrian_voc/ImageSets/Main/train.txt new file mode 100644 index 000000000..e69de29bb diff --git a/datasets/pedestrian_voc/ImageSets/Main/valid.txt b/datasets/pedestrian_voc/ImageSets/Main/valid.txt new file mode 100644 index 000000000..e69de29bb diff --git a/datasets/pedestrian_voc/JPEGImages/README.md b/datasets/pedestrian_voc/JPEGImages/README.md new file mode 100644 index 000000000..62ac03c99 --- /dev/null +++ b/datasets/pedestrian_voc/JPEGImages/README.md @@ -0,0 +1 @@ +put the train and valid images \ No newline at end of file From 44be20eb94838b09c1ed27b3fb16814fe8b7e11c Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sun, 10 Jul 2022 12:42:49 +0800 Subject: [PATCH 26/59] Update yolox_voc_s.py --- .../voc_format/yolox_voc_s/yolox_voc_s.py | 69 +++++++++++-------- 1 file changed, 42 insertions(+), 27 deletions(-) diff --git a/exps/example/custom/voc_format/yolox_voc_s/yolox_voc_s.py b/exps/example/custom/voc_format/yolox_voc_s/yolox_voc_s.py index 5f108b42b..e5cdb6103 100644 --- 
a/exps/example/custom/voc_format/yolox_voc_s/yolox_voc_s.py +++ b/exps/example/custom/voc_format/yolox_voc_s/yolox_voc_s.py @@ -1,12 +1,11 @@ # encoding: utf-8 import os -import random + import torch -import torch.nn as nn import torch.distributed as dist -from yolox.exp import Exp as MyExp from yolox.data import get_yolox_datadir +from yolox.exp import Exp as MyExp class Exp(MyExp): @@ -15,9 +14,17 @@ def __init__(self): self.num_classes = 20 self.depth = 0.33 self.width = 0.50 + self.warmup_epochs = 1 + + # ---------- transform config ------------ # + self.mosaic_prob = 1.0 + self.mixup_prob = 1.0 + self.hsv_prob = 1.0 + self.flip_prob = 0.5 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] - def get_data_loader(self, batch_size, is_distributed, no_aug=False): + def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img=False): from yolox.data import ( VOCDetection, TrainTransform, @@ -25,34 +32,42 @@ def get_data_loader(self, batch_size, is_distributed, no_aug=False): DataLoader, InfiniteSampler, MosaicDetection, + worker_init_reset_seed, ) - - dataset = VOCDetection( - data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"), - image_sets=[('2007', 'trainval'), ('2012', 'trainval')], - img_size=self.input_size, - preproc=TrainTransform( - rgb_means=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), - max_labels=50, - ), + from yolox.utils import ( + wait_for_the_master, + get_local_rank, ) + local_rank = get_local_rank() + + with wait_for_the_master(local_rank): + dataset = VOCDetection( + data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"), + image_sets=[('2007', 'trainval'), ('2012', 'trainval')], + img_size=self.input_size, + preproc=TrainTransform( + max_labels=50, + flip_prob=self.flip_prob, + hsv_prob=self.hsv_prob), + cache=cache_img, + ) dataset = MosaicDetection( dataset, mosaic=not no_aug, img_size=self.input_size, preproc=TrainTransform( - rgb_means=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), max_labels=120, - ), + flip_prob=self.flip_prob, + hsv_prob=self.hsv_prob), degrees=self.degrees, translate=self.translate, - scale=self.scale, + mosaic_scale=self.mosaic_scale, + mixup_scale=self.mixup_scale, shear=self.shear, - perspective=self.perspective, enable_mixup=self.enable_mixup, + mosaic_prob=self.mosaic_prob, + mixup_prob=self.mixup_prob, ) self.dataset = dataset @@ -68,27 +83,27 @@ def get_data_loader(self, batch_size, is_distributed, no_aug=False): sampler=sampler, batch_size=batch_size, drop_last=False, - input_dimension=self.input_size, mosaic=not no_aug, ) dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} dataloader_kwargs["batch_sampler"] = batch_sampler + + # Make sure each process has different random seed, especially for 'fork' method + dataloader_kwargs["worker_init_fn"] = worker_init_reset_seed + train_loader = DataLoader(self.dataset, **dataloader_kwargs) return train_loader - def get_eval_loader(self, batch_size, is_distributed, testdev=False): + def get_eval_loader(self, batch_size, is_distributed, testdev=False, legacy=False): from yolox.data import VOCDetection, ValTransform valdataset = VOCDetection( data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"), image_sets=[('2007', 'test')], img_size=self.test_size, - preproc=ValTransform( - rgb_means=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), - ), + preproc=ValTransform(legacy=legacy), ) if is_distributed: @@ -109,10 +124,10 @@ def get_eval_loader(self, batch_size, is_distributed, testdev=False): return val_loader - 
def get_evaluator(self, batch_size, is_distributed, testdev=False): + def get_evaluator(self, batch_size, is_distributed, testdev=False, legacy=False): from yolox.evaluators import VOCEvaluator - val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) + val_loader = self.get_eval_loader(batch_size, is_distributed, testdev, legacy) evaluator = VOCEvaluator( dataloader=val_loader, img_size=self.test_size, From 8fd95f668f0196d459194708c58fa9e9356759b0 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sun, 10 Jul 2022 13:09:57 +0800 Subject: [PATCH 27/59] Update yolox_nano.py From 1ea66fd02e8897032c85bd55126de2cb9399a9f0 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sun, 10 Jul 2022 13:10:43 +0800 Subject: [PATCH 28/59] Update yolox_s.py From 6823fda0529c21294a8a007c56361e0b252421d7 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sun, 10 Jul 2022 13:29:04 +0800 Subject: [PATCH 29/59] Update yolox_voc_nano.py --- .../yolox_voc_nano/yolox_voc_nano.py | 78 +++++++++++-------- 1 file changed, 47 insertions(+), 31 deletions(-) diff --git a/exps/example/custom/voc_format/yolox_voc_nano/yolox_voc_nano.py b/exps/example/custom/voc_format/yolox_voc_nano/yolox_voc_nano.py index 0bba25ffa..0410e9664 100644 --- a/exps/example/custom/voc_format/yolox_voc_nano/yolox_voc_nano.py +++ b/exps/example/custom/voc_format/yolox_voc_nano/yolox_voc_nano.py @@ -2,15 +2,13 @@ # -*- coding:utf-8 -*- # Copyright (c) Megvii, Inc. and its affiliates. -from yolox.data import get_yolox_datadir -from yolox.exp import Exp as MyExp import os -import random -import torch.nn as nn + import torch import torch.distributed as dist -import sys -sys.path.append(r'D:/YOLOX') + +from yolox.data import get_yolox_datadir +from yolox.exp import Exp as MyExp class Exp(MyExp): @@ -19,8 +17,18 @@ def __init__(self): self.num_classes = 20 self.depth = 0.33 self.width = 0.25 - self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + self.input_size = (416, 416) + self.mosaic_scale = (0.5, 1.5) + self.random_size = (10, 20) + self.test_size = (416, 416) + self.warmup_epochs = 1 + # ---------- transform config ------------ # + self.mosaic_prob = 1.0 self.enable_mixup = False + #self.mixup_prob = 1.0 + self.hsv_prob = 1.0 + self.flip_prob = 0.5 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] def get_model(self, sublinear=False): @@ -41,7 +49,7 @@ def init_yolo(M): self.model.head.initialize_biases(1e-2) return self.model - def get_data_loader(self, batch_size, is_distributed, no_aug=False): + def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img=False): from yolox.data import ( VOCDetection, TrainTransform, @@ -49,34 +57,42 @@ def get_data_loader(self, batch_size, is_distributed, no_aug=False): DataLoader, InfiniteSampler, MosaicDetection, + worker_init_reset_seed, ) - - dataset = VOCDetection( - data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"), - image_sets=[('2007', 'trainval'), ('2012', 'trainval')], - img_size=self.input_size, - preproc=TrainTransform( - rgb_means=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), - max_labels=50, - ), + from yolox.utils import ( + wait_for_the_master, + get_local_rank, ) + local_rank = get_local_rank() + + with wait_for_the_master(local_rank): + dataset = VOCDetection( + data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"), + image_sets=[('2007', 'trainval'), ('2012', 'trainval')], + 
img_size=self.input_size, + preproc=TrainTransform( + max_labels=50, + flip_prob=self.flip_prob, + hsv_prob=self.hsv_prob), + cache=cache_img, + ) dataset = MosaicDetection( dataset, mosaic=not no_aug, img_size=self.input_size, preproc=TrainTransform( - rgb_means=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), max_labels=120, - ), + flip_prob=self.flip_prob, + hsv_prob=self.hsv_prob), degrees=self.degrees, translate=self.translate, - scale=self.scale, + mosaic_scale=self.mosaic_scale, + mixup_scale=self.mixup_scale, shear=self.shear, - perspective=self.perspective, enable_mixup=self.enable_mixup, + mosaic_prob=self.mosaic_prob, + mixup_prob=self.mixup_prob, ) self.dataset = dataset @@ -92,27 +108,27 @@ def get_data_loader(self, batch_size, is_distributed, no_aug=False): sampler=sampler, batch_size=batch_size, drop_last=False, - input_dimension=self.input_size, mosaic=not no_aug, ) dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} dataloader_kwargs["batch_sampler"] = batch_sampler + + # Make sure each process has different random seed, especially for 'fork' method + dataloader_kwargs["worker_init_fn"] = worker_init_reset_seed + train_loader = DataLoader(self.dataset, **dataloader_kwargs) return train_loader - def get_eval_loader(self, batch_size, is_distributed, testdev=False): + def get_eval_loader(self, batch_size, is_distributed, testdev=False, legacy=False): from yolox.data import VOCDetection, ValTransform valdataset = VOCDetection( data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"), image_sets=[('2007', 'test')], img_size=self.test_size, - preproc=ValTransform( - rgb_means=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), - ), + preproc=ValTransform(legacy=legacy), ) if is_distributed: @@ -133,10 +149,10 @@ def get_eval_loader(self, batch_size, is_distributed, testdev=False): return val_loader - def get_evaluator(self, batch_size, is_distributed, testdev=False): + def get_evaluator(self, batch_size, is_distributed, testdev=False, legacy=False): from yolox.evaluators import VOCEvaluator - val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) + val_loader = self.get_eval_loader(batch_size, is_distributed, testdev, legacy) evaluator = VOCEvaluator( dataloader=val_loader, img_size=self.test_size, From feed57ddd8090cb228b7a3b9177f97449947bc82 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sun, 10 Jul 2022 13:38:39 +0800 Subject: [PATCH 30/59] Update yolox_voc_nano.py --- .../custom/voc_format/yolox_voc_nano/yolox_voc_nano.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/exps/example/custom/voc_format/yolox_voc_nano/yolox_voc_nano.py b/exps/example/custom/voc_format/yolox_voc_nano/yolox_voc_nano.py index 0410e9664..5e4fb127f 100644 --- a/exps/example/custom/voc_format/yolox_voc_nano/yolox_voc_nano.py +++ b/exps/example/custom/voc_format/yolox_voc_nano/yolox_voc_nano.py @@ -23,11 +23,11 @@ def __init__(self): self.test_size = (416, 416) self.warmup_epochs = 1 # ---------- transform config ------------ # - self.mosaic_prob = 1.0 + #self.mosaic_prob = 1.0 self.enable_mixup = False #self.mixup_prob = 1.0 - self.hsv_prob = 1.0 - self.flip_prob = 0.5 + #self.hsv_prob = 1.0 + #self.flip_prob = 0.5 self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] def get_model(self, sublinear=False): From ce9d0e163d4d9e37339b46a0e551e007afc33bca Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sun, 10 Jul 2022 13:40:09 
+0800 Subject: [PATCH 31/59] Update yolox_voc_tiny.py --- .../yolox_voc_tiny/yolox_voc_tiny.py | 79 +++++++++++-------- 1 file changed, 48 insertions(+), 31 deletions(-) diff --git a/exps/example/custom/voc_format/yolox_voc_tiny/yolox_voc_tiny.py b/exps/example/custom/voc_format/yolox_voc_tiny/yolox_voc_tiny.py index 499b2a59a..1ffee8079 100644 --- a/exps/example/custom/voc_format/yolox_voc_tiny/yolox_voc_tiny.py +++ b/exps/example/custom/voc_format/yolox_voc_tiny/yolox_voc_tiny.py @@ -3,21 +3,31 @@ # Copyright (c) Megvii, Inc. and its affiliates. import os -import random -import torch.nn as nn + import torch import torch.distributed as dist -import sys -sys.path.append(r'D:/YOLOX') -from yolox.exp import Exp as MyExp + from yolox.data import get_yolox_datadir +from yolox.exp import Exp as MyExp class Exp(MyExp): def __init__(self): super(Exp, self).__init__() + self.num_classes = 20 self.depth = 0.33 - self.width = 0.375 + self.width = 0.75 + self.input_size = (416, 416) + self.mosaic_scale = (0.5, 1.5) + self.random_size = (10, 20) + self.test_size = (416, 416) + self.warmup_epochs = 1 + # ---------- transform config ------------ # + #self.mosaic_prob = 1.0 + self.enable_mixup = False + #self.mixup_prob = 1.0 + #self.hsv_prob = 1.0 + #self.flip_prob = 0.5 self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] def get_model(self, sublinear=False): @@ -27,7 +37,6 @@ def init_yolo(M): if isinstance(m, nn.BatchNorm2d): m.eps = 1e-3 m.momentum = 0.03 - if "model" not in self.__dict__: from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead in_channels = [256, 512, 1024] @@ -40,7 +49,7 @@ def init_yolo(M): self.model.head.initialize_biases(1e-2) return self.model - def get_data_loader(self, batch_size, is_distributed, no_aug=False): + def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img=False): from yolox.data import ( VOCDetection, TrainTransform, @@ -48,34 +57,42 @@ def get_data_loader(self, batch_size, is_distributed, no_aug=False): DataLoader, InfiniteSampler, MosaicDetection, + worker_init_reset_seed, ) - - dataset = VOCDetection( - data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"), - image_sets=[('2007', 'trainval'), ('2012', 'trainval')], - img_size=self.input_size, - preproc=TrainTransform( - rgb_means=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), - max_labels=50, - ), + from yolox.utils import ( + wait_for_the_master, + get_local_rank, ) + local_rank = get_local_rank() + + with wait_for_the_master(local_rank): + dataset = VOCDetection( + data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"), + image_sets=[('2007', 'trainval'), ('2012', 'trainval')], + img_size=self.input_size, + preproc=TrainTransform( + max_labels=50, + flip_prob=self.flip_prob, + hsv_prob=self.hsv_prob), + cache=cache_img, + ) dataset = MosaicDetection( dataset, mosaic=not no_aug, img_size=self.input_size, preproc=TrainTransform( - rgb_means=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), max_labels=120, - ), + flip_prob=self.flip_prob, + hsv_prob=self.hsv_prob), degrees=self.degrees, translate=self.translate, - scale=self.scale, + mosaic_scale=self.mosaic_scale, + mixup_scale=self.mixup_scale, shear=self.shear, - perspective=self.perspective, enable_mixup=self.enable_mixup, + mosaic_prob=self.mosaic_prob, + mixup_prob=self.mixup_prob, ) self.dataset = dataset @@ -91,27 +108,27 @@ def get_data_loader(self, batch_size, is_distributed, no_aug=False): sampler=sampler, batch_size=batch_size, drop_last=False, - input_dimension=self.input_size, mosaic=not no_aug, 
) dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} dataloader_kwargs["batch_sampler"] = batch_sampler + + # Make sure each process has different random seed, especially for 'fork' method + dataloader_kwargs["worker_init_fn"] = worker_init_reset_seed + train_loader = DataLoader(self.dataset, **dataloader_kwargs) return train_loader - def get_eval_loader(self, batch_size, is_distributed, testdev=False): + def get_eval_loader(self, batch_size, is_distributed, testdev=False, legacy=False): from yolox.data import VOCDetection, ValTransform valdataset = VOCDetection( data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"), image_sets=[('2007', 'test')], img_size=self.test_size, - preproc=ValTransform( - rgb_means=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), - ), + preproc=ValTransform(legacy=legacy), ) if is_distributed: @@ -132,10 +149,10 @@ def get_eval_loader(self, batch_size, is_distributed, testdev=False): return val_loader - def get_evaluator(self, batch_size, is_distributed, testdev=False): + def get_evaluator(self, batch_size, is_distributed, testdev=False, legacy=False): from yolox.evaluators import VOCEvaluator - val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) + val_loader = self.get_eval_loader(batch_size, is_distributed, testdev, legacy) evaluator = VOCEvaluator( dataloader=val_loader, img_size=self.test_size, From 0eaf0b88bc6edce2f094201fd147ac21f7b818dc Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sun, 10 Jul 2022 13:48:26 +0800 Subject: [PATCH 32/59] Update yolox_voc_tiny.py --- .../yolox_voc_tiny/yolox_voc_tiny.py | 25 ++----------------- 1 file changed, 2 insertions(+), 23 deletions(-) diff --git a/exps/example/custom/voc_format/yolox_voc_tiny/yolox_voc_tiny.py b/exps/example/custom/voc_format/yolox_voc_tiny/yolox_voc_tiny.py index 1ffee8079..d8ffbdac9 100644 --- a/exps/example/custom/voc_format/yolox_voc_tiny/yolox_voc_tiny.py +++ b/exps/example/custom/voc_format/yolox_voc_tiny/yolox_voc_tiny.py @@ -14,11 +14,9 @@ class Exp(MyExp): def __init__(self): super(Exp, self).__init__() - self.num_classes = 20 self.depth = 0.33 - self.width = 0.75 - self.input_size = (416, 416) - self.mosaic_scale = (0.5, 1.5) + self.width = 0.375 + self.scale = (0.5, 1.5) self.random_size = (10, 20) self.test_size = (416, 416) self.warmup_epochs = 1 @@ -30,25 +28,6 @@ def __init__(self): #self.flip_prob = 0.5 self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] - def get_model(self, sublinear=False): - - def init_yolo(M): - for m in M.modules(): - if isinstance(m, nn.BatchNorm2d): - m.eps = 1e-3 - m.momentum = 0.03 - if "model" not in self.__dict__: - from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead - in_channels = [256, 512, 1024] - # NANO model use depthwise = True, which is main difference. 
- backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels, depthwise=True) - head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels, depthwise=True) - self.model = YOLOX(backbone, head) - - self.model.apply(init_yolo) - self.model.head.initialize_biases(1e-2) - return self.model - def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img=False): from yolox.data import ( VOCDetection, From 96de204cb4a8fb9fce7fb76324f35e2ceea01d98 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sun, 10 Jul 2022 14:02:00 +0800 Subject: [PATCH 33/59] Update yolox_voc_nano_adam.py --- .../yolox_voc_nano_adam.py | 83 +++++++++++-------- 1 file changed, 48 insertions(+), 35 deletions(-) diff --git a/exps/example/custom/voc_format/yolox_voc_nano_adam/yolox_voc_nano_adam.py b/exps/example/custom/voc_format/yolox_voc_nano_adam/yolox_voc_nano_adam.py index 1663c88f9..11b82f646 100644 --- a/exps/example/custom/voc_format/yolox_voc_nano_adam/yolox_voc_nano_adam.py +++ b/exps/example/custom/voc_format/yolox_voc_nano_adam/yolox_voc_nano_adam.py @@ -3,27 +3,33 @@ # Copyright (c) Megvii, Inc. and its affiliates. import os -import random -import torch.nn as nn + import torch import torch.distributed as dist -import sys -sys.path.append(r'D:/YOLOX') -from yolox.exp import Exp as MyExp + from yolox.data import get_yolox_datadir +from yolox.exp import Exp as MyExp class Exp(MyExp): def __init__(self): super(Exp, self).__init__() - self.num_classes = 1 + self.num_classes = 20 self.depth = 0.33 self.width = 0.25 - self.scale = (0.5, 1.5) + self.input_size = (416, 416) + self.mosaic_scale = (0.5, 1.5) self.random_size = (10, 20) + self.test_size = (416, 416) self.eps = 1e-8 - self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + self.warmup_epochs = 1 + # ---------- transform config ------------ # + #self.mosaic_prob = 1.0 self.enable_mixup = False + #self.mixup_prob = 1.0 + #self.hsv_prob = 1.0 + #self.flip_prob = 0.5 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] def get_model(self, sublinear=False): @@ -43,7 +49,7 @@ def init_yolo(M): self.model.apply(init_yolo) self.model.head.initialize_biases(1e-2) return self.model - + def get_optimizer(self, batch_size): if "optimizer" not in self.__dict__: if self.warmup_epochs > 0: @@ -69,10 +75,9 @@ def get_optimizer(self, batch_size): ) # add pg1 with weight_decay optimizer.add_param_group({"params": pg2}) self.optimizer = optimizer - return self.optimizer - - def get_data_loader(self, batch_size, is_distributed, no_aug=False): + + def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img=False): from yolox.data import ( VOCDetection, TrainTransform, @@ -80,34 +85,42 @@ def get_data_loader(self, batch_size, is_distributed, no_aug=False): DataLoader, InfiniteSampler, MosaicDetection, + worker_init_reset_seed, ) - - dataset = VOCDetection( - data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"), - image_sets=[('2007', 'trainval'), ('2012', 'trainval')], - img_size=self.input_size, - preproc=TrainTransform( - rgb_means=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), - max_labels=50, - ), + from yolox.utils import ( + wait_for_the_master, + get_local_rank, ) + local_rank = get_local_rank() + + with wait_for_the_master(local_rank): + dataset = VOCDetection( + data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"), + image_sets=[('2007', 'trainval'), ('2012', 'trainval')], + img_size=self.input_size, + 
preproc=TrainTransform( + max_labels=50, + flip_prob=self.flip_prob, + hsv_prob=self.hsv_prob), + cache=cache_img, + ) dataset = MosaicDetection( dataset, mosaic=not no_aug, img_size=self.input_size, preproc=TrainTransform( - rgb_means=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), max_labels=120, - ), + flip_prob=self.flip_prob, + hsv_prob=self.hsv_prob), degrees=self.degrees, translate=self.translate, - scale=self.scale, + mosaic_scale=self.mosaic_scale, + mixup_scale=self.mixup_scale, shear=self.shear, - perspective=self.perspective, enable_mixup=self.enable_mixup, + mosaic_prob=self.mosaic_prob, + mixup_prob=self.mixup_prob, ) self.dataset = dataset @@ -123,27 +136,27 @@ def get_data_loader(self, batch_size, is_distributed, no_aug=False): sampler=sampler, batch_size=batch_size, drop_last=False, - input_dimension=self.input_size, mosaic=not no_aug, ) dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} dataloader_kwargs["batch_sampler"] = batch_sampler + + # Make sure each process has different random seed, especially for 'fork' method + dataloader_kwargs["worker_init_fn"] = worker_init_reset_seed + train_loader = DataLoader(self.dataset, **dataloader_kwargs) return train_loader - def get_eval_loader(self, batch_size, is_distributed, testdev=False): + def get_eval_loader(self, batch_size, is_distributed, testdev=False, legacy=False): from yolox.data import VOCDetection, ValTransform valdataset = VOCDetection( data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"), image_sets=[('2007', 'test')], img_size=self.test_size, - preproc=ValTransform( - rgb_means=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), - ), + preproc=ValTransform(legacy=legacy), ) if is_distributed: @@ -164,10 +177,10 @@ def get_eval_loader(self, batch_size, is_distributed, testdev=False): return val_loader - def get_evaluator(self, batch_size, is_distributed, testdev=False): + def get_evaluator(self, batch_size, is_distributed, testdev=False, legacy=False): from yolox.evaluators import VOCEvaluator - val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) + val_loader = self.get_eval_loader(batch_size, is_distributed, testdev, legacy) evaluator = VOCEvaluator( dataloader=val_loader, img_size=self.test_size, From 9a858cd1a50b6708af24d5cb57935caa91d126d0 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sun, 10 Jul 2022 14:08:36 +0800 Subject: [PATCH 34/59] Update demo.py --- tools/demo.py | 87 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 53 insertions(+), 34 deletions(-) diff --git a/tools/demo.py b/tools/demo.py index 1e505a3aa..0e966900c 100644 --- a/tools/demo.py +++ b/tools/demo.py @@ -2,21 +2,20 @@ # -*- coding:utf-8 -*- # Copyright (c) Megvii, Inc. and its affiliates. 
+import argparse +import os +import time from loguru import logger import cv2 import torch -from yolox.data.data_augment import preproc -from yolox.data.datasets import COCO_CLASSES, VOC_CLASSES +from yolox.data.data_augment import ValTransform +from yolox.data.datasets import COCO_CLASSES from yolox.exp import get_exp from yolox.utils import fuse_model, get_model_info, postprocess, vis -import argparse -import os -import time - IMAGE_EXT = [".jpg", ".jpeg", ".webp", ".bmp", ".png"] @@ -44,7 +43,7 @@ def make_parser(): "--exp_file", default=None, type=str, - help="pls input your expriment description file", + help="please input your experiment description file", ) parser.add_argument("-c", "--ckpt", default=None, type=str, help="ckpt for eval") parser.add_argument( @@ -53,8 +52,8 @@ def make_parser(): type=str, help="device to run our model, can either be cpu or gpu", ) - parser.add_argument("--conf", default=None, type=float, help="test conf") - parser.add_argument("--nms", default=None, type=float, help="test nms threshold") + parser.add_argument("--conf", default=0.3, type=float, help="test conf") + parser.add_argument("--nms", default=0.3, type=float, help="test nms threshold") parser.add_argument("--tsize", default=None, type=int, help="test img size") parser.add_argument( "--fp16", @@ -63,6 +62,13 @@ def make_parser(): action="store_true", help="Adopting mix precision evaluating.", ) + parser.add_argument( + "--legacy", + dest="legacy", + default=False, + action="store_true", + help="To be compatible with older versions", + ) parser.add_argument( "--fuse", dest="fuse", @@ -96,10 +102,12 @@ def __init__( self, model, exp, - cls_names=VOC_CLASSES, + cls_names=COCO_CLASSES, trt_file=None, decoder=None, device="cpu", + fp16=False, + legacy=False, ): self.model = model self.cls_names = cls_names @@ -109,6 +117,8 @@ def __init__( self.nmsthre = exp.nmsthre self.test_size = exp.test_size self.device = device + self.fp16 = fp16 + self.preproc = ValTransform(legacy=legacy) if trt_file is not None: from torch2trt import TRTModule @@ -118,8 +128,6 @@ def __init__( x = torch.ones(1, 3, exp.test_size[0], exp.test_size[1]).cuda() self.model(x) self.model = model_trt - self.rgb_means = (0.485, 0.456, 0.406) - self.std = (0.229, 0.224, 0.225) def inference(self, img): img_info = {"id": 0} @@ -134,23 +142,25 @@ def inference(self, img): img_info["width"] = width img_info["raw_img"] = img - img, ratio = preproc(img, self.test_size, self.rgb_means, self.std) + ratio = min(self.test_size[0] / img.shape[0], self.test_size[1] / img.shape[1]) img_info["ratio"] = ratio + + img, _ = self.preproc(img, None, self.test_size) img = torch.from_numpy(img).unsqueeze(0) + img = img.float() if self.device == "gpu": img = img.cuda() + if self.fp16: + img = img.half() # to FP16 with torch.no_grad(): t0 = time.time() outputs = self.model(img) - #print(type(outputs)) # torch.Tensor - print(len(outputs)) # 1 - print(outputs.shape) # (1,8400,6) - print(outputs.tolist()) # print complete list. 
if self.decoder is not None: outputs = self.decoder(outputs, dtype=outputs.type()) outputs = postprocess( - outputs, self.num_classes, self.confthre, self.nmsthre + outputs, self.num_classes, self.confthre, + self.nmsthre, class_agnostic=True ) logger.info("Infer time: {:.4f}s".format(time.time() - t0)) return outputs, img_info @@ -183,7 +193,6 @@ def visual(self, output, img_info, cls_conf=0.35): vis_res = vis(img, bboxes, scores, cls, cls_conf, self.cls_names) return vis_res - def image_demo(predictor, vis_folder, path, current_time, save_result): if os.path.isdir(path): files = get_image_list(path) @@ -211,18 +220,19 @@ def imageflow_demo(predictor, vis_folder, current_time, args): width = cap.get(cv2.CAP_PROP_FRAME_WIDTH) # float height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT) # float fps = cap.get(cv2.CAP_PROP_FPS) - save_folder = os.path.join( - vis_folder, time.strftime("%Y_%m_%d_%H_%M_%S", current_time) - ) - os.makedirs(save_folder, exist_ok=True) - if args.demo == "video": - save_path = os.path.join(save_folder, args.path.split("/")[-1]) - else: - save_path = os.path.join(save_folder, "camera.mp4") - logger.info(f"video save_path is {save_path}") - vid_writer = cv2.VideoWriter( - save_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (int(width), int(height)) - ) + if args.save_result: + save_folder = os.path.join( + vis_folder, time.strftime("%Y_%m_%d_%H_%M_%S", current_time) + ) + os.makedirs(save_folder, exist_ok=True) + if args.demo == "video": + save_path = os.path.join(save_folder, os.path.basename(args.path)) + else: + save_path = os.path.join(save_folder, "camera.mp4") + logger.info(f"video save_path is {save_path}") + vid_writer = cv2.VideoWriter( + save_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (int(width), int(height)) + ) while True: ret_val, frame = cap.read() if ret_val: @@ -230,7 +240,10 @@ def imageflow_demo(predictor, vis_folder, current_time, args): result_frame = predictor.visual(outputs[0], img_info, predictor.confthre) if args.save_result: vid_writer.write(result_frame) - ch = cv2.waitKey(1) + else: + cv2.namedWindow("yolox", cv2.WINDOW_NORMAL) + cv2.imshow("yolox", result_frame) + ch = cv2.waitKey(1000) if ch == 27 or ch == ord("q") or ch == ord("Q"): break else: @@ -244,6 +257,7 @@ def main(exp, args): file_name = os.path.join(exp.output_dir, args.experiment_name) os.makedirs(file_name, exist_ok=True) + vis_folder = None if args.save_result: vis_folder = os.path.join(file_name, "vis_res") os.makedirs(vis_folder, exist_ok=True) @@ -265,11 +279,13 @@ def main(exp, args): if args.device == "gpu": model.cuda() + if args.fp16: + model.half() # to FP16 model.eval() if not args.trt: if args.ckpt is None: - ckpt_file = os.path.join(file_name, "best_ckpt.pth.tar") + ckpt_file = os.path.join(file_name, "best_ckpt.pth") else: ckpt_file = args.ckpt logger.info("loading checkpoint") @@ -295,7 +311,10 @@ def main(exp, args): trt_file = None decoder = None - predictor = Predictor(model, exp, VOC_CLASSES, trt_file, decoder, args.device) + predictor = Predictor( + model, exp, COCO_CLASSES, trt_file, decoder, + args.device, args.fp16, args.legacy, + ) current_time = time.localtime() if args.demo == "image": image_demo(predictor, vis_folder, args.path, current_time, args.save_result) From 4a1acd4f687adc3e05f344f878697ae7b6e33e38 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sun, 10 Jul 2022 14:20:23 +0800 Subject: [PATCH 35/59] Update coco.py --- yolox/data/datasets/coco.py | 147 ++++++++++++++++++++++++++++++------ 1 file changed, 123 
insertions(+), 24 deletions(-) diff --git a/yolox/data/datasets/coco.py b/yolox/data/datasets/coco.py index c3381724a..5ead905e4 100644 --- a/yolox/data/datasets/coco.py +++ b/yolox/data/datasets/coco.py @@ -2,16 +2,36 @@ # -*- coding:utf-8 -*- # Copyright (c) Megvii, Inc. and its affiliates. +import os +from loguru import logger + import cv2 import numpy as np from pycocotools.coco import COCO -import os - from ..dataloading import get_yolox_datadir from .datasets_wrapper import Dataset +def remove_useless_info(coco): + """ + Remove useless info in coco dataset. COCO object is modified inplace. + This function is mainly used for saving memory (save about 30% mem). + """ + if isinstance(coco, COCO): + dataset = coco.dataset + dataset.pop("info", None) + dataset.pop("licenses", None) + for img in dataset["images"]: + img.pop("license", None) + img.pop("coco_url", None) + img.pop("date_captured", None) + img.pop("flickr_url", None) + if "annotations" in coco.dataset: + for anno in coco.dataset["annotations"]: + anno.pop("segmentation", None) + + class COCODataset(Dataset): """ COCO dataset class. @@ -24,6 +44,7 @@ def __init__( name="train", img_size=(416, 416), preproc=None, + cache=False, ): """ COCO dataset initialization. Annotation data are read into memory by COCO API. @@ -41,21 +62,77 @@ def __init__( self.json_file = json_file self.coco = COCO(os.path.join(self.data_dir, "annotations", self.json_file)) + remove_useless_info(self.coco) self.ids = self.coco.getImgIds() self.class_ids = sorted(self.coco.getCatIds()) - cats = self.coco.loadCats(self.coco.getCatIds()) - self._classes = tuple([c["name"] for c in cats]) - self.annotations = self._load_coco_annotations() + self.cats = self.coco.loadCats(self.coco.getCatIds()) + self._classes = tuple([c["name"] for c in self.cats]) + self.imgs = None self.name = name self.img_size = img_size self.preproc = preproc + self.annotations = self._load_coco_annotations() + if cache: + self._cache_images() def __len__(self): return len(self.ids) + def __del__(self): + del self.imgs + def _load_coco_annotations(self): return [self.load_anno_from_ids(_ids) for _ids in self.ids] + def _cache_images(self): + logger.warning( + "\n********************************************************************************\n" + "You are using cached images in RAM to accelerate training.\n" + "This requires large system RAM.\n" + "Make sure you have 200G+ RAM and 136G available disk space for training COCO.\n" + "********************************************************************************\n" + ) + max_h = self.img_size[0] + max_w = self.img_size[1] + cache_file = os.path.join(self.data_dir, f"img_resized_cache_{self.name}.array") + if not os.path.exists(cache_file): + logger.info( + "Caching images for the first time. This might take about 20 minutes for COCO" + ) + self.imgs = np.memmap( + cache_file, + shape=(len(self.ids), max_h, max_w, 3), + dtype=np.uint8, + mode="w+", + ) + from tqdm import tqdm + from multiprocessing.pool import ThreadPool + + NUM_THREADs = min(8, os.cpu_count()) + loaded_images = ThreadPool(NUM_THREADs).imap( + lambda x: self.load_resized_img(x), + range(len(self.annotations)), + ) + pbar = tqdm(enumerate(loaded_images), total=len(self.annotations)) + for k, out in pbar: + self.imgs[k][: out.shape[0], : out.shape[1], :] = out.copy() + self.imgs.flush() + pbar.close() + else: + logger.warning( + "You are using cached imgs! 
Make sure your dataset is not changed!!\n" + "Everytime the self.input_size is changed in your exp file, you need to delete\n" + "the cached data and re-generate them.\n" + ) + + logger.info("Loading cached imgs...") + self.imgs = np.memmap( + cache_file, + shape=(len(self.ids), max_h, max_w, 3), + dtype=np.uint8, + mode="r+", + ) + def load_anno_from_ids(self, id_): im_ann = self.coco.loadImgs(id_)[0] width = im_ann["width"] @@ -66,8 +143,8 @@ def load_anno_from_ids(self, id_): for obj in annotations: x1 = np.max((0, obj["bbox"][0])) y1 = np.max((0, obj["bbox"][1])) - x2 = np.min((width - 1, x1 + np.max((0, obj["bbox"][2] - 1)))) - y2 = np.min((height - 1, y1 + np.max((0, obj["bbox"][3] - 1)))) + x2 = np.min((width, x1 + np.max((0, obj["bbox"][2])))) + y2 = np.min((height, y1 + np.max((0, obj["bbox"][3])))) if obj["area"] > 0 and x2 >= x1 and y2 >= y1: obj["clean_bbox"] = [x1, y1, x2, y2] objs.append(obj) @@ -81,32 +158,56 @@ def load_anno_from_ids(self, id_): res[ix, 0:4] = obj["clean_bbox"] res[ix, 4] = cls - img_info = (height, width) + r = min(self.img_size[0] / height, self.img_size[1] / width) + res[:, :4] *= r - file_name = im_ann["file_name"] if "file_name" in im_ann else "{:012}".format(id_) + ".jpg" + img_info = (height, width) + resized_info = (int(height * r), int(width * r)) - del im_ann, annotations + file_name = ( + im_ann["file_name"] + if "file_name" in im_ann + else "{:012}".format(id_) + ".jpg" + ) - return (res, img_info, file_name) + return (res, img_info, resized_info, file_name) def load_anno(self, index): return self.annotations[index][0] - def pull_item(self, index): - id_ = self.ids[index] + def load_resized_img(self, index): + img = self.load_image(index) + r = min(self.img_size[0] / img.shape[0], self.img_size[1] / img.shape[1]) + resized_img = cv2.resize( + img, + (int(img.shape[1] * r), int(img.shape[0] * r)), + interpolation=cv2.INTER_LINEAR, + ).astype(np.uint8) + return resized_img - res, img_info, file_name = self.annotations[index] - # load image and preprocess - img_file = os.path.join( - self.data_dir, self.name, file_name - ) + def load_image(self, index): + file_name = self.annotations[index][3] + + img_file = os.path.join(self.data_dir, self.name, file_name) img = cv2.imread(img_file) - assert img is not None + assert img is not None, f"file named {img_file} not found" + + return img + + def pull_item(self, index): + id_ = self.ids[index] + + res, img_info, resized_info, _ = self.annotations[index] + if self.imgs is not None: + pad_img = self.imgs[index] + img = pad_img[: resized_info[0], : resized_info[1], :].copy() + else: + img = self.load_resized_img(index) - return img, res, img_info, np.array([id_]) + return img, res.copy(), img_info, np.array([id_]) - @Dataset.resize_getitem + @Dataset.mosaic_getitem def __getitem__(self, index): """ One image / label pair for the given index is picked up and pre-processed. @@ -122,10 +223,8 @@ def __getitem__(self, index): class (float): class index. xc, yc (float) : center of bbox whose values range from 0 to 1. w, h (float) : size of bbox whose values range from 0 to 1. - info_img : tuple of h, w, nh, nw, dx, dy. + info_img : tuple of h, w. h, w (int): original shape of the image - nh, nw (int): shape of the resized image without padding - dx, dy (int): pad size img_id (int): same as the input index. Used for evaluation. 
""" img, target, img_info, img_id = self.pull_item(index) From 843ce1bf50365998e4eb0827e17fb665c417eba8 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sun, 10 Jul 2022 14:27:15 +0800 Subject: [PATCH 36/59] Update voc.py From 7643143fa00d8e6900200b9bf71f78f2d69b1cc8 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sun, 10 Jul 2022 14:33:18 +0800 Subject: [PATCH 37/59] Update voc.py --- yolox/data/datasets/voc.py | 153 ++++++++++++++++++++++++++++++------- 1 file changed, 125 insertions(+), 28 deletions(-) diff --git a/yolox/data/datasets/voc.py b/yolox/data/datasets/voc.py index 465664aad..09e9833de 100644 --- a/yolox/data/datasets/voc.py +++ b/yolox/data/datasets/voc.py @@ -6,15 +6,16 @@ # Copyright (c) Ellis Brown, Max deGroot. # Copyright (c) Megvii, Inc. and its affiliates. -import cv2 -import numpy as np - -from yolox.evaluators.voc_eval import voc_eval - import os import os.path import pickle import xml.etree.ElementTree as ET +from loguru import logger + +import cv2 +import numpy as np + +from yolox.evaluators.voc_eval import voc_eval from .datasets_wrapper import Dataset from .voc_classes import VOC_CLASSES @@ -35,7 +36,9 @@ class AnnotationTransform(object): """ def __init__(self, class_to_ind=None, keep_difficult=True): - self.class_to_ind = class_to_ind or dict(zip(VOC_CLASSES, range(len(VOC_CLASSES)))) + self.class_to_ind = class_to_ind or dict( + zip(VOC_CLASSES, range(len(VOC_CLASSES))) + ) self.keep_difficult = keep_difficult def __call__(self, target): @@ -48,16 +51,20 @@ def __call__(self, target): """ res = np.empty((0, 5)) for obj in target.iter("object"): - difficult = int(obj.find("difficult").text) == 1 + difficult = obj.find("difficult") + if difficult is not None: + difficult = int(difficult.text) == 1 + else: + difficult = False if not self.keep_difficult and difficult: continue - name = obj.find("name").text.lower().strip() + name = obj.find("name").text.strip() bbox = obj.find("bndbox") pts = ["xmin", "ymin", "xmax", "ymax"] bndbox = [] for i, pt in enumerate(pts): - cur_pt = int(bbox.find(pt).text) - 1 + cur_pt = int(float(bbox.find(pt).text)) - 1 # scale height or width # cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height bndbox.append(cur_pt) @@ -66,7 +73,11 @@ def __call__(self, target): res = np.vstack((res, bndbox)) # [xmin, ymin, xmax, ymax, label_ind] # img_id = target.find('filename').text[:-4] - return res # [[xmin, ymin, xmax, ymax, label_ind], ... 
] + width = int(target.find("size").find("width").text) + height = int(target.find("size").find("height").text) + img_info = (height, width) + + return res, img_info class VOCDetection(Dataset): @@ -91,11 +102,12 @@ class VOCDetection(Dataset): def __init__( self, data_dir, - image_sets=[('2007', 'trainval'), ('2012', 'trainval')], + image_sets=[("2007", "trainval"), ("2012", "trainval")], img_size=(416, 416), preproc=None, target_transform=AnnotationTransform(), dataset_name="VOC0712", + cache=False, ): super().__init__(img_size) self.root = data_dir @@ -123,17 +135,100 @@ def __init__( ): self.ids.append((rootpath, line.strip())) """ + self.annotations = self._load_coco_annotations() + self.imgs = None + if cache: + self._cache_images() def __len__(self): return len(self.ids) - def load_anno(self, index): + def _load_coco_annotations(self): + return [self.load_anno_from_ids(_ids) for _ids in range(len(self.ids))] + + def _cache_images(self): + logger.warning( + "\n********************************************************************************\n" + "You are using cached images in RAM to accelerate training.\n" + "This requires large system RAM.\n" + "Make sure you have 60G+ RAM and 19G available disk space for training VOC.\n" + "********************************************************************************\n" + ) + max_h = self.img_size[0] + max_w = self.img_size[1] + cache_file = os.path.join(self.root, f"img_resized_cache_{self.name}.array") + if not os.path.exists(cache_file): + logger.info( + "Caching images for the first time. This might take about 3 minutes for VOC" + ) + self.imgs = np.memmap( + cache_file, + shape=(len(self.ids), max_h, max_w, 3), + dtype=np.uint8, + mode="w+", + ) + from tqdm import tqdm + from multiprocessing.pool import ThreadPool + + NUM_THREADs = min(8, os.cpu_count()) + loaded_images = ThreadPool(NUM_THREADs).imap( + lambda x: self.load_resized_img(x), + range(len(self.annotations)), + ) + pbar = tqdm(enumerate(loaded_images), total=len(self.annotations)) + for k, out in pbar: + self.imgs[k][: out.shape[0], : out.shape[1], :] = out.copy() + self.imgs.flush() + pbar.close() + else: + logger.warning( + "You are using cached imgs! 
Make sure your dataset is not changed!!\n" + "Everytime the self.input_size is changed in your exp file, you need to delete\n" + "the cached data and re-generate them.\n" + ) + + logger.info("Loading cached imgs...") + self.imgs = np.memmap( + cache_file, + shape=(len(self.ids), max_h, max_w, 3), + dtype=np.uint8, + mode="r+", + ) + + def load_anno_from_ids(self, index): img_id = self.ids[index] target = ET.parse(self._annopath % img_id).getroot() - if self.target_transform is not None: - target = self.target_transform(target) - return target + assert self.target_transform is not None + res, img_info = self.target_transform(target) + height, width = img_info + + r = min(self.img_size[0] / height, self.img_size[1] / width) + res[:, :4] *= r + resized_info = (int(height * r), int(width * r)) + + return (res, img_info, resized_info) + + def load_anno(self, index): + return self.annotations[index][0] + + def load_resized_img(self, index): + img = self.load_image(index) + r = min(self.img_size[0] / img.shape[0], self.img_size[1] / img.shape[1]) + resized_img = cv2.resize( + img, + (int(img.shape[1] * r), int(img.shape[0] * r)), + interpolation=cv2.INTER_LINEAR, + ).astype(np.uint8) + + return resized_img + + def load_image(self, index): + img_id = self.ids[index] + img = cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) + assert img is not None, f"file named {self._imgpath % img_id} not found" + + return img def pull_item(self, index): """Returns the original image and target at an index for mixup @@ -146,17 +241,17 @@ def pull_item(self, index): Return: img, target """ - img_id = self.ids[index] - img = cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) - height, width, _ = img.shape - - target = self.load_anno(index) - - img_info = (height, width) + if self.imgs is not None: + target, img_info, resized_info = self.annotations[index] + pad_img = self.imgs[index] + img = pad_img[: resized_info[0], : resized_info[1], :].copy() + else: + img = self.load_resized_img(index) + target, img_info, _ = self.annotations[index] return img, target, img_info, index - @Dataset.resize_getitem + @Dataset.mosaic_getitem def __getitem__(self, index): img, target, img_info, img_id = self.pull_item(index) @@ -175,7 +270,9 @@ def evaluate_detections(self, all_boxes, output_dir=None): all_boxes[class][image] = [] or np.array of shape #dets x 5 """ self._write_voc_results_file(all_boxes) - IouTh = np.linspace(0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1, endpoint=True) + IouTh = np.linspace( + 0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1, endpoint=True + ) mAPs = [] for iou in IouTh: mAP = self._do_python_eval(output_dir, iou) @@ -191,6 +288,7 @@ def _get_voc_results_file_template(self): filename = "comp4_det_test" + "_{:s}.txt" #filedir = os.path.join(self.root, "results", "VOC" + self._year, "Main") filedir = os.path.join(self.root, "results") + #filedir = os.path.join(self.root, "results", "VOC" + self._year, "Main") if not os.path.exists(filedir): os.makedirs(filedir) path = os.path.join(filedir, filename) @@ -224,8 +322,8 @@ def _write_voc_results_file(self, all_boxes): def _do_python_eval(self, output_dir="output", iou=0.5): #rootpath = os.path.join(self.root, "VOC" + self._year) rootpath = self.root - name = self.image_set[0] - annopath = os.path.join(rootpath, "Annotations", "{}.xml") + name = self.image_set[0][1] + annopath = os.path.join(rootpath, "Annotations", "{:s}.xml") imagesetfile = os.path.join(rootpath, "ImageSets", "Main", name + ".txt") cachedir = os.path.join( #self.root, 
"annotations_cache", "VOC" + self._year, name @@ -235,8 +333,7 @@ def _do_python_eval(self, output_dir="output", iou=0.5): os.makedirs(cachedir) aps = [] # The PASCAL VOC metric changed in 2010 - use_07_metric = True - #use_07_metric = True if int(self._year) < 2010 else False + use_07_metric = True if int(self._year) < 2010 else False print("Eval IoU : {:.2f}".format(iou)) if output_dir is not None and not os.path.isdir(output_dir): os.mkdir(output_dir) From 82e70983f58b3850b65eb6892a21d9ab1570f821 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sun, 10 Jul 2022 14:34:42 +0800 Subject: [PATCH 38/59] Update visualize.py --- yolox/utils/visualize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yolox/utils/visualize.py b/yolox/utils/visualize.py index b31741f3d..4be7d616c 100644 --- a/yolox/utils/visualize.py +++ b/yolox/utils/visualize.py @@ -27,7 +27,7 @@ def vis(img, boxes, scores, cls_ids, conf=0.5, class_names=None): y1 = int(box[3]) color = (_COLORS[cls_id] * 255).astype(np.uint8).tolist() - text = '{}:{:.1f}%'.format(class_names[cls_id], score * 100) + text = '{:.1f}%'.format(score * 100)#'{}:{:.1f}%'.format(class_names[cls_id], score * 100) txt_color = (0, 0, 0) if np.mean(_COLORS[cls_id]) > 0.5 else (255, 255, 255) font = cv2.FONT_HERSHEY_SIMPLEX From ea7a361350586dcd23d8800b5d22200b497e39c4 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sun, 10 Jul 2022 14:37:23 +0800 Subject: [PATCH 39/59] Update yolox_base.py --- yolox/exp/yolox_base.py | 191 ++++++++++++---------------------------- 1 file changed, 55 insertions(+), 136 deletions(-) diff --git a/yolox/exp/yolox_base.py b/yolox/exp/yolox_base.py index 5029f42e7..31308a077 100644 --- a/yolox/exp/yolox_base.py +++ b/yolox/exp/yolox_base.py @@ -1,14 +1,14 @@ #!/usr/bin/env python3 # -*- coding:utf-8 -*- -# Copyright (c) Megvii Inc. All rights reserved. - -import os -import random +# Copyright (c) 2014-2021 Megvii Inc. All rights reserved. import torch import torch.distributed as dist import torch.nn as nn +import os +import random + from .base_exp import BaseExp @@ -17,106 +17,50 @@ def __init__(self): super().__init__() # ---------------- model config ---------------- # - # detect classes number of model - self.num_classes = 80 - # factor of model depth + self.num_classes = 1 self.depth = 1.00 - # factor of model width self.width = 1.00 - # activation name. For example, if using "relu", then "silu" will be replaced to "relu". - """ - self.act = "silu" - """ + # ---------------- dataloader config ---------------- # # set worker to 4 for shorter dataloader init time - # If your training process cost many memory, reduce this value. self.data_num_workers = 4 - self.input_size = (640, 640) # (height, width) - # Actual multiscale ranges: [640 - 5 * 32, 640 + 5 * 32]. - # To disable multiscale training, set the value to 0. 
- """ - self.multiscale_range = 5 - """ - # You can uncomment this line to specify a multiscale range - # self.random_size = (14, 26) - # dir of dataset images, if data_dir is None, this project will use `datasets` dir - """ - self.data_dir = None - """ - # name of annotation file for training + self.input_size = (640, 640) + self.random_size = (14, 26) self.train_ann = "instances_train2017.json" - # name of annotation file for evaluation self.val_ann = "instances_val2017.json" - # name of annotation file for testing - """ - self.test_ann = "instances_test2017.json" - """ + # --------------- transform config ----------------- # - """ - # prob of applying mosaic aug - self.mosaic_prob = 1.0 - # prob of applying mixup aug - self.mixup_prob = 1.0 - # prob of applying hsv aug - self.hsv_prob = 1.0 - # prob of applying flip aug - self.flip_prob = 0.5 - # rotation angle range, for example, if set to 2, the true range is (-2, 2) - """ self.degrees = 10.0 - # translate range, for example, if set to 0.1, the true range is (-0.1, 0.1) self.translate = 0.1 - self.mosaic_scale = (0.1, 2) - # apply mixup aug or not - self.enable_mixup = True - self.mixup_scale = (0.5, 1.5) - # shear angle range, for example, if set to 2, the true range is (-2, 2) + self.scale = (0.1, 2) + self.mscale = (0.8, 1.6) self.shear = 2.0 + self.perspective = 0.0 + self.enable_mixup = True - # -------------- training config --------------------- # - # epoch number used for warmup + # -------------- training config --------------------- #s self.warmup_epochs = 5 - # max training epoch - self.max_epoch = 300 - # minimum learning rate during warmup + self.max_epoch = 500 self.warmup_lr = 0 - self.min_lr_ratio = 0.05 - # learning rate for one image. During training, lr will multiply batchsize. self.basic_lr_per_img = 0.01 / 64.0 - # name of LRScheduler self.scheduler = "yoloxwarmcos" - # last #epoch to close augmention like mosaic - self.no_aug_epochs = 0 - # apply EMA during training + self.no_aug_epochs = 0#15 + self.min_lr_ratio = 0.05 self.ema = True - # weight decay of optimizer self.weight_decay = 5e-4 - # momentum of optimizer self.momentum = 0.9 - # log period in iter, for example, - # if set to 1, user could see log every iteration. self.print_interval = 10 - # eval period in epoch, for example, - # if set to 1, model will be evaluate after every epoch. self.eval_interval = 10 - # save history checkpoint or not. - # If set to False, yolox will only save latest and best ckpt. 
- self.save_history_ckpt = True - # name of experiment self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] # ----------------- testing config ------------------ # - # output image size during evaluation/test self.test_size = (640, 640) - # confidence threshold during evaluation/test, - # boxes whose scores are less than test_conf will be filtered self.test_conf = 0.01 - # nms threshold self.nmsthre = 0.65 def get_model(self): - from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead + from yolox.models import YOLOPAFPN, YOLOX, YOLOXHead def init_yolo(M): for m in M.modules(): @@ -126,55 +70,50 @@ def init_yolo(M): if getattr(self, "model", None) is None: in_channels = [256, 512, 1024] - backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels, act=self.act) - head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels, act=self.act) + backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels) + head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels) self.model = YOLOX(backbone, head) self.model.apply(init_yolo) self.model.head.initialize_biases(1e-2) - self.model.train() return self.model - def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img=False): + def get_data_loader(self, batch_size, is_distributed, no_aug=False): from yolox.data import ( COCODataset, - TrainTransform, - YoloBatchSampler, DataLoader, InfiniteSampler, MosaicDetection, - worker_init_reset_seed, + TrainTransform, + YoloBatchSampler + ) + + dataset = COCODataset( + data_dir=None, + json_file=self.train_ann, + img_size=self.input_size, + preproc=TrainTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + max_labels=50, + ), ) - from yolox.utils import wait_for_the_master - - with wait_for_the_master(): - dataset = COCODataset( - data_dir=self.data_dir, - json_file=self.train_ann, - img_size=self.input_size, - preproc=TrainTransform( - max_labels=50, - flip_prob=self.flip_prob, - hsv_prob=self.hsv_prob), - cache=cache_img, - ) dataset = MosaicDetection( dataset, mosaic=not no_aug, img_size=self.input_size, preproc=TrainTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), max_labels=120, - flip_prob=self.flip_prob, - hsv_prob=self.hsv_prob), + ), degrees=self.degrees, translate=self.translate, - mosaic_scale=self.mosaic_scale, - mixup_scale=self.mixup_scale, + scale=self.scale, shear=self.shear, + perspective=self.perspective, enable_mixup=self.enable_mixup, - mosaic_prob=self.mosaic_prob, - mixup_prob=self.mixup_prob, ) self.dataset = dataset @@ -188,16 +127,12 @@ def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img=Fa sampler=sampler, batch_size=batch_size, drop_last=False, + input_dimension=self.input_size, mosaic=not no_aug, ) dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} dataloader_kwargs["batch_sampler"] = batch_sampler - - # Make sure each process has different random seed, especially for 'fork' method. - # Check https://github.com/pytorch/pytorch/issues/63311 for more details. 
- dataloader_kwargs["worker_init_fn"] = worker_init_reset_seed - train_loader = DataLoader(self.dataset, **dataloader_kwargs) return train_loader @@ -207,10 +142,6 @@ def random_resize(self, data_loader, epoch, rank, is_distributed): if rank == 0: size_factor = self.input_size[1] * 1.0 / self.input_size[0] - if not hasattr(self, 'random_size'): - min_size = int(self.input_size[0] / 32) - self.multiscale_range - max_size = int(self.input_size[0] / 32) + self.multiscale_range - self.random_size = (min_size, max_size) size = random.randint(*self.random_size) size = (int(32 * size), 32 * int(size * size_factor)) tensor[0] = size[0] @@ -220,20 +151,11 @@ def random_resize(self, data_loader, epoch, rank, is_distributed): dist.barrier() dist.broadcast(tensor, 0) - input_size = (tensor[0].item(), tensor[1].item()) + input_size = data_loader.change_input_dim( + multiple=(tensor[0].item(), tensor[1].item()), random_range=None + ) return input_size - def preprocess(self, inputs, targets, tsize): - scale_y = tsize[0] / self.input_size[0] - scale_x = tsize[1] / self.input_size[1] - if scale_x != 1 or scale_y != 1: - inputs = nn.functional.interpolate( - inputs, size=tsize, mode="bilinear", align_corners=False - ) - targets[..., 1::2] = targets[..., 1::2] * scale_x - targets[..., 2::2] = targets[..., 2::2] * scale_y - return inputs, targets - def get_optimizer(self, batch_size): if "optimizer" not in self.__dict__: if self.warmup_epochs > 0: @@ -277,15 +199,18 @@ def get_lr_scheduler(self, lr, iters_per_epoch): ) return scheduler - def get_eval_loader(self, batch_size, is_distributed, testdev=False, legacy=False): + def get_eval_loader(self, batch_size, is_distributed, testdev=False): from yolox.data import COCODataset, ValTransform valdataset = COCODataset( - data_dir=self.data_dir, - json_file=self.val_ann if not testdev else self.test_ann, - name="val2017" if not testdev else "test2017", + data_dir=None, + json_file=self.val_ann if not testdev else "image_info_test-dev2017.json", + #name="val2017" if not testdev else "test2017", + name="valid" if not testdev else "test", img_size=self.test_size, - preproc=ValTransform(legacy=legacy), + preproc=ValTransform( + rgb_means=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225) + ), ) if is_distributed: @@ -306,10 +231,10 @@ def get_eval_loader(self, batch_size, is_distributed, testdev=False, legacy=Fals return val_loader - def get_evaluator(self, batch_size, is_distributed, testdev=False, legacy=False): + def get_evaluator(self, batch_size, is_distributed, testdev=False): from yolox.evaluators import COCOEvaluator - val_loader = self.get_eval_loader(batch_size, is_distributed, testdev, legacy) + val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) evaluator = COCOEvaluator( dataloader=val_loader, img_size=self.test_size, @@ -320,11 +245,5 @@ def get_evaluator(self, batch_size, is_distributed, testdev=False, legacy=False) ) return evaluator - def get_trainer(self, args): - from yolox.core import Trainer - trainer = Trainer(self, args) - # NOTE: trainer shouldn't be an attribute of exp object - return trainer - - def eval(self, model, evaluator, is_distributed, half=False, return_outputs=False): - return evaluator.evaluate(model, is_distributed, half, return_outputs=return_outputs) + def eval(self, model, evaluator, is_distributed, half=False): + return evaluator.evaluate(model, is_distributed, half) From fdc26eb6f8a01af264e8ac8266d9e17f18233f93 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: 
Sun, 10 Jul 2022 14:41:59 +0800 Subject: [PATCH 40/59] Update yolox_base.py --- yolox/exp/yolox_base.py | 176 ++++++++++++++++++++++++++++------------ 1 file changed, 125 insertions(+), 51 deletions(-) diff --git a/yolox/exp/yolox_base.py b/yolox/exp/yolox_base.py index 31308a077..c96e195d0 100644 --- a/yolox/exp/yolox_base.py +++ b/yolox/exp/yolox_base.py @@ -1,14 +1,14 @@ #!/usr/bin/env python3 # -*- coding:utf-8 -*- -# Copyright (c) 2014-2021 Megvii Inc. All rights reserved. +# Copyright (c) Megvii Inc. All rights reserved. + +import os +import random import torch import torch.distributed as dist import torch.nn as nn -import os -import random - from .base_exp import BaseExp @@ -17,50 +17,98 @@ def __init__(self): super().__init__() # ---------------- model config ---------------- # - self.num_classes = 1 + # detect classes number of model + self.num_classes = 80 + # factor of model depth self.depth = 1.00 + # factor of model width self.width = 1.00 + # activation name. For example, if using "relu", then "silu" will be replaced to "relu". + self.act = "silu" # ---------------- dataloader config ---------------- # # set worker to 4 for shorter dataloader init time + # If your training process cost many memory, reduce this value. self.data_num_workers = 4 - self.input_size = (640, 640) - self.random_size = (14, 26) + self.input_size = (640, 640) # (height, width) + # Actual multiscale ranges: [640 - 5 * 32, 640 + 5 * 32]. + # To disable multiscale training, set the value to 0. + self.multiscale_range = 5 + # You can uncomment this line to specify a multiscale range + # self.random_size = (14, 26) + # dir of dataset images, if data_dir is None, this project will use `datasets` dir + self.data_dir = None + # name of annotation file for training self.train_ann = "instances_train2017.json" + # name of annotation file for evaluation self.val_ann = "instances_val2017.json" + # name of annotation file for testing + self.test_ann = "instances_test2017.json" # --------------- transform config ----------------- # + # prob of applying mosaic aug + self.mosaic_prob = 1.0 + # prob of applying mixup aug + self.mixup_prob = 1.0 + # prob of applying hsv aug + self.hsv_prob = 1.0 + # prob of applying flip aug + self.flip_prob = 0.5 + # rotation angle range, for example, if set to 2, the true range is (-2, 2) self.degrees = 10.0 + # translate range, for example, if set to 0.1, the true range is (-0.1, 0.1) self.translate = 0.1 - self.scale = (0.1, 2) - self.mscale = (0.8, 1.6) - self.shear = 2.0 - self.perspective = 0.0 + self.mosaic_scale = (0.1, 2) + # apply mixup aug or not self.enable_mixup = True + self.mixup_scale = (0.5, 1.5) + # shear angle range, for example, if set to 2, the true range is (-2, 2) + self.shear = 2.0 - # -------------- training config --------------------- #s + # -------------- training config --------------------- # + # epoch number used for warmup self.warmup_epochs = 5 - self.max_epoch = 500 + # max training epoch + self.max_epoch = 300 + # minimum learning rate during warmup self.warmup_lr = 0 + self.min_lr_ratio = 0.05 + # learning rate for one image. During training, lr will multiply batchsize. 
self.basic_lr_per_img = 0.01 / 64.0 + # name of LRScheduler self.scheduler = "yoloxwarmcos" - self.no_aug_epochs = 0#15 - self.min_lr_ratio = 0.05 + # last #epoch to close augmention like mosaic + self.no_aug_epochs = 15 + # apply EMA during training self.ema = True + # weight decay of optimizer self.weight_decay = 5e-4 + # momentum of optimizer self.momentum = 0.9 + # log period in iter, for example, + # if set to 1, user could see log every iteration. self.print_interval = 10 + # eval period in epoch, for example, + # if set to 1, model will be evaluate after every epoch. self.eval_interval = 10 + # save history checkpoint or not. + # If set to False, yolox will only save latest and best ckpt. + self.save_history_ckpt = True + # name of experiment self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] # ----------------- testing config ------------------ # + # output image size during evaluation/test self.test_size = (640, 640) + # confidence threshold during evaluation/test, + # boxes whose scores are less than test_conf will be filtered self.test_conf = 0.01 + # nms threshold self.nmsthre = 0.65 def get_model(self): - from yolox.models import YOLOPAFPN, YOLOX, YOLOXHead + from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead def init_yolo(M): for m in M.modules(): @@ -70,50 +118,55 @@ def init_yolo(M): if getattr(self, "model", None) is None: in_channels = [256, 512, 1024] - backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels) - head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels) + backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels, act=self.act) + head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels, act=self.act) self.model = YOLOX(backbone, head) self.model.apply(init_yolo) self.model.head.initialize_biases(1e-2) + self.model.train() return self.model - def get_data_loader(self, batch_size, is_distributed, no_aug=False): + def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img=False): from yolox.data import ( COCODataset, + TrainTransform, + YoloBatchSampler, DataLoader, InfiniteSampler, MosaicDetection, - TrainTransform, - YoloBatchSampler - ) - - dataset = COCODataset( - data_dir=None, - json_file=self.train_ann, - img_size=self.input_size, - preproc=TrainTransform( - rgb_means=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), - max_labels=50, - ), + worker_init_reset_seed, ) + from yolox.utils import wait_for_the_master + + with wait_for_the_master(): + dataset = COCODataset( + data_dir=self.data_dir, + json_file=self.train_ann, + img_size=self.input_size, + preproc=TrainTransform( + max_labels=50, + flip_prob=self.flip_prob, + hsv_prob=self.hsv_prob), + cache=cache_img, + ) dataset = MosaicDetection( dataset, mosaic=not no_aug, img_size=self.input_size, preproc=TrainTransform( - rgb_means=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), max_labels=120, - ), + flip_prob=self.flip_prob, + hsv_prob=self.hsv_prob), degrees=self.degrees, translate=self.translate, - scale=self.scale, + mosaic_scale=self.mosaic_scale, + mixup_scale=self.mixup_scale, shear=self.shear, - perspective=self.perspective, enable_mixup=self.enable_mixup, + mosaic_prob=self.mosaic_prob, + mixup_prob=self.mixup_prob, ) self.dataset = dataset @@ -127,12 +180,16 @@ def get_data_loader(self, batch_size, is_distributed, no_aug=False): sampler=sampler, batch_size=batch_size, drop_last=False, - input_dimension=self.input_size, mosaic=not no_aug, ) dataloader_kwargs = {"num_workers": 
self.data_num_workers, "pin_memory": True} dataloader_kwargs["batch_sampler"] = batch_sampler + + # Make sure each process has different random seed, especially for 'fork' method. + # Check https://github.com/pytorch/pytorch/issues/63311 for more details. + dataloader_kwargs["worker_init_fn"] = worker_init_reset_seed + train_loader = DataLoader(self.dataset, **dataloader_kwargs) return train_loader @@ -142,6 +199,10 @@ def random_resize(self, data_loader, epoch, rank, is_distributed): if rank == 0: size_factor = self.input_size[1] * 1.0 / self.input_size[0] + if not hasattr(self, 'random_size'): + min_size = int(self.input_size[0] / 32) - self.multiscale_range + max_size = int(self.input_size[0] / 32) + self.multiscale_range + self.random_size = (min_size, max_size) size = random.randint(*self.random_size) size = (int(32 * size), 32 * int(size * size_factor)) tensor[0] = size[0] @@ -151,11 +212,20 @@ def random_resize(self, data_loader, epoch, rank, is_distributed): dist.barrier() dist.broadcast(tensor, 0) - input_size = data_loader.change_input_dim( - multiple=(tensor[0].item(), tensor[1].item()), random_range=None - ) + input_size = (tensor[0].item(), tensor[1].item()) return input_size + def preprocess(self, inputs, targets, tsize): + scale_y = tsize[0] / self.input_size[0] + scale_x = tsize[1] / self.input_size[1] + if scale_x != 1 or scale_y != 1: + inputs = nn.functional.interpolate( + inputs, size=tsize, mode="bilinear", align_corners=False + ) + targets[..., 1::2] = targets[..., 1::2] * scale_x + targets[..., 2::2] = targets[..., 2::2] * scale_y + return inputs, targets + def get_optimizer(self, batch_size): if "optimizer" not in self.__dict__: if self.warmup_epochs > 0: @@ -199,18 +269,16 @@ def get_lr_scheduler(self, lr, iters_per_epoch): ) return scheduler - def get_eval_loader(self, batch_size, is_distributed, testdev=False): + def get_eval_loader(self, batch_size, is_distributed, testdev=False, legacy=False): from yolox.data import COCODataset, ValTransform valdataset = COCODataset( - data_dir=None, - json_file=self.val_ann if not testdev else "image_info_test-dev2017.json", + data_dir=self.data_dir, + json_file=self.val_ann if not testdev else self.test_ann, #name="val2017" if not testdev else "test2017", name="valid" if not testdev else "test", img_size=self.test_size, - preproc=ValTransform( - rgb_means=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225) - ), + preproc=ValTransform(legacy=legacy), ) if is_distributed: @@ -231,10 +299,10 @@ def get_eval_loader(self, batch_size, is_distributed, testdev=False): return val_loader - def get_evaluator(self, batch_size, is_distributed, testdev=False): + def get_evaluator(self, batch_size, is_distributed, testdev=False, legacy=False): from yolox.evaluators import COCOEvaluator - val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) + val_loader = self.get_eval_loader(batch_size, is_distributed, testdev, legacy) evaluator = COCOEvaluator( dataloader=val_loader, img_size=self.test_size, @@ -245,5 +313,11 @@ def get_evaluator(self, batch_size, is_distributed, testdev=False): ) return evaluator - def eval(self, model, evaluator, is_distributed, half=False): - return evaluator.evaluate(model, is_distributed, half) + def get_trainer(self, args): + from yolox.core import Trainer + trainer = Trainer(self, args) + # NOTE: trainer shouldn't be an attribute of exp object + return trainer + + def eval(self, model, evaluator, is_distributed, half=False, return_outputs=False): + return evaluator.evaluate(model, 
is_distributed, half, return_outputs=return_outputs) From 18a6d9e5b031b78cf8dc44fe8828c40d9d9ad7f6 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sun, 10 Jul 2022 14:54:16 +0800 Subject: [PATCH 41/59] Update yolox_voc_nano.py --- .../custom/voc_format/yolox_voc_nano/yolox_voc_nano.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/exps/example/custom/voc_format/yolox_voc_nano/yolox_voc_nano.py b/exps/example/custom/voc_format/yolox_voc_nano/yolox_voc_nano.py index 5e4fb127f..0fb62da78 100644 --- a/exps/example/custom/voc_format/yolox_voc_nano/yolox_voc_nano.py +++ b/exps/example/custom/voc_format/yolox_voc_nano/yolox_voc_nano.py @@ -67,8 +67,8 @@ def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img=Fa with wait_for_the_master(local_rank): dataset = VOCDetection( - data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"), - image_sets=[('2007', 'trainval'), ('2012', 'trainval')], + data_dir=os.path.join(get_yolox_datadir(), "pedestrian_voc"), + image_sets=[('train')], img_size=self.input_size, preproc=TrainTransform( max_labels=50, @@ -125,8 +125,8 @@ def get_eval_loader(self, batch_size, is_distributed, testdev=False, legacy=Fals from yolox.data import VOCDetection, ValTransform valdataset = VOCDetection( - data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"), - image_sets=[('2007', 'test')], + data_dir=os.path.join(get_yolox_datadir(), "pedestrian_voc"), + image_sets=[('valid')], img_size=self.test_size, preproc=ValTransform(legacy=legacy), ) From 2adbb068406a25f9540ccf5bc3ad9f8f441aee36 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sun, 10 Jul 2022 14:57:49 +0800 Subject: [PATCH 42/59] Update yolox_voc_nano.py --- .../voc_format/yolox_voc_nano.py | 82 +++++++++++-------- 1 file changed, 50 insertions(+), 32 deletions(-) diff --git a/exps/example/yolox_pedestrian/voc_format/yolox_voc_nano.py b/exps/example/yolox_pedestrian/voc_format/yolox_voc_nano.py index a3d02e62a..0fb62da78 100644 --- a/exps/example/yolox_pedestrian/voc_format/yolox_voc_nano.py +++ b/exps/example/yolox_pedestrian/voc_format/yolox_voc_nano.py @@ -2,23 +2,33 @@ # -*- coding:utf-8 -*- # Copyright (c) Megvii, Inc. and its affiliates. 
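# --- Editorial note (sketch, not part of the patch): in the image_sets values used
# --- above and below, ('train') and ('valid') are not one-element tuples -- a bare
# --- parenthesized string is just the string itself -- so the loader effectively
# --- receives image_sets=['train'] / ['valid'] instead of the stock VOCdevkit
# --- (year, split) pairs such as ('2007', 'trainval'). This presumably relies on a
# --- VOCDetection variant that accepts a plain split name for the custom
# --- pedestrian_voc layout.
# For example:
image_sets = [('train')]
assert image_sets == ['train']   # ('train') is just the string 'train'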
-from yolox.data import get_yolox_datadir -from yolox.exp import Exp as MyExp import os -import random -import torch.nn as nn + import torch import torch.distributed as dist +from yolox.data import get_yolox_datadir +from yolox.exp import Exp as MyExp + class Exp(MyExp): def __init__(self): super(Exp, self).__init__() - self.num_classes = 1 + self.num_classes = 20 self.depth = 0.33 self.width = 0.25 - self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + self.input_size = (416, 416) + self.mosaic_scale = (0.5, 1.5) + self.random_size = (10, 20) + self.test_size = (416, 416) + self.warmup_epochs = 1 + # ---------- transform config ------------ # + #self.mosaic_prob = 1.0 self.enable_mixup = False + #self.mixup_prob = 1.0 + #self.hsv_prob = 1.0 + #self.flip_prob = 0.5 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] def get_model(self, sublinear=False): @@ -39,7 +49,7 @@ def init_yolo(M): self.model.head.initialize_biases(1e-2) return self.model - def get_data_loader(self, batch_size, is_distributed, no_aug=False): + def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img=False): from yolox.data import ( VOCDetection, TrainTransform, @@ -47,40 +57,48 @@ def get_data_loader(self, batch_size, is_distributed, no_aug=False): DataLoader, InfiniteSampler, MosaicDetection, + worker_init_reset_seed, ) - - dataset = VOCDetection( - data_dir=os.path.join(get_yolox_datadir(), "pedestrian_voc"), - image_sets=[('train')], - img_size=self.input_size, - preproc=TrainTransform( - rgb_means=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), - max_labels=50, - ), + from yolox.utils import ( + wait_for_the_master, + get_local_rank, ) + local_rank = get_local_rank() + + with wait_for_the_master(local_rank): + dataset = VOCDetection( + data_dir=os.path.join(get_yolox_datadir(), "pedestrian_voc"), + image_sets=[('train')], + img_size=self.input_size, + preproc=TrainTransform( + max_labels=50, + flip_prob=self.flip_prob, + hsv_prob=self.hsv_prob), + cache=cache_img, + ) dataset = MosaicDetection( dataset, mosaic=not no_aug, img_size=self.input_size, preproc=TrainTransform( - rgb_means=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), max_labels=120, - ), + flip_prob=self.flip_prob, + hsv_prob=self.hsv_prob), degrees=self.degrees, translate=self.translate, - scale=self.scale, + mosaic_scale=self.mosaic_scale, + mixup_scale=self.mixup_scale, shear=self.shear, - perspective=self.perspective, enable_mixup=self.enable_mixup, + mosaic_prob=self.mosaic_prob, + mixup_prob=self.mixup_prob, ) self.dataset = dataset if is_distributed: - batch_size = batch_size // dist.get_world_size() + batch_size = batch_size // dist.get_world_size() sampler = InfiniteSampler( len(self.dataset), seed=self.seed if self.seed else 0 @@ -90,31 +108,31 @@ def get_data_loader(self, batch_size, is_distributed, no_aug=False): sampler=sampler, batch_size=batch_size, drop_last=False, - input_dimension=self.input_size, mosaic=not no_aug, ) dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} dataloader_kwargs["batch_sampler"] = batch_sampler + + # Make sure each process has different random seed, especially for 'fork' method + dataloader_kwargs["worker_init_fn"] = worker_init_reset_seed + train_loader = DataLoader(self.dataset, **dataloader_kwargs) return train_loader - def get_eval_loader(self, batch_size, is_distributed, testdev=False): + def get_eval_loader(self, batch_size, is_distributed, testdev=False, legacy=False): from yolox.data import 
VOCDetection, ValTransform valdataset = VOCDetection( data_dir=os.path.join(get_yolox_datadir(), "pedestrian_voc"), image_sets=[('valid')], img_size=self.test_size, - preproc=ValTransform( - rgb_means=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), - ), + preproc=ValTransform(legacy=legacy), ) if is_distributed: - batch_size = batch_size // dist.get_world_size() + batch_size = batch_size // dist.get_world_size() sampler = torch.utils.data.distributed.DistributedSampler( valdataset, shuffle=False ) @@ -131,10 +149,10 @@ def get_eval_loader(self, batch_size, is_distributed, testdev=False): return val_loader - def get_evaluator(self, batch_size, is_distributed, testdev=False): + def get_evaluator(self, batch_size, is_distributed, testdev=False, legacy=False): from yolox.evaluators import VOCEvaluator - val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) + val_loader = self.get_eval_loader(batch_size, is_distributed, testdev, legacy) evaluator = VOCEvaluator( dataloader=val_loader, img_size=self.test_size, From f3a902bfea311e2972aa03fd20cf45162f01ba2e Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sun, 10 Jul 2022 14:58:26 +0800 Subject: [PATCH 43/59] Update yolox_voc_nano.py --- exps/example/yolox_pedestrian/voc_format/yolox_voc_nano.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/exps/example/yolox_pedestrian/voc_format/yolox_voc_nano.py b/exps/example/yolox_pedestrian/voc_format/yolox_voc_nano.py index 0fb62da78..c38679cca 100644 --- a/exps/example/yolox_pedestrian/voc_format/yolox_voc_nano.py +++ b/exps/example/yolox_pedestrian/voc_format/yolox_voc_nano.py @@ -14,13 +14,13 @@ class Exp(MyExp): def __init__(self): super(Exp, self).__init__() - self.num_classes = 20 + self.num_classes = 1 self.depth = 0.33 self.width = 0.25 - self.input_size = (416, 416) + #self.input_size = (416, 416) self.mosaic_scale = (0.5, 1.5) self.random_size = (10, 20) - self.test_size = (416, 416) + #self.test_size = (416, 416) self.warmup_epochs = 1 # ---------- transform config ------------ # #self.mosaic_prob = 1.0 From 58dd583b129b35f7698b2bd6ad57ce761cc674a9 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sun, 10 Jul 2022 15:02:00 +0800 Subject: [PATCH 44/59] Update yolox_voc_s.py --- .../voc_format/yolox_voc_s.py | 69 +++++++++++-------- 1 file changed, 42 insertions(+), 27 deletions(-) diff --git a/exps/example/yolox_pedestrian/voc_format/yolox_voc_s.py b/exps/example/yolox_pedestrian/voc_format/yolox_voc_s.py index fa27310ab..4801f559d 100644 --- a/exps/example/yolox_pedestrian/voc_format/yolox_voc_s.py +++ b/exps/example/yolox_pedestrian/voc_format/yolox_voc_s.py @@ -1,12 +1,11 @@ # encoding: utf-8 import os -import random + import torch -import torch.nn as nn import torch.distributed as dist -from yolox.exp import Exp as MyExp from yolox.data import get_yolox_datadir +from yolox.exp import Exp as MyExp class Exp(MyExp): @@ -15,9 +14,17 @@ def __init__(self): self.num_classes = 1 self.depth = 0.33 self.width = 0.50 + self.warmup_epochs = 1 + + # ---------- transform config ------------ # + self.mosaic_prob = 1.0 + self.mixup_prob = 1.0 + self.hsv_prob = 1.0 + self.flip_prob = 0.5 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] - def get_data_loader(self, batch_size, is_distributed, no_aug=False): + def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img=False): from yolox.data import ( VOCDetection, TrainTransform, @@ 
-25,34 +32,42 @@ def get_data_loader(self, batch_size, is_distributed, no_aug=False): DataLoader, InfiniteSampler, MosaicDetection, + worker_init_reset_seed, ) - - dataset = VOCDetection( - data_dir=os.path.join(get_yolox_datadir(), "pedestrian_voc"), - image_sets=[('train')], - img_size=self.input_size, - preproc=TrainTransform( - rgb_means=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), - max_labels=50, - ), + from yolox.utils import ( + wait_for_the_master, + get_local_rank, ) + local_rank = get_local_rank() + + with wait_for_the_master(local_rank): + dataset = VOCDetection( + data_dir=os.path.join(get_yolox_datadir(), "pedestrian_voc"), + image_sets=[('train')], + img_size=self.input_size, + preproc=TrainTransform( + max_labels=50, + flip_prob=self.flip_prob, + hsv_prob=self.hsv_prob), + cache=cache_img, + ) dataset = MosaicDetection( dataset, mosaic=not no_aug, img_size=self.input_size, preproc=TrainTransform( - rgb_means=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), max_labels=120, - ), + flip_prob=self.flip_prob, + hsv_prob=self.hsv_prob), degrees=self.degrees, translate=self.translate, - scale=self.scale, + mosaic_scale=self.mosaic_scale, + mixup_scale=self.mixup_scale, shear=self.shear, - perspective=self.perspective, enable_mixup=self.enable_mixup, + mosaic_prob=self.mosaic_prob, + mixup_prob=self.mixup_prob, ) self.dataset = dataset @@ -68,27 +83,27 @@ def get_data_loader(self, batch_size, is_distributed, no_aug=False): sampler=sampler, batch_size=batch_size, drop_last=False, - input_dimension=self.input_size, mosaic=not no_aug, ) dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} dataloader_kwargs["batch_sampler"] = batch_sampler + + # Make sure each process has different random seed, especially for 'fork' method + dataloader_kwargs["worker_init_fn"] = worker_init_reset_seed + train_loader = DataLoader(self.dataset, **dataloader_kwargs) return train_loader - def get_eval_loader(self, batch_size, is_distributed, testdev=False): + def get_eval_loader(self, batch_size, is_distributed, testdev=False, legacy=False): from yolox.data import VOCDetection, ValTransform valdataset = VOCDetection( data_dir=os.path.join(get_yolox_datadir(), "pedestrian_voc"), image_sets=[('valid')], img_size=self.test_size, - preproc=ValTransform( - rgb_means=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), - ), + preproc=ValTransform(legacy=legacy), ) if is_distributed: @@ -109,10 +124,10 @@ def get_eval_loader(self, batch_size, is_distributed, testdev=False): return val_loader - def get_evaluator(self, batch_size, is_distributed, testdev=False): + def get_evaluator(self, batch_size, is_distributed, testdev=False, legacy=False): from yolox.evaluators import VOCEvaluator - val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) + val_loader = self.get_eval_loader(batch_size, is_distributed, testdev, legacy) evaluator = VOCEvaluator( dataloader=val_loader, img_size=self.test_size, From 8dd855bd1ef6d651457d681480d45f56b82678ba Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sun, 10 Jul 2022 15:04:03 +0800 Subject: [PATCH 45/59] Update yolox_voc_nano_adam.py --- .../voc_format/yolox_voc_nano_adam.py | 81 +++++++++++-------- 1 file changed, 48 insertions(+), 33 deletions(-) diff --git a/exps/example/yolox_pedestrian/voc_format/yolox_voc_nano_adam.py b/exps/example/yolox_pedestrian/voc_format/yolox_voc_nano_adam.py index 8e05e67ff..e35d8bd2f 100644 --- 
a/exps/example/yolox_pedestrian/voc_format/yolox_voc_nano_adam.py +++ b/exps/example/yolox_pedestrian/voc_format/yolox_voc_nano_adam.py @@ -3,25 +3,33 @@ # Copyright (c) Megvii, Inc. and its affiliates. import os -import random -import torch.nn as nn + import torch import torch.distributed as dist -from yolox.exp import Exp as MyExp + from yolox.data import get_yolox_datadir +from yolox.exp import Exp as MyExp class Exp(MyExp): def __init__(self): super(Exp, self).__init__() - self.num_classes = 1 + self.num_classes = 20 self.depth = 0.33 self.width = 0.25 - self.scale = (0.5, 1.5) + self.input_size = (416, 416) + self.mosaic_scale = (0.5, 1.5) self.random_size = (10, 20) + self.test_size = (416, 416) self.eps = 1e-8 - self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + self.warmup_epochs = 1 + # ---------- transform config ------------ # + #self.mosaic_prob = 1.0 self.enable_mixup = False + #self.mixup_prob = 1.0 + #self.hsv_prob = 1.0 + #self.flip_prob = 0.5 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] def get_model(self, sublinear=False): @@ -41,7 +49,7 @@ def init_yolo(M): self.model.apply(init_yolo) self.model.head.initialize_biases(1e-2) return self.model - + def get_optimizer(self, batch_size): if "optimizer" not in self.__dict__: if self.warmup_epochs > 0: @@ -67,10 +75,9 @@ def get_optimizer(self, batch_size): ) # add pg1 with weight_decay optimizer.add_param_group({"params": pg2}) self.optimizer = optimizer - return self.optimizer - - def get_data_loader(self, batch_size, is_distributed, no_aug=False): + + def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img=False): from yolox.data import ( VOCDetection, TrainTransform, @@ -78,34 +85,42 @@ def get_data_loader(self, batch_size, is_distributed, no_aug=False): DataLoader, InfiniteSampler, MosaicDetection, + worker_init_reset_seed, ) - - dataset = VOCDetection( - data_dir=os.path.join(get_yolox_datadir(), "pedestrian_voc"), - image_sets=[('train')], - img_size=self.input_size, - preproc=TrainTransform( - rgb_means=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), - max_labels=50, - ), + from yolox.utils import ( + wait_for_the_master, + get_local_rank, ) + local_rank = get_local_rank() + + with wait_for_the_master(local_rank): + dataset = VOCDetection( + data_dir=os.path.join(get_yolox_datadir(), "pedestrian_voc"), + image_sets=[('train')], + img_size=self.input_size, + preproc=TrainTransform( + max_labels=50, + flip_prob=self.flip_prob, + hsv_prob=self.hsv_prob), + cache=cache_img, + ) dataset = MosaicDetection( dataset, mosaic=not no_aug, img_size=self.input_size, preproc=TrainTransform( - rgb_means=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), max_labels=120, - ), + flip_prob=self.flip_prob, + hsv_prob=self.hsv_prob), degrees=self.degrees, translate=self.translate, - scale=self.scale, + mosaic_scale=self.mosaic_scale, + mixup_scale=self.mixup_scale, shear=self.shear, - perspective=self.perspective, enable_mixup=self.enable_mixup, + mosaic_prob=self.mosaic_prob, + mixup_prob=self.mixup_prob, ) self.dataset = dataset @@ -121,27 +136,27 @@ def get_data_loader(self, batch_size, is_distributed, no_aug=False): sampler=sampler, batch_size=batch_size, drop_last=False, - input_dimension=self.input_size, mosaic=not no_aug, ) dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} dataloader_kwargs["batch_sampler"] = batch_sampler + + # Make sure each process has different random seed, especially for 'fork' method + 
dataloader_kwargs["worker_init_fn"] = worker_init_reset_seed + train_loader = DataLoader(self.dataset, **dataloader_kwargs) return train_loader - def get_eval_loader(self, batch_size, is_distributed, testdev=False): + def get_eval_loader(self, batch_size, is_distributed, testdev=False, legacy=False): from yolox.data import VOCDetection, ValTransform valdataset = VOCDetection( data_dir=os.path.join(get_yolox_datadir(), "pedestrian_voc"), image_sets=[('valid')], img_size=self.test_size, - preproc=ValTransform( - rgb_means=(0.485, 0.456, 0.406), - std=(0.229, 0.224, 0.225), - ), + preproc=ValTransform(legacy=legacy), ) if is_distributed: @@ -162,10 +177,10 @@ def get_eval_loader(self, batch_size, is_distributed, testdev=False): return val_loader - def get_evaluator(self, batch_size, is_distributed, testdev=False): + def get_evaluator(self, batch_size, is_distributed, testdev=False, legacy=False): from yolox.evaluators import VOCEvaluator - val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) + val_loader = self.get_eval_loader(batch_size, is_distributed, testdev, legacy) evaluator = VOCEvaluator( dataloader=val_loader, img_size=self.test_size, From 79c33158e364543c6897da51e07e07b8dc583f3a Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sun, 10 Jul 2022 15:05:16 +0800 Subject: [PATCH 46/59] Update yolox_voc_nano_adam.py --- exps/example/yolox_pedestrian/voc_format/yolox_voc_nano_adam.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exps/example/yolox_pedestrian/voc_format/yolox_voc_nano_adam.py b/exps/example/yolox_pedestrian/voc_format/yolox_voc_nano_adam.py index e35d8bd2f..fb5cb1e60 100644 --- a/exps/example/yolox_pedestrian/voc_format/yolox_voc_nano_adam.py +++ b/exps/example/yolox_pedestrian/voc_format/yolox_voc_nano_adam.py @@ -14,7 +14,7 @@ class Exp(MyExp): def __init__(self): super(Exp, self).__init__() - self.num_classes = 20 + self.num_classes = 1 self.depth = 0.33 self.width = 0.25 self.input_size = (416, 416) From 33954ea25eed0d0e4ee15394cb436cf3af442b76 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sun, 10 Jul 2022 15:59:41 +0800 Subject: [PATCH 47/59] Update demo.py --- tools/demo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/demo.py b/tools/demo.py index 0e966900c..1730d82ae 100644 --- a/tools/demo.py +++ b/tools/demo.py @@ -118,7 +118,7 @@ def __init__( self.test_size = exp.test_size self.device = device self.fp16 = fp16 - self.preproc = ValTransform(legacy=legacy) + self.preproc = ValTransform(legacy)#=legacy) if trt_file is not None: from torch2trt import TRTModule @@ -160,7 +160,7 @@ def inference(self, img): outputs = self.decoder(outputs, dtype=outputs.type()) outputs = postprocess( outputs, self.num_classes, self.confthre, - self.nmsthre, class_agnostic=True + self.nmsthre#, class_agnostic=True ) logger.info("Infer time: {:.4f}s".format(time.time() - t0)) return outputs, img_info From f983881681a63f5fbae9ddafdd2f5215762582d4 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sun, 10 Jul 2022 16:22:40 +0800 Subject: [PATCH 48/59] Update requirements.txt --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 7227f09b4..46efe646f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,6 +12,6 @@ tabulate # verified versions # pycocotools corresponds to https://github.com/ppwwyyxx/cocoapi 
pycocotools>=2.0.2 -onnx==1.8.1 -onnxruntime==1.8.0 +onnx>=1.8.1 +onnxruntime>=1.8.0 onnx-simplifier==0.3.5 From 577c62f6572ea54635aa6715e4f667bd2926ae76 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Sun, 10 Jul 2022 19:21:38 +0800 Subject: [PATCH 49/59] Update demo.py --- tools/demo.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/demo.py b/tools/demo.py index 1730d82ae..8588b1e31 100644 --- a/tools/demo.py +++ b/tools/demo.py @@ -118,7 +118,7 @@ def __init__( self.test_size = exp.test_size self.device = device self.fp16 = fp16 - self.preproc = ValTransform(legacy)#=legacy) + self.preproc = ValTransform(legacy=legacy) if trt_file is not None: from torch2trt import TRTModule @@ -160,7 +160,7 @@ def inference(self, img): outputs = self.decoder(outputs, dtype=outputs.type()) outputs = postprocess( outputs, self.num_classes, self.confthre, - self.nmsthre#, class_agnostic=True + self.nmsthre, class_agnostic=True ) logger.info("Infer time: {:.4f}s".format(time.time() - t0)) return outputs, img_info @@ -190,9 +190,11 @@ def visual(self, output, img_info, cls_conf=0.35): cls = output[:, 6] scores = output[:, 4] * output[:, 5] + vis_res = vis(img, bboxes, scores, cls, cls_conf, self.cls_names) return vis_res + def image_demo(predictor, vis_folder, path, current_time, save_result): if os.path.isdir(path): files = get_image_list(path) @@ -243,7 +245,7 @@ def imageflow_demo(predictor, vis_folder, current_time, args): else: cv2.namedWindow("yolox", cv2.WINDOW_NORMAL) cv2.imshow("yolox", result_frame) - ch = cv2.waitKey(1000) + ch = cv2.waitKey(1) if ch == 27 or ch == ord("q") or ch == ord("Q"): break else: From 16eb42a8cfcf1693063588dba4aae7b8391638d3 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Mon, 11 Jul 2022 15:33:15 +0800 Subject: [PATCH 50/59] Update data_augment.py --- yolox/data/data_augment.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/yolox/data/data_augment.py b/yolox/data/data_augment.py index 21cd7b56d..f4da7580b 100644 --- a/yolox/data/data_augment.py +++ b/yolox/data/data_augment.py @@ -157,6 +157,13 @@ def preproc(img, input_size, swap=(2, 0, 1)): padded_img = np.ascontiguousarray(padded_img, dtype=np.float32) return padded_img, r +def sliding_window(image, YstepSize, XstepSize, windowSize): + # slide a window across the image + for y in range(0, image.shape[0], YstepSize): + for x in range(0, image.shape[1], XstepSize): + # yield the current window + yield (x, y, image[y:y + windowSize[1], x:x + windowSize[0]]) + class TrainTransform: def __init__(self, max_labels=50, flip_prob=0.5, hsv_prob=1.0): From 8c9bba339eee2ab76f1683b74cd0709e682e49d0 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Mon, 11 Jul 2022 22:52:30 +0800 Subject: [PATCH 51/59] Update visualize.py --- yolox/utils/visualize.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/yolox/utils/visualize.py b/yolox/utils/visualize.py index 4be7d616c..72eba4d65 100644 --- a/yolox/utils/visualize.py +++ b/yolox/utils/visualize.py @@ -45,15 +45,24 @@ def vis(img, boxes, scores, cls_ids, conf=0.5, class_names=None): class_count[class_names[cls_id]] = class_count[class_names[cls_id]]+1 class_AP[class_names[cls_id]] = class_AP[class_names[cls_id]]+float('{:.1f}'.format(score * 100)) cv2.putText(img, text, (x0, y0 + txt_size[1]), font, 0.4, txt_color, thickness=1) - line = 0 - for k in class_count: - cv2.putText(img, 
str(k)+": "+str(class_count[k]), (15,25+line), font, 0.8, (0, 255, 255), thickness=2) - if class_count[k] !=0: - class_AP[k]=class_AP[k]/class_count[k] - else: - class_AP[k]=0.0 - cv2.putText(img, "AP"+": "+'{:.1f}%'.format(class_AP[k]), (15,50+line), font, 0.8, (0, 255, 255), thickness=2) - line = line+50 + + x0 = 15 + y0 = 0 + row = 0 + for k in class_count: + if((y0+row+50)>=img.shape[0]): + x0 = x0+200 + y0 = 25 + row = 0 + else: + row = row+25 + cv2.putText(img, str(k)+": "+str(class_count[k]), (x0,y0+row), font, 0.8, (0, 255, 255), thickness=2) + if class_count[k] !=0: + class_AP[k]=class_AP[k]/class_count[k] + else: + class_AP[k]=0.0 + row = row+25 + cv2.putText(img, "AP"+": "+'{:.1f}%'.format(class_AP[k]), (x0,y0+row), font, 0.8, (0, 255, 255), thickness=2) return img From a73cef19633d66c7d1a40ff549fce8e749ffe398 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Mon, 11 Jul 2022 23:03:27 +0800 Subject: [PATCH 52/59] Update demo.py --- tools/demo.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/tools/demo.py b/tools/demo.py index 8588b1e31..dc065ca20 100644 --- a/tools/demo.py +++ b/tools/demo.py @@ -12,7 +12,7 @@ import torch from yolox.data.data_augment import ValTransform -from yolox.data.datasets import COCO_CLASSES +from yolox.data.datasets import COCO_CLASSES, VOC_CLASSES from yolox.exp import get_exp from yolox.utils import fuse_model, get_model_info, postprocess, vis @@ -175,11 +175,23 @@ def visual(self, output, img_info, cls_conf=0.35): for i in self.cls_names: class_count[i] = 0 class_AP[i] = 0.0 - line = 0 - for k in class_count: - cv2.putText(img, str(k)+": "+str(class_count[k]), (15,25+line), font, 0.8, (0, 255, 255), thickness=2) - cv2.putText(img, "AP"+": "+'{:.1f}%'.format(class_AP[k]), (15,50+line), font, 0.8, (0, 255, 255), thickness=2) - line = line+50 + x0 = 15 + y0 = 0 + row = 0 + for k in class_count: + if((y0+row+50)>=img.shape[0]): + x0 = x0+200 + y0 = 25 + row = 0 + else: + row = row+25 + cv2.putText(img, str(k)+": "+str(class_count[k]), (x0,y0+row), font, 0.8, (0, 255, 255), thickness=2) + if class_count[k] !=0: + class_AP[k]=class_AP[k]/class_count[k] + else: + class_AP[k]=0.0 + row = row+25 + cv2.putText(img, "AP"+": "+'{:.1f}%'.format(class_AP[k]), (x0,y0+row), font, 0.8, (0, 255, 255), thickness=2) return img output = output.cpu() From fefd852015c53571905778a3645c7f53e7f699f9 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Mon, 11 Jul 2022 23:06:56 +0800 Subject: [PATCH 53/59] Add files via upload --- tools/demo_sliding_window.py | 387 +++++++++++++++++++++++++++++++++++ 1 file changed, 387 insertions(+) create mode 100644 tools/demo_sliding_window.py diff --git a/tools/demo_sliding_window.py b/tools/demo_sliding_window.py new file mode 100644 index 000000000..29846e4aa --- /dev/null +++ b/tools/demo_sliding_window.py @@ -0,0 +1,387 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Megvii, Inc. and its affiliates. 
+ +import argparse +import os +import time +from loguru import logger + +import cv2 + +import torch + +from yolox.data.data_augment import ValTransform, sliding_window +from yolox.data.datasets import COCO_CLASSES, VOC_CLASSES +from yolox.exp import get_exp +from yolox.utils import fuse_model, get_model_info, postprocess, vis + +IMAGE_EXT = [".jpg", ".jpeg", ".webp", ".bmp", ".png"] + + +def make_parser(): + parser = argparse.ArgumentParser("YOLOX Demo!") + parser.add_argument( + "demo", default="image", help="demo type, eg. image, video and webcam" + ) + parser.add_argument("-expn", "--experiment-name", type=str, default=None) + parser.add_argument("-n", "--name", type=str, default=None, help="model name") + + parser.add_argument( + "--path", default="./assets/dog.jpg", help="path to images or video" + ) + parser.add_argument("--camid", type=int, default=0, help="webcam demo camera id") + parser.add_argument( + "--save_result", + action="store_true", + help="whether to save the inference result of image/video", + ) + + # exp file + parser.add_argument( + "-f", + "--exp_file", + default=None, + type=str, + help="please input your experiment description file", + ) + parser.add_argument("-c", "--ckpt", default=None, type=str, help="ckpt for eval") + parser.add_argument( + "--device", + default="cpu", + type=str, + help="device to run our model, can either be cpu or gpu", + ) + parser.add_argument("--conf", default=0.3, type=float, help="test conf") + parser.add_argument("--nms", default=0.3, type=float, help="test nms threshold") + parser.add_argument("--tsize", default=None, type=int, help="test img size") + parser.add_argument( + "--fp16", + dest="fp16", + default=False, + action="store_true", + help="Adopting mix precision evaluating.", + ) + parser.add_argument( + "--legacy", + dest="legacy", + default=False, + action="store_true", + help="To be compatible with older versions", + ) + parser.add_argument( + "--fuse", + dest="fuse", + default=False, + action="store_true", + help="Fuse conv and bn for testing.", + ) + parser.add_argument( + "--trt", + dest="trt", + default=False, + action="store_true", + help="Using TensorRT model for testing.", + ) + return parser + + +def get_image_list(path): + image_names = [] + for maindir, subdir, file_name_list in os.walk(path): + for filename in file_name_list: + apath = os.path.join(maindir, filename) + ext = os.path.splitext(apath)[1] + if ext in IMAGE_EXT: + image_names.append(apath) + return image_names + + +class Predictor(object): + def __init__( + self, + model, + exp, + cls_names=COCO_CLASSES, + trt_file=None, + decoder=None, + device="cpu", + fp16=False, + legacy=False, + ): + self.model = model + self.cls_names = cls_names + self.decoder = decoder + self.num_classes = exp.num_classes + self.confthre = exp.test_conf + self.nmsthre = exp.nmsthre + self.test_size = exp.test_size + self.device = device + self.fp16 = fp16 + self.preproc = ValTransform(legacy=legacy) + if trt_file is not None: + from torch2trt import TRTModule + + model_trt = TRTModule() + model_trt.load_state_dict(torch.load(trt_file)) + + x = torch.ones(1, 3, exp.test_size[0], exp.test_size[1]).cuda() + self.model(x) + self.model = model_trt + + def inference(self, img): + img_info = {"id": 0} + if isinstance(img, str): + img_info["file_name"] = os.path.basename(img) + img = cv2.imread(img) + else: + img_info["file_name"] = None + + height, width = img.shape[:2] + img_info["height"] = height + img_info["width"] = width + img_info["raw_img"] = img + + #ratio = 
min(self.test_size[0] / img.shape[0], self.test_size[1] / img.shape[1]) + #img_info["ratio"] = ratio + (winW, winH) = (exp.test_size[1], exp.test_size[0]) + (imgW, imgH)= (img.shape[1],img.shape[0]) + if (imgH%winH): + y_stepSize = winH-(winH*(imgH//winH+1)-imgH)//(imgH//winH) + if(imgW%winW): + x_stepSize = winW-(winW*(imgW//winW+1)-imgW)//(imgW//winW) + else: + x_stepSize = winW + else: + y_stepSize = winH + if(imgW%winW): + x_stepSize = winW-(winW*(imgW//winW+1)-imgW)//(imgW//winW) + else: + x_stepSize = winW + numW = 0 + for (x, y, window) in sliding_window(img, YstepSize=y_stepSize, XstepSize=x_stepSize, windowSize=(winW, winH)): + # if the window does not meet our desired window size, ignore it + if window.shape[0] != winH or window.shape[1] != winW: + continue + + Wimg, _ = self.preproc(window, None, self.test_size) + Wimg = torch.from_numpy(Wimg).unsqueeze(0) + Wimg = Wimg.float() + if self.device == "gpu": + Wimg = Wimg.cuda() + if self.fp16: + Wimg = Wimg.half() # to FP16 + + with torch.no_grad(): + t0 = time.time() + Woutputs = self.model(Wimg) + if numW != 0: + Woutputs[:, :, 0] = torch.add(Woutputs[:, :,0], x) + Woutputs[:, :, 1] = torch.add(Woutputs[:, :,1], y) + outputs = torch.cat((outputs, Woutputs), 1) + else: + outputs = Woutputs + numW=numW+1 + + if self.decoder is not None: + outputs = self.decoder(outputs, dtype=outputs.type()) + outputs = postprocess( + outputs, self.num_classes, self.confthre, + self.nmsthre, class_agnostic=True + ) + + if outputs[0] is None: + pass + elif len(outputs[0]) == 2: + li_outputs = [] + temp = torch.empty(1, 7) + temp[0][0] = torch.min(outputs[0][0, 0], outputs[0][1, 0]) + temp[0][1] = torch.min(outputs[0][0, 1], outputs[0][1, 1]) + temp[0][2] = torch.max(outputs[0][0, 2], outputs[0][1, 2]) + temp[0][3] = torch.max(outputs[0][0, 3], outputs[0][1, 3]) + temp[0][4] = torch.add(outputs[0][0, 4], outputs[0][1, 4]) / 2 + temp[0][5] = torch.add(outputs[0][0, 5], outputs[0][1, 5]) / 2 + temp[0][6] = torch.add(outputs[0][0, 6], outputs[0][1, 6]) / 2 + li_outputs.append(temp) + outputs = li_outputs + + + logger.info("Infer time: {:.4f}s".format(time.time() - t0)) + return outputs, img_info + + def visual(self, output, img_info, cls_conf=0.35): + #ratio = img_info["ratio"] + img = img_info["raw_img"] + if output is None: + font = cv2.FONT_HERSHEY_SIMPLEX + class_count = {} + class_AP = {} + for i in self.cls_names: + class_count[i] = 0 + class_AP[i] = 0.0 + x0 = 15 + y0 = 0 + row = 0 + for k in class_count: + if((y0+row+50)>=img.shape[0]): + x0 = x0+200 + y0 = 25 + row = 0 + else: + row = row+25 + cv2.putText(img, str(k)+": "+str(class_count[k]), (x0,y0+row), font, 0.8, (0, 255, 255), thickness=2) + if class_count[k] !=0: + class_AP[k]=class_AP[k]/class_count[k] + else: + class_AP[k]=0.0 + row = row+25 + cv2.putText(img, "AP"+": "+'{:.1f}%'.format(class_AP[k]), (x0,y0+row), font, 0.8, (0, 255, 255), thickness=2) + return img + output = output.cpu() + + bboxes = output[:, 0:4] + + # preprocessing: resize + #bboxes /= ratio + + cls = output[:, 6] + scores = output[:, 4] * output[:, 5] + + vis_res = vis(img, bboxes, scores, cls, cls_conf, self.cls_names) + return vis_res + + +def image_demo(predictor, vis_folder, path, current_time, save_result): + if os.path.isdir(path): + files = get_image_list(path) + else: + files = [path] + files.sort() + for image_name in files: + outputs, img_info = predictor.inference(image_name) + result_image = predictor.visual(outputs[0], img_info, predictor.confthre) + if save_result: + save_folder = os.path.join( + 
vis_folder, time.strftime("%Y_%m_%d_%H_%M_%S", current_time) + ) + os.makedirs(save_folder, exist_ok=True) + save_file_name = os.path.join(save_folder, os.path.basename(image_name)) + logger.info("Saving detection result in {}".format(save_file_name)) + cv2.imwrite(save_file_name, result_image) + ch = cv2.waitKey(0) + if ch == 27 or ch == ord("q") or ch == ord("Q"): + break + + +def imageflow_demo(predictor, vis_folder, current_time, args): + cap = cv2.VideoCapture(args.path if args.demo == "video" else args.camid) + width = cap.get(cv2.CAP_PROP_FRAME_WIDTH) # float + height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT) # float + fps = cap.get(cv2.CAP_PROP_FPS) + if args.save_result: + save_folder = os.path.join( + vis_folder, time.strftime("%Y_%m_%d_%H_%M_%S", current_time) + ) + os.makedirs(save_folder, exist_ok=True) + if args.demo == "video": + save_path = os.path.join(save_folder, os.path.basename(args.path)) + else: + save_path = os.path.join(save_folder, "camera.mp4") + logger.info(f"video save_path is {save_path}") + vid_writer = cv2.VideoWriter( + save_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (int(width), int(height)) + ) + while True: + ret_val, frame = cap.read() + if ret_val: + outputs, img_info = predictor.inference(frame) + result_frame = predictor.visual(outputs[0], img_info, predictor.confthre) + if args.save_result: + vid_writer.write(result_frame) + else: + cv2.namedWindow("yolox", cv2.WINDOW_NORMAL) + cv2.imshow("yolox", result_frame) + ch = cv2.waitKey(1) + if ch == 27 or ch == ord("q") or ch == ord("Q"): + break + else: + break + + +def main(exp, args): + if not args.experiment_name: + args.experiment_name = exp.exp_name + + file_name = os.path.join(exp.output_dir, args.experiment_name) + os.makedirs(file_name, exist_ok=True) + + vis_folder = None + if args.save_result: + vis_folder = os.path.join(file_name, "vis_res") + os.makedirs(vis_folder, exist_ok=True) + + if args.trt: + args.device = "gpu" + + logger.info("Args: {}".format(args)) + + if args.conf is not None: + exp.test_conf = args.conf + if args.nms is not None: + exp.nmsthre = args.nms + if args.tsize is not None: + exp.test_size = (args.tsize, args.tsize) + + model = exp.get_model() + logger.info("Model Summary: {}".format(get_model_info(model, exp.test_size))) + + if args.device == "gpu": + model.cuda() + if args.fp16: + model.half() # to FP16 + model.eval() + + if not args.trt: + if args.ckpt is None: + ckpt_file = os.path.join(file_name, "best_ckpt.pth") + else: + ckpt_file = args.ckpt + logger.info("loading checkpoint") + ckpt = torch.load(ckpt_file, map_location="cpu") + # load the model state dict + model.load_state_dict(ckpt["model"]) + logger.info("loaded checkpoint done.") + + if args.fuse: + logger.info("\tFusing model...") + model = fuse_model(model) + + if args.trt: + assert not args.fuse, "TensorRT model is not support model fusing!" + trt_file = os.path.join(file_name, "model_trt.pth") + assert os.path.exists( + trt_file + ), "TensorRT model is not found!\n Run python3 tools/trt.py first!" 
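# --- Editorial sketch (assumption, not part of the patch): the new script mirrors
# --- tools/demo.py, so a typical invocation would look something like the command
# --- below; the exp file, checkpoint path and thresholds are illustrative only.
#
#   python tools/demo_sliding_window.py image \
#       -f exps/example/yolox_pedestrian/voc_format/yolox_voc_nano.py \
#       -c YOLOX_outputs/yolox_voc_nano/best_ckpt.pth \
#       --path assets/dog.jpg --conf 0.3 --nms 0.45 --tsize 416 --save_result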
+ model.head.decode_in_inference = False + decoder = model.head.decode_outputs + logger.info("Using TensorRT to inference") + else: + trt_file = None + decoder = None + + predictor = Predictor( + model, exp, COCO_CLASSES, trt_file, decoder, + args.device, args.fp16, args.legacy, + ) + current_time = time.localtime() + if args.demo == "image": + image_demo(predictor, vis_folder, args.path, current_time, args.save_result) + elif args.demo == "video" or args.demo == "webcam": + imageflow_demo(predictor, vis_folder, current_time, args) + + +if __name__ == "__main__": + args = make_parser().parse_args() + exp = get_exp(args.exp_file, args.name) + + main(exp, args) \ No newline at end of file From a3532ca79589e24f048d6d31ebde909f77a555cc Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Tue, 12 Jul 2022 08:08:20 +0800 Subject: [PATCH 54/59] Update visualize.py --- yolox/utils/visualize.py | 540 ++++++++++++++++++++++++++++----------- 1 file changed, 396 insertions(+), 144 deletions(-) diff --git a/yolox/utils/visualize.py b/yolox/utils/visualize.py index 72eba4d65..e733a5dec 100644 --- a/yolox/utils/visualize.py +++ b/yolox/utils/visualize.py @@ -1,152 +1,404 @@ #!/usr/bin/env python3 # -*- coding:utf-8 -*- -# Copyright (c) Megvii Inc. All rights reserved. +# Copyright (c) Megvii, Inc. and its affiliates. + +import argparse +import os +import time +from loguru import logger import cv2 -import numpy as np - -__all__ = ["vis"] - - -def vis(img, boxes, scores, cls_ids, conf=0.5, class_names=None): - class_count = {} - class_AP = {} - for j in class_names: - class_count[j] = 0 - class_AP[j] = 0 - - for i in range(len(boxes)): - box = boxes[i] - cls_id = int(cls_ids[i]) - score = scores[i] - if score < conf: - continue - x0 = int(box[0]) - y0 = int(box[1]) - x1 = int(box[2]) - y1 = int(box[3]) - - color = (_COLORS[cls_id] * 255).astype(np.uint8).tolist() - text = '{:.1f}%'.format(score * 100)#'{}:{:.1f}%'.format(class_names[cls_id], score * 100) - txt_color = (0, 0, 0) if np.mean(_COLORS[cls_id]) > 0.5 else (255, 255, 255) - font = cv2.FONT_HERSHEY_SIMPLEX - - txt_size = cv2.getTextSize(text, font, 0.4, 1)[0] - cv2.rectangle(img, (x0, y0), (x1, y1), color, 2) - - txt_bk_color = (_COLORS[cls_id] * 255 * 0.7).astype(np.uint8).tolist() - cv2.rectangle( - img, - (x0, y0 + 1), - (x0 + txt_size[0] + 1, y0 + int(1.5*txt_size[1])), - txt_bk_color, - -1 + +import torch + +from yolox.data.data_augment import ValTransform, sliding_window +from yolox.data.datasets import COCO_CLASSES,VOC_CLASSES +from yolox.exp import get_exp +from yolox.utils import fuse_model, get_model_info, postprocess, vis + +IMAGE_EXT = [".jpg", ".jpeg", ".webp", ".bmp", ".png"] + + +def make_parser(): + parser = argparse.ArgumentParser("YOLOX Demo!") + parser.add_argument( + "demo", default="image", help="demo type, eg. 
image, video and webcam" + ) + parser.add_argument("-expn", "--experiment-name", type=str, default=None) + parser.add_argument("-n", "--name", type=str, default=None, help="model name") + + parser.add_argument( + "--path", default="./assets/dog.jpg", help="path to images or video" + ) + parser.add_argument("--camid", type=int, default=0, help="webcam demo camera id") + parser.add_argument( + "--save_result", + action="store_true", + help="whether to save the inference result of image/video", + ) + + # exp file + parser.add_argument( + "-f", + "--exp_file", + default=None, + type=str, + help="please input your experiment description file", + ) + parser.add_argument("-c", "--ckpt", default=None, type=str, help="ckpt for eval") + parser.add_argument( + "--device", + default="cpu", + type=str, + help="device to run our model, can either be cpu or gpu", + ) + parser.add_argument("--conf", default=0.3, type=float, help="test conf") + parser.add_argument("--nms", default=0.3, type=float, help="test nms threshold") + parser.add_argument("--tsize", default=None, type=int, help="test img size") + parser.add_argument( + "--fp16", + dest="fp16", + default=False, + action="store_true", + help="Adopting mix precision evaluating.", + ) + parser.add_argument( + "--legacy", + dest="legacy", + default=False, + action="store_true", + help="To be compatible with older versions", + ) + parser.add_argument( + "--fuse", + dest="fuse", + default=False, + action="store_true", + help="Fuse conv and bn for testing.", + ) + parser.add_argument( + "--trt", + dest="trt", + default=False, + action="store_true", + help="Using TensorRT model for testing.", + ) + return parser + + +def get_image_list(path): + image_names = [] + for maindir, subdir, file_name_list in os.walk(path): + for filename in file_name_list: + apath = os.path.join(maindir, filename) + ext = os.path.splitext(apath)[1] + if ext in IMAGE_EXT: + image_names.append(apath) + return image_names + + +class Predictor(object): + def __init__( + self, + model, + exp, + cls_names=COCO_CLASSES, + trt_file=None, + decoder=None, + device="cpu", + fp16=False, + legacy=False, + ): + self.model = model + self.cls_names = cls_names + self.decoder = decoder + self.num_classes = exp.num_classes + self.confthre = exp.test_conf + self.nmsthre = exp.nmsthre + self.test_size = exp.test_size + self.device = device + self.fp16 = fp16 + self.preproc = ValTransform(legacy=legacy) + if trt_file is not None: + from torch2trt import TRTModule + + model_trt = TRTModule() + model_trt.load_state_dict(torch.load(trt_file)) + + x = torch.ones(1, 3, exp.test_size[0], exp.test_size[1]).cuda() + self.model(x) + self.model = model_trt + + def inference(self, img): + img_info = {"id": 0} + if isinstance(img, str): + img_info["file_name"] = os.path.basename(img) + img = cv2.imread(img) + else: + img_info["file_name"] = None + + height, width = img.shape[:2] + img_info["height"] = height + img_info["width"] = width + img_info["raw_img"] = img + + #ratio = min(self.test_size[0] / img.shape[0], self.test_size[1] / img.shape[1]) + #img_info["ratio"] = ratio + # initial + (imgW, imgH)= (img.shape[1],img.shape[0]) + (winW, winH) = (exp.test_size[1], exp.test_size[0]) + # deciding window size + if (imgH=img.shape[0]): - x0 = x0+200 - y0 = 25 + logger.info(outputs) + """ + # + if outputs[0] is None: + pass + elif len(outputs[0]) == 2: + li_outputs = [] + temp = torch.empty(1, 7) + temp[0][0] = torch.min(outputs[0][0, 0], outputs[0][1, 0]) + temp[0][1] = torch.min(outputs[0][0, 1], outputs[0][1, 1]) + 
temp[0][2] = torch.max(outputs[0][0, 2], outputs[0][1, 2]) + temp[0][3] = torch.max(outputs[0][0, 3], outputs[0][1, 3]) + temp[0][4] = torch.add(outputs[0][0, 4], outputs[0][1, 4]) / 2 + temp[0][5] = torch.add(outputs[0][0, 5], outputs[0][1, 5]) / 2 + temp[0][6] = torch.add(outputs[0][0, 6], outputs[0][1, 6]) / 2 + li_outputs.append(temp) + outputs = li_outputs + """ + + logger.info("Infer time: {:.4f}s".format(time.time() - t0)) + return outputs, img_info + + def visual(self, output, img_info, cls_conf=0.35): + #ratio = img_info["ratio"] + img = img_info["raw_img"] + if output is None: + font = cv2.FONT_HERSHEY_SIMPLEX + class_count = {} + class_AP = {} + for i in self.cls_names: + class_count[i] = 0 + class_AP[i] = 0.0 + x0 = 15 + y0 = 0 row = 0 + for k in class_count: + if((y0+row+50)>=img.shape[0]): + x0 = x0+200 + y0 = 25 + row = 0 + else: + row = row+25 + cv2.putText(img, str(k)+": "+str(class_count[k]), (x0,y0+row), font, 0.8, (0, 255, 255), thickness=2) + if class_count[k] !=0: + class_AP[k]=class_AP[k]/class_count[k] + else: + class_AP[k]=0.0 + row = row+25 + cv2.putText(img, "AP"+": "+'{:.1f}%'.format(class_AP[k]), (x0,y0+row), font, 0.8, (0, 255, 255), thickness=2) + return img + output = output.cpu() + + bboxes = output[:, 0:4] + + # preprocessing: resize + #bboxes /= ratio + + cls = output[:, 6] + scores = output[:, 4] * output[:, 5] + + vis_res = vis(img, bboxes, scores, cls, cls_conf, self.cls_names) + return vis_res + + +def image_demo(predictor, vis_folder, path, current_time, save_result): + if os.path.isdir(path): + files = get_image_list(path) + else: + files = [path] + files.sort() + for image_name in files: + outputs, img_info = predictor.inference(image_name) + result_image = predictor.visual(outputs[0], img_info, predictor.confthre) + if save_result: + save_folder = os.path.join( + vis_folder, time.strftime("%Y_%m_%d_%H_%M_%S", current_time) + ) + os.makedirs(save_folder, exist_ok=True) + save_file_name = os.path.join(save_folder, os.path.basename(image_name)) + logger.info("Saving detection result in {}".format(save_file_name)) + cv2.imwrite(save_file_name, result_image) + ch = cv2.waitKey(0) + if ch == 27 or ch == ord("q") or ch == ord("Q"): + break + + +def imageflow_demo(predictor, vis_folder, current_time, args): + cap = cv2.VideoCapture(args.path if args.demo == "video" else args.camid) + width = cap.get(cv2.CAP_PROP_FRAME_WIDTH) # float + height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT) # float + fps = cap.get(cv2.CAP_PROP_FPS) + if args.save_result: + save_folder = os.path.join( + vis_folder, time.strftime("%Y_%m_%d_%H_%M_%S", current_time) + ) + os.makedirs(save_folder, exist_ok=True) + if args.demo == "video": + save_path = os.path.join(save_folder, os.path.basename(args.path)) + else: + save_path = os.path.join(save_folder, "camera.mp4") + logger.info(f"video save_path is {save_path}") + vid_writer = cv2.VideoWriter( + save_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (int(width), int(height)) + ) + while True: + ret_val, frame = cap.read() + if ret_val: + outputs, img_info = predictor.inference(frame) + result_frame = predictor.visual(outputs[0], img_info, predictor.confthre) + if args.save_result: + vid_writer.write(result_frame) + else: + cv2.namedWindow("yolox", cv2.WINDOW_NORMAL) + cv2.imshow("yolox", result_frame) + ch = cv2.waitKey(1) + if ch == 27 or ch == ord("q") or ch == ord("Q"): + break else: - row = row+25 - cv2.putText(img, str(k)+": "+str(class_count[k]), (x0,y0+row), font, 0.8, (0, 255, 255), thickness=2) - if class_count[k] !=0: - 
class_AP[k]=class_AP[k]/class_count[k] + break + + +def main(exp, args): + if not args.experiment_name: + args.experiment_name = exp.exp_name + + file_name = os.path.join(exp.output_dir, args.experiment_name) + os.makedirs(file_name, exist_ok=True) + + vis_folder = None + if args.save_result: + vis_folder = os.path.join(file_name, "vis_res") + os.makedirs(vis_folder, exist_ok=True) + + if args.trt: + args.device = "gpu" + + logger.info("Args: {}".format(args)) + + if args.conf is not None: + exp.test_conf = args.conf + if args.nms is not None: + exp.nmsthre = args.nms + if args.tsize is not None: + exp.test_size = (args.tsize, args.tsize) + + model = exp.get_model() + logger.info("Model Summary: {}".format(get_model_info(model, exp.test_size))) + + if args.device == "gpu": + model.cuda() + if args.fp16: + model.half() # to FP16 + model.eval() + + if not args.trt: + if args.ckpt is None: + ckpt_file = os.path.join(file_name, "best_ckpt.pth") else: - class_AP[k]=0.0 - row = row+25 - cv2.putText(img, "AP"+": "+'{:.1f}%'.format(class_AP[k]), (x0,y0+row), font, 0.8, (0, 255, 255), thickness=2) - return img - - -_COLORS = np.array( - [ - 0.000, 0.447, 0.741, - 0.850, 0.325, 0.098, - 0.929, 0.694, 0.125, - 0.494, 0.184, 0.556, - 0.466, 0.674, 0.188, - 0.301, 0.745, 0.933, - 0.635, 0.078, 0.184, - 0.300, 0.300, 0.300, - 0.600, 0.600, 0.600, - 1.000, 0.000, 0.000, - 1.000, 0.500, 0.000, - 0.749, 0.749, 0.000, - 0.000, 1.000, 0.000, - 0.000, 0.000, 1.000, - 0.667, 0.000, 1.000, - 0.333, 0.333, 0.000, - 0.333, 0.667, 0.000, - 0.333, 1.000, 0.000, - 0.667, 0.333, 0.000, - 0.667, 0.667, 0.000, - 0.667, 1.000, 0.000, - 1.000, 0.333, 0.000, - 1.000, 0.667, 0.000, - 1.000, 1.000, 0.000, - 0.000, 0.333, 0.500, - 0.000, 0.667, 0.500, - 0.000, 1.000, 0.500, - 0.333, 0.000, 0.500, - 0.333, 0.333, 0.500, - 0.333, 0.667, 0.500, - 0.333, 1.000, 0.500, - 0.667, 0.000, 0.500, - 0.667, 0.333, 0.500, - 0.667, 0.667, 0.500, - 0.667, 1.000, 0.500, - 1.000, 0.000, 0.500, - 1.000, 0.333, 0.500, - 1.000, 0.667, 0.500, - 1.000, 1.000, 0.500, - 0.000, 0.333, 1.000, - 0.000, 0.667, 1.000, - 0.000, 1.000, 1.000, - 0.333, 0.000, 1.000, - 0.333, 0.333, 1.000, - 0.333, 0.667, 1.000, - 0.333, 1.000, 1.000, - 0.667, 0.000, 1.000, - 0.667, 0.333, 1.000, - 0.667, 0.667, 1.000, - 0.667, 1.000, 1.000, - 1.000, 0.000, 1.000, - 1.000, 0.333, 1.000, - 1.000, 0.667, 1.000, - 0.333, 0.000, 0.000, - 0.500, 0.000, 0.000, - 0.667, 0.000, 0.000, - 0.833, 0.000, 0.000, - 1.000, 0.000, 0.000, - 0.000, 0.167, 0.000, - 0.000, 0.333, 0.000, - 0.000, 0.500, 0.000, - 0.000, 0.667, 0.000, - 0.000, 0.833, 0.000, - 0.000, 1.000, 0.000, - 0.000, 0.000, 0.167, - 0.000, 0.000, 0.333, - 0.000, 0.000, 0.500, - 0.000, 0.000, 0.667, - 0.000, 0.000, 0.833, - 0.000, 0.000, 1.000, - 0.000, 0.000, 0.000, - 0.143, 0.143, 0.143, - 0.286, 0.286, 0.286, - 0.429, 0.429, 0.429, - 0.571, 0.571, 0.571, - 0.714, 0.714, 0.714, - 0.857, 0.857, 0.857, - 0.000, 0.447, 0.741, - 0.314, 0.717, 0.741, - 0.50, 0.5, 0 - ] -).astype(np.float32).reshape(-1, 3) + ckpt_file = args.ckpt + logger.info("loading checkpoint") + ckpt = torch.load(ckpt_file, map_location="cpu") + # load the model state dict + model.load_state_dict(ckpt["model"]) + logger.info("loaded checkpoint done.") + + if args.fuse: + logger.info("\tFusing model...") + model = fuse_model(model) + + if args.trt: + assert not args.fuse, "TensorRT model is not support model fusing!" 
+ trt_file = os.path.join(file_name, "model_trt.pth") + assert os.path.exists( + trt_file + ), "TensorRT model is not found!\n Run python3 tools/trt.py first!" + model.head.decode_in_inference = False + decoder = model.head.decode_outputs + logger.info("Using TensorRT to inference") + else: + trt_file = None + decoder = None + + predictor = Predictor( + model, exp, COCO_CLASSES, trt_file, decoder, + args.device, args.fp16, args.legacy, + ) + current_time = time.localtime() + if args.demo == "image": + image_demo(predictor, vis_folder, args.path, current_time, args.save_result) + elif args.demo == "video" or args.demo == "webcam": + imageflow_demo(predictor, vis_folder, current_time, args) + + +if __name__ == "__main__": + args = make_parser().parse_args() + exp = get_exp(args.exp_file, args.name) + + main(exp, args) From e261097aa533dbf0c1959ac24c7923c2be6a5127 Mon Sep 17 00:00:00 2001 From: lujulia <39236354+lujulia@users.noreply.github.com> Date: Fri, 15 Jul 2022 17:23:53 +0800 Subject: [PATCH 55/59] Update visualize.py --- yolox/utils/visualize.py | 541 +++++++++++---------------------------- 1 file changed, 145 insertions(+), 396 deletions(-) diff --git a/yolox/utils/visualize.py b/yolox/utils/visualize.py index e733a5dec..16aa9dee5 100644 --- a/yolox/utils/visualize.py +++ b/yolox/utils/visualize.py @@ -1,404 +1,153 @@ #!/usr/bin/env python3 # -*- coding:utf-8 -*- -# Copyright (c) Megvii, Inc. and its affiliates. - -import argparse -import os -import time -from loguru import logger +# Copyright (c) Megvii Inc. All rights reserved. import cv2 - -import torch - -from yolox.data.data_augment import ValTransform, sliding_window -from yolox.data.datasets import COCO_CLASSES,VOC_CLASSES -from yolox.exp import get_exp -from yolox.utils import fuse_model, get_model_info, postprocess, vis - -IMAGE_EXT = [".jpg", ".jpeg", ".webp", ".bmp", ".png"] - - -def make_parser(): - parser = argparse.ArgumentParser("YOLOX Demo!") - parser.add_argument( - "demo", default="image", help="demo type, eg. 
image, video and webcam" - ) - parser.add_argument("-expn", "--experiment-name", type=str, default=None) - parser.add_argument("-n", "--name", type=str, default=None, help="model name") - - parser.add_argument( - "--path", default="./assets/dog.jpg", help="path to images or video" - ) - parser.add_argument("--camid", type=int, default=0, help="webcam demo camera id") - parser.add_argument( - "--save_result", - action="store_true", - help="whether to save the inference result of image/video", - ) - - # exp file - parser.add_argument( - "-f", - "--exp_file", - default=None, - type=str, - help="please input your experiment description file", - ) - parser.add_argument("-c", "--ckpt", default=None, type=str, help="ckpt for eval") - parser.add_argument( - "--device", - default="cpu", - type=str, - help="device to run our model, can either be cpu or gpu", - ) - parser.add_argument("--conf", default=0.3, type=float, help="test conf") - parser.add_argument("--nms", default=0.3, type=float, help="test nms threshold") - parser.add_argument("--tsize", default=None, type=int, help="test img size") - parser.add_argument( - "--fp16", - dest="fp16", - default=False, - action="store_true", - help="Adopting mix precision evaluating.", - ) - parser.add_argument( - "--legacy", - dest="legacy", - default=False, - action="store_true", - help="To be compatible with older versions", - ) - parser.add_argument( - "--fuse", - dest="fuse", - default=False, - action="store_true", - help="Fuse conv and bn for testing.", - ) - parser.add_argument( - "--trt", - dest="trt", - default=False, - action="store_true", - help="Using TensorRT model for testing.", - ) - return parser - - -def get_image_list(path): - image_names = [] - for maindir, subdir, file_name_list in os.walk(path): - for filename in file_name_list: - apath = os.path.join(maindir, filename) - ext = os.path.splitext(apath)[1] - if ext in IMAGE_EXT: - image_names.append(apath) - return image_names - - -class Predictor(object): - def __init__( - self, - model, - exp, - cls_names=COCO_CLASSES, - trt_file=None, - decoder=None, - device="cpu", - fp16=False, - legacy=False, - ): - self.model = model - self.cls_names = cls_names - self.decoder = decoder - self.num_classes = exp.num_classes - self.confthre = exp.test_conf - self.nmsthre = exp.nmsthre - self.test_size = exp.test_size - self.device = device - self.fp16 = fp16 - self.preproc = ValTransform(legacy=legacy) - if trt_file is not None: - from torch2trt import TRTModule - - model_trt = TRTModule() - model_trt.load_state_dict(torch.load(trt_file)) - - x = torch.ones(1, 3, exp.test_size[0], exp.test_size[1]).cuda() - self.model(x) - self.model = model_trt - - def inference(self, img): - img_info = {"id": 0} - if isinstance(img, str): - img_info["file_name"] = os.path.basename(img) - img = cv2.imread(img) - else: - img_info["file_name"] = None - - height, width = img.shape[:2] - img_info["height"] = height - img_info["width"] = width - img_info["raw_img"] = img - - #ratio = min(self.test_size[0] / img.shape[0], self.test_size[1] / img.shape[1]) - #img_info["ratio"] = ratio - # initial - (imgW, imgH)= (img.shape[1],img.shape[0]) - (winW, winH) = (exp.test_size[1], exp.test_size[0]) - # deciding window size - if (imgH 0.5 else (255, 255, 255) + font = cv2.FONT_HERSHEY_SIMPLEX + + txt_size = cv2.getTextSize(text, font, 0.4, 1)[0] + cv2.rectangle(img, (x0, y0), (x1, y1), color, 2) + + txt_bk_color = (_COLORS[cls_id] * 255 * 0.7).astype(np.uint8).tolist() + cv2.rectangle( + img, + (x0, y0 + 1), + (x0 + txt_size[0] + 
1, y0 + int(1.5*txt_size[1])), + txt_bk_color, + -1 ) - logger.info(outputs) - """ - # - if outputs[0] is None: - pass - elif len(outputs[0]) == 2: - li_outputs = [] - temp = torch.empty(1, 7) - temp[0][0] = torch.min(outputs[0][0, 0], outputs[0][1, 0]) - temp[0][1] = torch.min(outputs[0][0, 1], outputs[0][1, 1]) - temp[0][2] = torch.max(outputs[0][0, 2], outputs[0][1, 2]) - temp[0][3] = torch.max(outputs[0][0, 3], outputs[0][1, 3]) - temp[0][4] = torch.add(outputs[0][0, 4], outputs[0][1, 4]) / 2 - temp[0][5] = torch.add(outputs[0][0, 5], outputs[0][1, 5]) / 2 - temp[0][6] = torch.add(outputs[0][0, 6], outputs[0][1, 6]) / 2 - li_outputs.append(temp) - outputs = li_outputs - """ - - logger.info("Infer time: {:.4f}s".format(time.time() - t0)) - return outputs, img_info - - def visual(self, output, img_info, cls_conf=0.35): - #ratio = img_info["ratio"] - img = img_info["raw_img"] - if output is None: - font = cv2.FONT_HERSHEY_SIMPLEX - class_count = {} - class_AP = {} - for i in self.cls_names: - class_count[i] = 0 - class_AP[i] = 0.0 - x0 = 15 - y0 = 0 + class_count[class_names[cls_id]] = class_count[class_names[cls_id]]+1 + class_AP[class_names[cls_id]] = class_AP[class_names[cls_id]]+float('{:.1f}'.format(score * 100)) + cv2.putText(img, text, (x0, y0 + txt_size[1]), font, 0.4, txt_color, thickness=1) + + x0 = 15 + y0 = 0 + row = 0 + for k in class_count: + if((y0+row+50)>=img.shape[0]): + x0 = x0+200 + y0 = 25 row = 0 - for k in class_count: - if((y0+row+50)>=img.shape[0]): - x0 = x0+200 - y0 = 25 - row = 0 - else: - row = row+25 - cv2.putText(img, str(k)+": "+str(class_count[k]), (x0,y0+row), font, 0.8, (0, 255, 255), thickness=2) - if class_count[k] !=0: - class_AP[k]=class_AP[k]/class_count[k] - else: - class_AP[k]=0.0 - row = row+25 - cv2.putText(img, "AP"+": "+'{:.1f}%'.format(class_AP[k]), (x0,y0+row), font, 0.8, (0, 255, 255), thickness=2) - return img - output = output.cpu() - - bboxes = output[:, 0:4] - - # preprocessing: resize - #bboxes /= ratio - - cls = output[:, 6] - scores = output[:, 4] * output[:, 5] - - vis_res = vis(img, bboxes, scores, cls, cls_conf, self.cls_names) - return vis_res - - -def image_demo(predictor, vis_folder, path, current_time, save_result): - if os.path.isdir(path): - files = get_image_list(path) - else: - files = [path] - files.sort() - for image_name in files: - outputs, img_info = predictor.inference(image_name) - result_image = predictor.visual(outputs[0], img_info, predictor.confthre) - if save_result: - save_folder = os.path.join( - vis_folder, time.strftime("%Y_%m_%d_%H_%M_%S", current_time) - ) - os.makedirs(save_folder, exist_ok=True) - save_file_name = os.path.join(save_folder, os.path.basename(image_name)) - logger.info("Saving detection result in {}".format(save_file_name)) - cv2.imwrite(save_file_name, result_image) - ch = cv2.waitKey(0) - if ch == 27 or ch == ord("q") or ch == ord("Q"): - break - - -def imageflow_demo(predictor, vis_folder, current_time, args): - cap = cv2.VideoCapture(args.path if args.demo == "video" else args.camid) - width = cap.get(cv2.CAP_PROP_FRAME_WIDTH) # float - height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT) # float - fps = cap.get(cv2.CAP_PROP_FPS) - if args.save_result: - save_folder = os.path.join( - vis_folder, time.strftime("%Y_%m_%d_%H_%M_%S", current_time) - ) - os.makedirs(save_folder, exist_ok=True) - if args.demo == "video": - save_path = os.path.join(save_folder, os.path.basename(args.path)) - else: - save_path = os.path.join(save_folder, "camera.mp4") - logger.info(f"video save_path is {save_path}") - 
vid_writer = cv2.VideoWriter( - save_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (int(width), int(height)) - ) - while True: - ret_val, frame = cap.read() - if ret_val: - outputs, img_info = predictor.inference(frame) - result_frame = predictor.visual(outputs[0], img_info, predictor.confthre) - if args.save_result: - vid_writer.write(result_frame) - else: - cv2.namedWindow("yolox", cv2.WINDOW_NORMAL) - cv2.imshow("yolox", result_frame) - ch = cv2.waitKey(1) - if ch == 27 or ch == ord("q") or ch == ord("Q"): - break else: - break - - -def main(exp, args): - if not args.experiment_name: - args.experiment_name = exp.exp_name - - file_name = os.path.join(exp.output_dir, args.experiment_name) - os.makedirs(file_name, exist_ok=True) - - vis_folder = None - if args.save_result: - vis_folder = os.path.join(file_name, "vis_res") - os.makedirs(vis_folder, exist_ok=True) - - if args.trt: - args.device = "gpu" - - logger.info("Args: {}".format(args)) - - if args.conf is not None: - exp.test_conf = args.conf - if args.nms is not None: - exp.nmsthre = args.nms - if args.tsize is not None: - exp.test_size = (args.tsize, args.tsize) - - model = exp.get_model() - logger.info("Model Summary: {}".format(get_model_info(model, exp.test_size))) - - if args.device == "gpu": - model.cuda() - if args.fp16: - model.half() # to FP16 - model.eval() - - if not args.trt: - if args.ckpt is None: - ckpt_file = os.path.join(file_name, "best_ckpt.pth") + row = row+25 + cv2.putText(img, str(k)+": "+str(class_count[k]), (x0,y0+row), font, 0.8, (0, 255, 255), thickness=2) + if class_count[k] !=0: + class_AP[k]=class_AP[k]/class_count[k] else: - ckpt_file = args.ckpt - logger.info("loading checkpoint") - ckpt = torch.load(ckpt_file, map_location="cpu") - # load the model state dict - model.load_state_dict(ckpt["model"]) - logger.info("loaded checkpoint done.") - - if args.fuse: - logger.info("\tFusing model...") - model = fuse_model(model) - - if args.trt: - assert not args.fuse, "TensorRT model is not support model fusing!" - trt_file = os.path.join(file_name, "model_trt.pth") - assert os.path.exists( - trt_file - ), "TensorRT model is not found!\n Run python3 tools/trt.py first!" 
- model.head.decode_in_inference = False - decoder = model.head.decode_outputs - logger.info("Using TensorRT to inference") - else: - trt_file = None - decoder = None - - predictor = Predictor( - model, exp, COCO_CLASSES, trt_file, decoder, - args.device, args.fp16, args.legacy, - ) - current_time = time.localtime() - if args.demo == "image": - image_demo(predictor, vis_folder, args.path, current_time, args.save_result) - elif args.demo == "video" or args.demo == "webcam": - imageflow_demo(predictor, vis_folder, current_time, args) - - -if __name__ == "__main__": - args = make_parser().parse_args() - exp = get_exp(args.exp_file, args.name) - - main(exp, args) + class_AP[k]=0.0 + row = row+25 + cv2.putText(img, "AP"+": "+'{:.1f}%'.format(class_AP[k]), (x0,y0+row), font, 0.8, (0, 255, 255), thickness=2) + + return img + + +_COLORS = np.array( + [ + 0.000, 0.447, 0.741, + 0.850, 0.325, 0.098, + 0.929, 0.694, 0.125, + 0.494, 0.184, 0.556, + 0.466, 0.674, 0.188, + 0.301, 0.745, 0.933, + 0.635, 0.078, 0.184, + 0.300, 0.300, 0.300, + 0.600, 0.600, 0.600, + 1.000, 0.000, 0.000, + 1.000, 0.500, 0.000, + 0.749, 0.749, 0.000, + 0.000, 1.000, 0.000, + 0.000, 0.000, 1.000, + 0.667, 0.000, 1.000, + 0.333, 0.333, 0.000, + 0.333, 0.667, 0.000, + 0.333, 1.000, 0.000, + 0.667, 0.333, 0.000, + 0.667, 0.667, 0.000, + 0.667, 1.000, 0.000, + 1.000, 0.333, 0.000, + 1.000, 0.667, 0.000, + 1.000, 1.000, 0.000, + 0.000, 0.333, 0.500, + 0.000, 0.667, 0.500, + 0.000, 1.000, 0.500, + 0.333, 0.000, 0.500, + 0.333, 0.333, 0.500, + 0.333, 0.667, 0.500, + 0.333, 1.000, 0.500, + 0.667, 0.000, 0.500, + 0.667, 0.333, 0.500, + 0.667, 0.667, 0.500, + 0.667, 1.000, 0.500, + 1.000, 0.000, 0.500, + 1.000, 0.333, 0.500, + 1.000, 0.667, 0.500, + 1.000, 1.000, 0.500, + 0.000, 0.333, 1.000, + 0.000, 0.667, 1.000, + 0.000, 1.000, 1.000, + 0.333, 0.000, 1.000, + 0.333, 0.333, 1.000, + 0.333, 0.667, 1.000, + 0.333, 1.000, 1.000, + 0.667, 0.000, 1.000, + 0.667, 0.333, 1.000, + 0.667, 0.667, 1.000, + 0.667, 1.000, 1.000, + 1.000, 0.000, 1.000, + 1.000, 0.333, 1.000, + 1.000, 0.667, 1.000, + 0.333, 0.000, 0.000, + 0.500, 0.000, 0.000, + 0.667, 0.000, 0.000, + 0.833, 0.000, 0.000, + 1.000, 0.000, 0.000, + 0.000, 0.167, 0.000, + 0.000, 0.333, 0.000, + 0.000, 0.500, 0.000, + 0.000, 0.667, 0.000, + 0.000, 0.833, 0.000, + 0.000, 1.000, 0.000, + 0.000, 0.000, 0.167, + 0.000, 0.000, 0.333, + 0.000, 0.000, 0.500, + 0.000, 0.000, 0.667, + 0.000, 0.000, 0.833, + 0.000, 0.000, 1.000, + 0.000, 0.000, 0.000, + 0.143, 0.143, 0.143, + 0.286, 0.286, 0.286, + 0.429, 0.429, 0.429, + 0.571, 0.571, 0.571, + 0.714, 0.714, 0.714, + 0.857, 0.857, 0.857, + 0.000, 0.447, 0.741, + 0.314, 0.717, 0.741, + 0.50, 0.5, 0 + ] +).astype(np.float32).reshape(-1, 3) From cf8685e46cf2465b01811b22c5f31e0998d096a2 Mon Sep 17 00:00:00 2001 From: "I-CHEN,LU" <39236354+lujulia@users.noreply.github.com> Date: Sat, 9 Sep 2023 09:27:52 +0800 Subject: [PATCH 56/59] Update README.md --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9f64852ca..7d78808a2 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,9 @@
+
+ + ## Introduction YOLOX is an anchor-free version of YOLO, with a simpler design but better performance! It aims to bridge the gap between research and industrial communities. For more details, please refer to our [report on Arxiv](https://arxiv.org/abs/2107.08430). @@ -246,4 +249,4 @@ It is hoped that every AI practitioner in the world will stick to the concept of
没有孙剑博士的指导,YOLOX也不会问世并开源给社区使用。 孙剑博士的离去是CV领域的一大损失,我们在此特别添加了这个部分来表达对我们的“船长”孙老师的纪念和哀思。 -希望世界上的每个AI从业者秉持着“持续创新拓展认知边界,非凡科技成就产品价值”的观念,一路向前。 \ No newline at end of file +希望世界上的每个AI从业者秉持着“持续创新拓展认知边界,非凡科技成就产品价值”的观念,一路向前。 From ff2a91c73bd9d16df300c6dac5b3f1441ebbbd7a Mon Sep 17 00:00:00 2001 From: "I-CHEN,LU" <39236354+lujulia@users.noreply.github.com> Date: Sat, 9 Sep 2023 09:28:10 +0800 Subject: [PATCH 57/59] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 7d78808a2..9cd3e3272 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,4 @@ +
From 8fb0fbe3dc75aa9b90ddcb664232c48a659947f5 Mon Sep 17 00:00:00 2001 From: "I-CHEN,LU" <39236354+lujulia@users.noreply.github.com> Date: Wed, 27 Sep 2023 22:36:00 +0800 Subject: [PATCH 58/59] Update demo_sliding_window.py --- tools/demo_sliding_window.py | 57 +++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 14 deletions(-) diff --git a/tools/demo_sliding_window.py b/tools/demo_sliding_window.py index 29846e4aa..7a0e20f00 100644 --- a/tools/demo_sliding_window.py +++ b/tools/demo_sliding_window.py @@ -11,7 +11,7 @@ import torch -from yolox.data.data_augment import ValTransform, sliding_window +from yolox.data.data_augment import preproc, sliding_window from yolox.data.datasets import COCO_CLASSES, VOC_CLASSES from yolox.exp import get_exp from yolox.utils import fuse_model, get_model_info, postprocess, vis @@ -62,6 +62,7 @@ def make_parser(): action="store_true", help="Adopting mix precision evaluating.", ) + """ parser.add_argument( "--legacy", dest="legacy", @@ -69,6 +70,7 @@ def make_parser(): action="store_true", help="To be compatible with older versions", ) + """ parser.add_argument( "--fuse", dest="fuse", @@ -106,8 +108,8 @@ def __init__( trt_file=None, decoder=None, device="cpu", - fp16=False, - legacy=False, + #fp16=False, + #legacy=False, ): self.model = model self.cls_names = cls_names @@ -117,8 +119,8 @@ def __init__( self.nmsthre = exp.nmsthre self.test_size = exp.test_size self.device = device - self.fp16 = fp16 - self.preproc = ValTransform(legacy=legacy) + #self.fp16 = fp16 + #self.preproc = ValTransform(legacy=legacy) if trt_file is not None: from torch2trt import TRTModule @@ -128,6 +130,9 @@ def __init__( x = torch.ones(1, 3, exp.test_size[0], exp.test_size[1]).cuda() self.model(x) self.model = model_trt + self.rgb_means = (0.485, 0.456, 0.406) + self.std = (0.229, 0.224, 0.225) + def inference(self, img): img_info = {"id": 0} @@ -141,9 +146,33 @@ def inference(self, img): img_info["height"] = height img_info["width"] = width img_info["raw_img"] = img - + + #img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT,value=(0,0,0)) #ratio = min(self.test_size[0] / img.shape[0], self.test_size[1] / img.shape[1]) #img_info["ratio"] = ratio + """ + if (img.shape[0]>exp.test_size[0]): + h_r = (img.shape[0]//exp.test_size[0]+1)*exp.test_size[0]-img.shape[0] + elif(img.shape[0]exp.test_size[1]): + w_r = (img.shape[1]//exp.test_size[1]+1)*exp.test_size[1]-img.shape[1] + elif(img.shape[1] Date: Wed, 27 Sep 2023 22:36:32 +0800 Subject: [PATCH 59/59] Update demo_sliding_window.py
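
The last two patches rework tools/demo_sliding_window.py around the raw `preproc` transform and a `sliding_window` helper imported from `yolox.data.data_augment`; the helper itself is not part of this diff. As a rough illustration only — the names below (`tile_image`, `detect_tiled`, `detect_fn`, the window and stride sizes) are hypothetical, not this fork's actual API — sliding-window inference generally means cropping fixed-size windows, running the detector on each crop, and shifting the resulting boxes back by the window offset:

```python
import numpy as np


def tile_image(img, win, stride):
    """Yield (x_off, y_off, crop) windows that cover the image."""
    h, w = img.shape[:2]
    for y in range(0, max(h - win[0], 0) + 1, stride[0]):
        for x in range(0, max(w - win[1], 0) + 1, stride[1]):
            yield x, y, img[y:y + win[0], x:x + win[1]]


def detect_tiled(img, detect_fn, win=(416, 416), stride=(208, 208)):
    """Run detect_fn on every window and shift its boxes back to image coordinates."""
    merged = []
    for x_off, y_off, crop in tile_image(img, win, stride):
        dets = detect_fn(crop)              # expected shape (N, 6): x1, y1, x2, y2, score, cls
        if dets is None or len(dets) == 0:
            continue
        dets = np.asarray(dets, dtype=np.float32).copy()
        dets[:, [0, 2]] += x_off            # shift x1 and x2 by the window's x offset
        dets[:, [1, 3]] += y_off            # shift y1 and y2 by the window's y offset
        merged.append(dets)
    return np.concatenate(merged, axis=0) if merged else np.zeros((0, 6), dtype=np.float32)
```

The commented-out `h_r`/`w_r` arithmetic and `cv2.copyMakeBorder` call in patch 58 follow the same idea: they appear to pad the frame so its height and width become multiples of `test_size`, keeping every window fully inside the padded image. Overlapping windows produce duplicate boxes, so a final NMS pass (the demo already imports `postprocess`) is typically applied to the merged array.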
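
Earlier in the series, the `visual()` overlay accumulates a per-class box count plus a running sum of `score * 100`, then divides by the count before drawing. The number printed next to "AP" is therefore the mean confidence of the drawn boxes, not a COCO-style average precision. A minimal, self-contained sketch of that bookkeeping (the `summarize_detections` helper and its signature are illustrative, not code from this repository):

```python
from collections import defaultdict

import numpy as np


def summarize_detections(scores, cls_ids, class_names, conf_thr=0.35):
    """Return {class_name: (count, mean_score_in_percent)} for boxes above conf_thr."""
    counts = defaultdict(int)
    totals = defaultdict(float)
    for score, cls_id in zip(np.asarray(scores), np.asarray(cls_ids)):
        if score < conf_thr:
            continue
        name = class_names[int(cls_id)]
        counts[name] += 1
        totals[name] += float(score) * 100.0
    # Classes with no boxes keep a count of 0 and a mean of 0.0, matching the overlay.
    return {
        name: (counts[name], totals[name] / counts[name] if counts[name] else 0.0)
        for name in class_names
    }
```

With a postprocessed `output` tensor this would be called with `scores = output[:, 4] * output[:, 5]` and `cls_ids = output[:, 6]`, which is how `visual()` derives them before drawing the per-class rows down the left edge of the frame.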