Megvii-BaseDetection · ajayvohra2005 · Oct 7, 2024 · Oct 7, 2024 · Oct 7, 2024 · Oct 7, 2024
diff --git a/.gitignore b/.gitignore
@@ -226,3 +226,7 @@ events.out.tfevents*
 .Trashes
 ehthumbs.db
 Thumbs.db
+
+cache
+*.out
+*.txt
diff --git a/Dockerfile.cuda b/Dockerfile.cuda
@@ -0,0 +1,29 @@
+FROM nvcr.io/nvidia/pytorch:24.09-py3
+
+# Build-time only: keeps any apt/debconf step non-interactive without baking
+# these settings into the runtime environment of the container.
+ARG DEBIAN_FRONTEND=noninteractive
+ARG DEBCONF_NONINTERACTIVE_SEEN=true
+
+# Keeps Python from generating .pyc files in the container
+ENV PYTHONDONTWRITEBYTECODE=1
+
+# Turns off buffering for easier container logging
+ENV PYTHONUNBUFFERED=1
+
+# Upgrade pip before copying the sources so this layer stays cached when only
+# the repository contents change.
+RUN pip3 install --no-cache-dir --upgrade pip
+
+# Install YOLOX in editable mode from the image-local copy, then apply the
+# OpenCV pin in a second step so it is resolved after the YOLOX requirements.
+COPY . /yolox-x
+RUN pip3 install --no-cache-dir -v -e /yolox-x \
+    && pip3 install --no-cache-dir opencv-python==4.8.0.74
+
+# Nothing is copied into /app at build time; docker-cuda.sh (which presumably
+# runs this image) bind-mounts the repository checkout there.
+WORKDIR /app
+
+# Default command, relative to /app; override it at `docker run` time (e.g. for debugging).
+CMD ["python", "tools/train.py"]
diff --git a/Dockerfile.neuron b/Dockerfile.neuron
@@ -0,0 +1,34 @@
+FROM public.ecr.aws/neuron/pytorch-training-neuronx:2.5.1-neuronx-py310-sdk2.22.0-ubuntu22.04
+
+# Build-time only: keeps the apt/debconf step below non-interactive without
+# baking these settings into the runtime environment of the container.
+ARG DEBIAN_FRONTEND=noninteractive
+ARG DEBCONF_NONINTERACTIVE_SEEN=true
+
+# Keeps Python from generating .pyc files in the container
+ENV PYTHONDONTWRITEBYTECODE=1
+
+# Turns off buffering for easier container logging
+ENV PYTHONUNBUFFERED=1
+
+# PJRT device selection (presumably read by the PyTorch/XLA runtime - confirm against its docs)
+ENV PJRT_DEVICE=NEURON
+
+# Update, install and clean up in one layer so the apt package index is never
+# stored in the image.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends python3-opencv \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install YOLOX in editable mode from the image-local copy, then apply the
+# protobuf pin in a second step so it is resolved after the YOLOX requirements.
+COPY . /yolox-x
+RUN pip3 install --no-cache-dir -v -e /yolox-x \
+    && pip3 install --no-cache-dir protobuf==3.20.3
+
+# Nothing is copied into /app at build time, so the default command below
+# presumably relies on the repository being mounted there at run time.
+WORKDIR /app
+
+# Default command, relative to /app; override it at `docker run` time (e.g. for debugging).
+CMD ["python", "tools/train.py"]
diff --git a/Dockerfile.xla b/Dockerfile.xla
@@ -0,0 +1,32 @@
+FROM us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.5.0_3.10_cuda_12.1
+
+# Build-time only: keeps the apt/debconf step below non-interactive without
+# baking these settings into the runtime environment of the container.
+ARG DEBIAN_FRONTEND=noninteractive
+ARG DEBCONF_NONINTERACTIVE_SEEN=true
+
+# Keeps Python from generating .pyc files in the container
+ENV PYTHONDONTWRITEBYTECODE=1
+
+# Turns off buffering for easier container logging
+ENV PYTHONUNBUFFERED=1
+
+# PJRT device selection (presumably read by the PyTorch/XLA runtime - confirm against its docs)
+ENV PJRT_DEVICE=CUDA
+
+# Update, install and clean up in one layer so the apt package index is never
+# stored in the image.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends python3-opencv \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install YOLOX in editable mode from the image-local copy.
+COPY . /yolox-x
+RUN pip3 install --no-cache-dir -v -e /yolox-x
+
+# Nothing is copied into /app at build time, so the default command below
+# presumably relies on the repository being mounted there at run time.
+WORKDIR /app
+
+# Default command, relative to /app; override it at `docker run` time (e.g. for debugging).
+CMD ["python", "tools/train.py"]
diff --git a/README.md b/README.md
@@ -66,11 +66,13 @@ This repo is an implementation of PyTorch version YOLOX, there is also a [MegEng
 <details>
 <summary>Installation</summary>
 
+Create and activate a `conda` environment or `virtualenv` with Python 3.10, then install `torch` 2.4.0 and a matching `torchvision` in it.
+
 Step1. Install YOLOX from source.
 ```shell
-git clone git@github.com:Megvii-BaseDetection/YOLOX.git
-cd YOLOX
-pip3 install -v -e .  # or  python3 setup.py develop
+git clone https://github.com/ajayvohra2005/YOLOX-x.git
+cd YOLOX-x
+pip3 install -v -e . 
 ```
 
 </details>
@@ -83,79 +85,34 @@ Step1. Download a pretrained model from the benchmark table.
 Step2. Use either -n or -f to specify your detector's config. For example:
 
 ```shell
-python tools/demo.py image -n yolox-s -c /path/to/your/yolox_s.pth --path assets/dog.jpg --conf 0.25 --nms 0.45 --tsize 640 --save_result --device [cpu/gpu]
+python tools/demo.py image -n yolox-s -c /path/to/your/yolox_s.pth --path assets/dog.jpg --conf 0.25 --nms 0.45 --tsize 640 --save_result
 ```
 or
 ```shell
-python tools/demo.py image -f exps/default/yolox_s.py -c /path/to/your/yolox_s.pth --path assets/dog.jpg --conf 0.25 --nms 0.45 --tsize 640 --save_result --device [cpu/gpu]
+python tools/demo.py image -f exps/default/yolox_s.py -c /path/to/your/yolox_s.pth --path assets/dog.jpg --conf 0.25 --nms 0.45 --tsize 640 --save_result
 ```
 Demo for video:
 ```shell
-python tools/demo.py video -n yolox-s -c /path/to/your/yolox_s.pth --path /path/to/your/video --conf 0.25 --nms 0.45 --tsize 640 --save_result --device [cpu/gpu]
+python tools/demo.py video -n yolox-s -c /path/to/your/yolox_s.pth --path /path/to/your/video --conf 0.25 --nms 0.45 --tsize 640 --save_result
 ```
 
 
 </details>
 
 <details>
-<summary>Reproduce our results on COCO</summary>
-
-Step1. Prepare COCO dataset
-```shell
-cd <YOLOX_HOME>
-ln -s /path/to/your/COCO ./datasets/COCO
-```
-
-Step2. Reproduce our results on COCO by specifying -n:
-
-```shell
-python -m yolox.tools.train -n yolox-s -d 8 -b 64 --fp16 -o [--cache]
-                               yolox-m
-                               yolox-l
-                               yolox-x
-```
-* -d: number of gpu devices
-* -b: total batch size, the recommended number for -b is num-gpu * 8
-* --fp16: mixed precision training
-* --cache: caching imgs into RAM to accelarate training, which need large system RAM.
+<summary>Train on COCO</summary>
 
+    cd <YOLOX_HOME>
 
+Update the `run-cuda.sh` script to set `YOLOX_DATADIR` to your datasets directory, which must contain a `COCO` folder with the COCO2017 dataset. Update the model name (default `yolox-s`) as needed.
 
-When using -f, the above commands are equivalent to:
-```shell
-python -m yolox.tools.train -f exps/default/yolox_s.py -d 8 -b 64 --fp16 -o [--cache]
-                               exps/default/yolox_m.py
-                               exps/default/yolox_l.py
-                               exps/default/yolox_x.py
-```
-
-**Multi Machine Training**
+    ./run-cuda.sh
 
-We also support multi-nodes training. Just add the following args:
-* --num\_machines: num of your total training nodes
-* --machine\_rank: specify the rank of each node
-
-Suppose you want to train YOLOX on 2 machines, and your master machines's IP is 123.123.123.123, use port 12312 and TCP.
-
-On master machine, run
-```shell
-python tools/train.py -n yolox-s -b 128 --dist-url tcp://123.123.123.123:12312 --num_machines 2 --machine_rank 0
-```
-On the second machine, run
-```shell
-python tools/train.py -n yolox-s -b 128 --dist-url tcp://123.123.123.123:12312 --num_machines 2 --machine_rank 1
-```
 
 **Logging to Weights & Biases**
 
 To log metrics, predictions and model checkpoints to [W&B](https://docs.wandb.ai/guides/integrations/other/yolox) use the command line argument `--logger wandb` and use the prefix "wandb-" to specify arguments for initializing the wandb run.
 
-```shell
-python tools/train.py -n yolox-s -d 8 -b 64 --fp16 -o [--cache] --logger wandb wandb-project <project name>
-                         yolox-m
-                         yolox-l
-                         yolox-x
-```
 
 An example wandb dashboard is available [here](https://wandb.ai/manan-goel/yolox-nano/runs/3pzfeom0)
 
@@ -175,7 +132,7 @@ python -m yolox.tools.train --help
 We support batch testing for fast evaluation:
 
 ```shell
-python -m yolox.tools.eval -n  yolox-s -c yolox_s.pth -b 64 -d 8 --conf 0.001 [--fp16] [--fuse]
+torchrun --standalone --nproc_per_node=8 -m yolox.tools.eval -n yolox-s -c yolox_s.pth -b 64 --conf 0.001 [--fp16] [--fuse]
                                yolox-m
                                yolox-l
                                yolox-x
@@ -186,7 +143,7 @@ python -m yolox.tools.eval -n  yolox-s -c yolox_s.pth -b 64 -d 8 --conf 0.001 [-
 
 To reproduce speed test, we use the following command:
 ```shell
-python -m yolox.tools.eval -n  yolox-s -c yolox_s.pth -b 1 -d 1 --conf 0.001 --fp16 --fuse
+python -m yolox.tools.eval -n  yolox-s -c yolox_s.pth -b 1  --conf 0.001 --fp16 --fuse
                                yolox-m
                                yolox-l
                                yolox-x

diff --git a/demo/MegEngine/python/models/darknet.py b/demo/MegEngine/python/models/darknet.py
@@ -3,9 +3,11 @@
 # Copyright (c) Megvii Inc. All rights reserved.
 
 import megengine.module as M
+from yolox.utils.device_utils import get_xla_model
 
 from .network_blocks import BaseConv, CSPLayer, DWConv, Focus, ResLayer, SPPBottleneck
 
+xm = get_xla_model()
 
 class Darknet(M.Module):
     # number of blocks from dark2 to dark5.
@@ -70,6 +72,10 @@ def make_spp_block(self, filters_list, in_filters):
         return m
 
     def forward(self, x):
+
+        if xm:
+            xm.mark_step()
+
         outputs = {}
         x = self.stem(x)
         outputs["stem"] = x
@@ -81,6 +87,10 @@ def forward(self, x):
         outputs["dark4"] = x
         x = self.dark5(x)
         outputs["dark5"] = x
+
+        if xm:
+            xm.mark_step()
+
         return {k: v for k, v in outputs.items() if k in self.out_features}
 
 
@@ -140,6 +150,10 @@ def __init__(
         )
 
     def forward(self, x):
+
+        if xm:
+            xm.mark_step()
+
         outputs = {}
         x = self.stem(x)
         outputs["stem"] = x
@@ -151,4 +165,8 @@ def forward(self, x):
         outputs["dark4"] = x
         x = self.dark5(x)
         outputs["dark5"] = x
+
+        if xm:
+            xm.mark_step()
+
         return {k: v for k, v in outputs.items() if k in self.out_features}
diff --git a/demo/MegEngine/python/models/yolo_fpn.py b/demo/MegEngine/python/models/yolo_fpn.py
@@ -4,11 +4,14 @@
 
 import megengine.functional as F
 import megengine.module as M
+from yolox.utils.device_utils import get_xla_model
 
 from .darknet import Darknet
 from .network_blocks import BaseConv, UpSample
 
 
+xm = get_xla_model()
+
 class YOLOFPN(M.Module):
     """
     YOLOFPN module. Darknet 53 is the default backbone of this model.
@@ -59,6 +62,9 @@ def forward(self, inputs):
             Tuple[Tensor]: FPN output features..
         """
         #  backbone
+        if xm:
+            xm.mark_step()
+
         out_features = self.backbone(inputs)
         x2, x1, x0 = [out_features[f] for f in self.in_features]
 
@@ -75,4 +81,8 @@ def forward(self, inputs):
         out_dark3 = self.out2(x2_in)
 
         outputs = (out_dark3, out_dark4, x0)
+
+        if xm:
+            xm.mark_step()
+
         return outputs
diff --git a/demo/MegEngine/python/models/yolo_head.py b/demo/MegEngine/python/models/yolo_head.py
@@ -4,6 +4,7 @@
 
 import megengine.functional as F
 import megengine.module as M
+from yolox.utils.device_utils import parse_dtype
 
 from .network_blocks import BaseConv, DWConv
 
@@ -154,14 +155,16 @@ def forward(self, xin, labels=None, imgs=None):
             return outputs
 
     def get_output_and_grid(self, output, k, stride, dtype):
-        grid = self.grids[k]
 
+        device, dtype = parse_dtype(dtype)
+        grid = self.grids[k]
+
         batch_size = output.shape[0]
         n_ch = 5 + self.num_classes
         hsize, wsize = output.shape[-2:]
         if grid.shape[2:4] != output.shape[2:4]:
             yv, xv = meshgrid([F.arange(hsize), F.arange(wsize)])
-            grid = F.stack((xv, yv), 2).reshape(1, 1, hsize, wsize, 2).type(dtype)
+            grid = F.stack((xv, yv), 2).reshape(1, 1, hsize, wsize, 2).to(device=device, dtype=dtype)
             self.grids[k] = grid
 
         output = output.view(batch_size, self.n_anchors, n_ch, hsize, wsize)

diff --git a/demo/MegEngine/python/models/yolo_pafpn.py b/demo/MegEngine/python/models/yolo_pafpn.py
@@ -4,10 +4,12 @@
 
 import megengine.module as M
 import megengine.functional as F
+from yolox.utils.device_utils import get_xla_model
 
 from .darknet import CSPDarknet
 from .network_blocks import BaseConv, CSPLayer, DWConv, UpSample
 
+xm = get_xla_model()
 
 class YOLOPAFPN(M.Module):
     """
@@ -85,6 +87,10 @@ def forward(self, input):
         """
 
         #  backbone
+
+        if xm:
+            xm.mark_step()
+
         out_features = self.backbone(input)
         features = [out_features[f] for f in self.in_features]
         [x2, x1, x0] = features
@@ -108,4 +114,8 @@ def forward(self, input):
         pan_out0 = self.C3_n4(p_out0)  # 1024->1024/32
 
         outputs = (pan_out2, pan_out1, pan_out0)
+
+        if xm:
+            xm.mark_step()
+
         return outputs
diff --git a/demo/nebullvm/nebullvm_optimization.py b/demo/nebullvm/nebullvm_optimization.py
@@ -1,18 +1,19 @@
+from yolox.utils.device_utils import get_current_device
 import torch
 import time
 from nebullvm.api.functions import optimize_model # Install DL compilers
 from yolox.exp import get_exp
 
+device = get_current_device()
+
 # Get YOLO model
 exp = get_exp(None, 'yolox-s') # select model name
 model = exp.get_model()
-model.cuda()
+model.to(device=device)
 model.eval()
 
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
 # Create dummy data for the optimizer
-input_data =  [((torch.randn(1, 3, 640, 640).to(device), ), 0) for i in range(100)] 
+input_data =  [((torch.randn(1, 3, 640, 640).to(device=device), ), 0) for i in range(100)] 
 
 # ---------- Optimization ---------- 
 optimized_model = optimize_model(model, input_data=input_data, optimization_time="constrained")  # Optimization without performance loss
@@ -22,7 +23,7 @@
 # Select image to test the latency of the optimized model
 
 # Create dummy image
-img = torch.randn(1, 3, 640, 640).to(device)
+img = torch.randn(1, 3, 640, 640).to(device=device)
 
 # Check perfomance
 warmup_iters = 30

diff --git a/docker-cuda.sh b/docker-cuda.sh
@@ -0,0 +1 @@
+docker run -t -d -v /home/ubuntu/efs/datasets:/datasets -v /home/ubuntu/efs/git/YOLOX-x:/app --shm-size=16g --net=host --gpus all docker.io/library/yolox-x-cuda:latest  sleep infinity
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		docker run -t -d -v /home/ubuntu/efs/datasets:/datasets -v /home/ubuntu/efs/git/YOLOX-x:/app --shm-size=16g --net=host --gpus all docker.io/library/yolox-x-cuda:latest sleep infinity