diff --git a/.gitignore b/.gitignore
index 9842565a1..f7d827755 100644
--- a/.gitignore
+++ b/.gitignore
@@ -226,3 +226,7 @@ events.out.tfevents*
.Trashes
ehthumbs.db
Thumbs.db
+
+cache
+*.out
+*.txt
diff --git a/Dockerfile.cuda b/Dockerfile.cuda
new file mode 100644
index 000000000..0bdfc2d97
--- /dev/null
+++ b/Dockerfile.cuda
@@ -0,0 +1,19 @@
+FROM nvcr.io/nvidia/pytorch:24.09-py3
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV DEBCONF_NONINTERACTIVE_SEEN=true
+
+# Keeps Python from generating .pyc files in the container
+ENV PYTHONDONTWRITEBYTECODE=1
+
+# Turns off buffering for easier container logging
+ENV PYTHONUNBUFFERED=1
+
+COPY . /yolox-x
+RUN pip3 install --upgrade pip
+RUN pip3 install -v -e /yolox-x
+RUN pip3 install opencv-python==4.8.0.74
+WORKDIR /app
+
+# During debugging, this entry point will be overridden. For more information, please refer to https://aka.ms/vscode-docker-python-debug
+CMD ["python", "tools/train.py"]
\ No newline at end of file
diff --git a/Dockerfile.neuron b/Dockerfile.neuron
new file mode 100644
index 000000000..eeede27fd
--- /dev/null
+++ b/Dockerfile.neuron
@@ -0,0 +1,21 @@
+FROM public.ecr.aws/neuron/pytorch-training-neuronx:2.5.1-neuronx-py310-sdk2.22.0-ubuntu22.04
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV DEBCONF_NONINTERACTIVE_SEEN=true
+
+# Keeps Python from generating .pyc files in the container
+ENV PYTHONDONTWRITEBYTECODE=1
+
+# Turns off buffering for easier container logging
+ENV PYTHONUNBUFFERED=1
+
+ENV PJRT_DEVICE=NEURON
+
+RUN apt-get update && apt-get -y install python3-opencv
+COPY . /yolox-x
+RUN pip3 install -v -e /yolox-x
+RUN pip3 install protobuf==3.20.3
+WORKDIR /app
+
+# During debugging, this entry point will be overridden. For more information, please refer to https://aka.ms/vscode-docker-python-debug
+CMD ["python", "tools/train.py"]
\ No newline at end of file
diff --git a/Dockerfile.xla b/Dockerfile.xla
new file mode 100644
index 000000000..4599d53da
--- /dev/null
+++ b/Dockerfile.xla
@@ -0,0 +1,20 @@
+FROM us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.5.0_3.10_cuda_12.1
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV DEBCONF_NONINTERACTIVE_SEEN=true
+
+# Keeps Python from generating .pyc files in the container
+ENV PYTHONDONTWRITEBYTECODE=1
+
+# Turns off buffering for easier container logging
+ENV PYTHONUNBUFFERED=1
+
+ENV PJRT_DEVICE=CUDA
+
+RUN apt-get update && apt-get -y install python3-opencv
+COPY . /yolox-x
+RUN pip3 install -v -e /yolox-x
+WORKDIR /app
+
+# During debugging, this entry point will be overridden. For more information, please refer to https://aka.ms/vscode-docker-python-debug
+CMD ["python", "tools/train.py"]
\ No newline at end of file
diff --git a/README.md b/README.md
index f83aa3c00..a4c1d4c87 100644
--- a/README.md
+++ b/README.md
@@ -66,11 +66,13 @@ This repo is an implementation of PyTorch version YOLOX, there is also a [MegEng
Installation
+Create and activate a `conda` environment or a virtualenv with Python 3.10, then install `torch` version 2.4.0 and a compatible `torchvision` in it.
+
Step1. Install YOLOX from source.
```shell
-git clone git@github.com:Megvii-BaseDetection/YOLOX.git
-cd YOLOX
-pip3 install -v -e . # or python3 setup.py develop
+git clone https://github.com/ajayvohra2005/YOLOX-x.git
+cd YOLOX-x
+pip3 install -v -e .
```
@@ -83,79 +85,34 @@ Step1. Download a pretrained model from the benchmark table.
Step2. Use either -n or -f to specify your detector's config. For example:
```shell
-python tools/demo.py image -n yolox-s -c /path/to/your/yolox_s.pth --path assets/dog.jpg --conf 0.25 --nms 0.45 --tsize 640 --save_result --device [cpu/gpu]
+python tools/demo.py image -n yolox-s -c /path/to/your/yolox_s.pth --path assets/dog.jpg --conf 0.25 --nms 0.45 --tsize 640 --save_result
```
or
```shell
-python tools/demo.py image -f exps/default/yolox_s.py -c /path/to/your/yolox_s.pth --path assets/dog.jpg --conf 0.25 --nms 0.45 --tsize 640 --save_result --device [cpu/gpu]
+python tools/demo.py image -f exps/default/yolox_s.py -c /path/to/your/yolox_s.pth --path assets/dog.jpg --conf 0.25 --nms 0.45 --tsize 640 --save_result
```
Demo for video:
```shell
-python tools/demo.py video -n yolox-s -c /path/to/your/yolox_s.pth --path /path/to/your/video --conf 0.25 --nms 0.45 --tsize 640 --save_result --device [cpu/gpu]
+python tools/demo.py video -n yolox-s -c /path/to/your/yolox_s.pth --path /path/to/your/video --conf 0.25 --nms 0.45 --tsize 640 --save_result
```
-Reproduce our results on COCO
-
-Step1. Prepare COCO dataset
-```shell
-cd
-ln -s /path/to/your/COCO ./datasets/COCO
-```
-
-Step2. Reproduce our results on COCO by specifying -n:
-
-```shell
-python -m yolox.tools.train -n yolox-s -d 8 -b 64 --fp16 -o [--cache]
- yolox-m
- yolox-l
- yolox-x
-```
-* -d: number of gpu devices
-* -b: total batch size, the recommended number for -b is num-gpu * 8
-* --fp16: mixed precision training
-* --cache: caching imgs into RAM to accelarate training, which need large system RAM.
+Train on COCO
+ cd YOLOX_HOME
+Edit the `run-cuda.sh` script and set `YOLOX_DATADIR` to your datasets directory, which must contain a `COCO` folder holding the COCO2017 dataset. Change the model name (default `yolox-s`) as needed.
-When using -f, the above commands are equivalent to:
-```shell
-python -m yolox.tools.train -f exps/default/yolox_s.py -d 8 -b 64 --fp16 -o [--cache]
- exps/default/yolox_m.py
- exps/default/yolox_l.py
- exps/default/yolox_x.py
-```
-
-**Multi Machine Training**
+ ./run-cuda.sh
-We also support multi-nodes training. Just add the following args:
-* --num\_machines: num of your total training nodes
-* --machine\_rank: specify the rank of each node
-
-Suppose you want to train YOLOX on 2 machines, and your master machines's IP is 123.123.123.123, use port 12312 and TCP.
-
-On master machine, run
-```shell
-python tools/train.py -n yolox-s -b 128 --dist-url tcp://123.123.123.123:12312 --num_machines 2 --machine_rank 0
-```
-On the second machine, run
-```shell
-python tools/train.py -n yolox-s -b 128 --dist-url tcp://123.123.123.123:12312 --num_machines 2 --machine_rank 1
-```
**Logging to Weights & Biases**
To log metrics, predictions and model checkpoints to [W&B](https://docs.wandb.ai/guides/integrations/other/yolox) use the command line argument `--logger wandb` and use the prefix "wandb-" to specify arguments for initializing the wandb run.
-```shell
-python tools/train.py -n yolox-s -d 8 -b 64 --fp16 -o [--cache] --logger wandb wandb-project
- yolox-m
- yolox-l
- yolox-x
-```
An example wandb dashboard is available [here](https://wandb.ai/manan-goel/yolox-nano/runs/3pzfeom0)
@@ -175,7 +132,7 @@ python -m yolox.tools.train --help
We support batch testing for fast evaluation:
```shell
-python -m yolox.tools.eval -n yolox-s -c yolox_s.pth -b 64 -d 8 --conf 0.001 [--fp16] [--fuse]
+torchrun --standalone --nproc_per_node=8 -m yolox.tools.eval -n yolox-s -c yolox_s.pth -b 64 --conf 0.001 [--fp16] [--fuse]
yolox-m
yolox-l
yolox-x
@@ -186,7 +143,7 @@ python -m yolox.tools.eval -n yolox-s -c yolox_s.pth -b 64 -d 8 --conf 0.001 [-
To reproduce speed test, we use the following command:
```shell
-python -m yolox.tools.eval -n yolox-s -c yolox_s.pth -b 1 -d 1 --conf 0.001 --fp16 --fuse
+python -m yolox.tools.eval -n yolox-s -c yolox_s.pth -b 1 --conf 0.001 --fp16 --fuse
yolox-m
yolox-l
yolox-x
diff --git a/demo/MegEngine/python/models/darknet.py b/demo/MegEngine/python/models/darknet.py
index 47469aa68..a896e8610 100644
--- a/demo/MegEngine/python/models/darknet.py
+++ b/demo/MegEngine/python/models/darknet.py
@@ -3,9 +3,11 @@
# Copyright (c) Megvii Inc. All rights reserved.
import megengine.module as M
+from yolox.utils.device_utils import get_xla_model
from .network_blocks import BaseConv, CSPLayer, DWConv, Focus, ResLayer, SPPBottleneck
+xm = get_xla_model()
class Darknet(M.Module):
# number of blocks from dark2 to dark5.
@@ -70,6 +72,10 @@ def make_spp_block(self, filters_list, in_filters):
return m
def forward(self, x):
+
+ if xm:
+ xm.mark_step()
+
outputs = {}
x = self.stem(x)
outputs["stem"] = x
@@ -81,6 +87,10 @@ def forward(self, x):
outputs["dark4"] = x
x = self.dark5(x)
outputs["dark5"] = x
+
+ if xm:
+ xm.mark_step()
+
return {k: v for k, v in outputs.items() if k in self.out_features}
@@ -140,6 +150,10 @@ def __init__(
)
def forward(self, x):
+
+ if xm:
+ xm.mark_step()
+
outputs = {}
x = self.stem(x)
outputs["stem"] = x
@@ -151,4 +165,8 @@ def forward(self, x):
outputs["dark4"] = x
x = self.dark5(x)
outputs["dark5"] = x
+
+ if xm:
+ xm.mark_step()
+
return {k: v for k, v in outputs.items() if k in self.out_features}
diff --git a/demo/MegEngine/python/models/yolo_fpn.py b/demo/MegEngine/python/models/yolo_fpn.py
index 675a7f6e6..af6c2727c 100644
--- a/demo/MegEngine/python/models/yolo_fpn.py
+++ b/demo/MegEngine/python/models/yolo_fpn.py
@@ -4,11 +4,14 @@
import megengine.functional as F
import megengine.module as M
+from yolox.utils.device_utils import get_xla_model
from .darknet import Darknet
from .network_blocks import BaseConv, UpSample
+xm = get_xla_model()
+
class YOLOFPN(M.Module):
"""
YOLOFPN module. Darknet 53 is the default backbone of this model.
@@ -59,6 +62,9 @@ def forward(self, inputs):
Tuple[Tensor]: FPN output features..
"""
# backbone
+ if xm:
+ xm.mark_step()
+
out_features = self.backbone(inputs)
x2, x1, x0 = [out_features[f] for f in self.in_features]
@@ -75,4 +81,8 @@ def forward(self, inputs):
out_dark3 = self.out2(x2_in)
outputs = (out_dark3, out_dark4, x0)
+
+ if xm:
+ xm.mark_step()
+
return outputs
diff --git a/demo/MegEngine/python/models/yolo_head.py b/demo/MegEngine/python/models/yolo_head.py
index 7bba674d5..9c7b37d43 100644
--- a/demo/MegEngine/python/models/yolo_head.py
+++ b/demo/MegEngine/python/models/yolo_head.py
@@ -4,6 +4,7 @@
import megengine.functional as F
import megengine.module as M
+from yolox.utils.device_utils import parse_dtype
from .network_blocks import BaseConv, DWConv
@@ -154,14 +155,16 @@ def forward(self, xin, labels=None, imgs=None):
return outputs
def get_output_and_grid(self, output, k, stride, dtype):
- grid = self.grids[k]
+ device, dtype = parse_dtype(dtype)
+ grid = self.grids[k]
+
batch_size = output.shape[0]
n_ch = 5 + self.num_classes
hsize, wsize = output.shape[-2:]
if grid.shape[2:4] != output.shape[2:4]:
yv, xv = meshgrid([F.arange(hsize), F.arange(wsize)])
- grid = F.stack((xv, yv), 2).reshape(1, 1, hsize, wsize, 2).type(dtype)
+ grid = F.stack((xv, yv), 2).reshape(1, 1, hsize, wsize, 2).to(device=device, dtype=dtype)
self.grids[k] = grid
output = output.view(batch_size, self.n_anchors, n_ch, hsize, wsize)
diff --git a/demo/MegEngine/python/models/yolo_pafpn.py b/demo/MegEngine/python/models/yolo_pafpn.py
index 86154bfa9..fb6e591d0 100644
--- a/demo/MegEngine/python/models/yolo_pafpn.py
+++ b/demo/MegEngine/python/models/yolo_pafpn.py
@@ -4,10 +4,12 @@
import megengine.module as M
import megengine.functional as F
+from yolox.utils.device_utils import get_xla_model
from .darknet import CSPDarknet
from .network_blocks import BaseConv, CSPLayer, DWConv, UpSample
+xm = get_xla_model()
class YOLOPAFPN(M.Module):
"""
@@ -85,6 +87,10 @@ def forward(self, input):
"""
# backbone
+
+ if xm:
+ xm.mark_step()
+
out_features = self.backbone(input)
features = [out_features[f] for f in self.in_features]
[x2, x1, x0] = features
@@ -108,4 +114,8 @@ def forward(self, input):
pan_out0 = self.C3_n4(p_out0) # 1024->1024/32
outputs = (pan_out2, pan_out1, pan_out0)
+
+ if xm:
+ xm.mark_step()
+
return outputs
diff --git a/demo/nebullvm/nebullvm_optimization.py b/demo/nebullvm/nebullvm_optimization.py
index b817baf62..b9365dad2 100644
--- a/demo/nebullvm/nebullvm_optimization.py
+++ b/demo/nebullvm/nebullvm_optimization.py
@@ -1,18 +1,19 @@
+from yolox.utils.device_utils import get_current_device
import torch
import time
from nebullvm.api.functions import optimize_model # Install DL compilers
from yolox.exp import get_exp
+device = get_current_device()
+
# Get YOLO model
exp = get_exp(None, 'yolox-s') # select model name
model = exp.get_model()
-model.cuda()
+model.to(device=device)
model.eval()
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
# Create dummy data for the optimizer
-input_data = [((torch.randn(1, 3, 640, 640).to(device), ), 0) for i in range(100)]
+input_data = [((torch.randn(1, 3, 640, 640).to(device=device), ), 0) for i in range(100)]
# ---------- Optimization ----------
optimized_model = optimize_model(model, input_data=input_data, optimization_time="constrained") # Optimization without performance loss
@@ -22,7 +23,7 @@
# Select image to test the latency of the optimized model
# Create dummy image
-img = torch.randn(1, 3, 640, 640).to(device)
+img = torch.randn(1, 3, 640, 640).to(device=device)
# Check perfomance
warmup_iters = 30
diff --git a/docker-cuda.sh b/docker-cuda.sh
new file mode 100755
index 000000000..8272e2e15
--- /dev/null
+++ b/docker-cuda.sh
@@ -0,0 +1 @@
+docker run -t -d -v /home/ubuntu/efs/datasets:/datasets -v /home/ubuntu/efs/git/YOLOX-x:/app --shm-size=16g --net=host --gpus all docker.io/library/yolox-x-cuda:latest sleep infinity
\ No newline at end of file
diff --git a/docker-neuron.sh b/docker-neuron.sh
new file mode 100755
index 000000000..52aa59761
--- /dev/null
+++ b/docker-neuron.sh
@@ -0,0 +1,24 @@
+docker run -t -d \
+ -v /home/ubuntu/efs/datasets:/datasets \
+ -v /home/ubuntu/efs/git/YOLOX-x:/app \
+ -v /tmp:/cache \
+ --shm-size=16g \
+ --net=host \
+ --shm-size=16g \
+ --device=/dev/neuron0 \
+ --device=/dev/neuron1 \
+ --device=/dev/neuron2 \
+ --device=/dev/neuron3 \
+ --device=/dev/neuron4 \
+ --device=/dev/neuron5 \
+ --device=/dev/neuron6 \
+ --device=/dev/neuron7 \
+ --device=/dev/neuron8 \
+ --device=/dev/neuron9 \
+ --device=/dev/neuron10 \
+ --device=/dev/neuron11 \
+ --device=/dev/neuron12 \
+ --device=/dev/neuron13 \
+ --device=/dev/neuron14 \
+ --device=/dev/neuron15 \
+ docker.io/library/yolox-x-neuron:latest sleep infinity
\ No newline at end of file
diff --git a/docker-xla-cuda.sh b/docker-xla-cuda.sh
new file mode 100755
index 000000000..56cb32a94
--- /dev/null
+++ b/docker-xla-cuda.sh
@@ -0,0 +1 @@
+docker run -t -d -v /home/ubuntu/efs/datasets:/datasets -v /home/ubuntu/efs/git/YOLOX-x:/app --shm-size=16g --net=host --gpus all docker.io/library/yolox-x-xla:latest sleep infinity
\ No newline at end of file
diff --git a/run-cuda.sh b/run-cuda.sh
new file mode 100755
index 000000000..6ebef065d
--- /dev/null
+++ b/run-cuda.sh
@@ -0,0 +1,5 @@
+export OMP_NUM_THREADS=16
+export LOGURU_LEVEL="INFO"
+export YOLOX_DATADIR=/datasets
+export YOLOX_OUPUT_DIR="./YOLOX_cuda_outputs"
+torchrun --standalone --nproc_per_node=8 tools/train.py -b 32 -n yolox-s
diff --git a/run-neuron.sh b/run-neuron.sh
new file mode 100755
index 000000000..66d3a4978
--- /dev/null
+++ b/run-neuron.sh
@@ -0,0 +1,12 @@
+export YOLOX_DATADIR=/datasets
+export YOLOX_OUPUT_DIR="./YOLOX_neuron_outputs"
+export OMP_NUM_THREADS=16
+export LOGURU_LEVEL="INFO"
+export NEURON_CC_FLAGS="--cache_dir=/cache --model-type=generic"
+export NEURON_RT_STOCHASTIC_ROUNDING_EN="1"
+export XLA_IR_SHAPE_CACHE_SIZE="20480"
+export XLA_IR_DEBUG=0
+export XLA_HLO_DEBUG=0
+export PT_XLA_DEBUG=0
+export PT_XLA_DEBUG_FILE=./pt_xla_debug.txt
+torchrun --standalone --nproc_per_node=32 tools/train.py -b 128 -n yolox-s
diff --git a/run-xla-cuda.sh b/run-xla-cuda.sh
new file mode 100755
index 000000000..05a857fbd
--- /dev/null
+++ b/run-xla-cuda.sh
@@ -0,0 +1,9 @@
+export YOLOX_DATADIR=/datasets
+export YOLOX_OUPUT_DIR="./YOLOX_xla_cuda_outputs"
+export OMP_NUM_THREADS=16
+export LOGURU_LEVEL=INFO
+#export XLA_IR_DEBUG=1
+#export XLA_HLO_DEBUG=1
+#export PT_XLA_DEBUG=1
+#export PT_XLA_DEBUG_FILE=./pt_xla_debug.txt
+torchrun --standalone --nproc_per_node=8 tools/train.py -b 32 -n yolox-s
diff --git a/setup.py b/setup.py
index 5fec79764..b523dc0d8 100644
--- a/setup.py
+++ b/setup.py
@@ -46,7 +46,7 @@ def get_long_description():
def get_ext_modules():
ext_module = []
- if sys.platform != "win32": # pre-compile ops on linux
+ if torch.cuda.is_available() and sys.platform != "win32": # pre-compile ops on linux
assert TORCH_AVAILABLE, "torch is required for pre-compiling ops, please install it first."
# if any other op is added, please also add it here
from yolox.layers import FastCOCOEvalOp
@@ -64,8 +64,8 @@ def get_cmd_class():
setuptools.setup(
name="yolox",
version=get_yolox_version(),
- author="megvii basedet team",
- url="https://github.com/Megvii-BaseDetection/YOLOX",
+ author="ajayvohra2005",
+ url="https://github.com/ajayvohra2005/YOLOX-x",
package_dir=get_package_dir(),
packages=setuptools.find_packages(exclude=("tests", "tools")) + list(get_package_dir().keys()),
python_requires=">=3.6",
@@ -82,7 +82,7 @@ def get_cmd_class():
],
project_urls={
"Documentation": "https://yolox.readthedocs.io",
- "Source": "https://github.com/Megvii-BaseDetection/YOLOX",
- "Tracker": "https://github.com/Megvii-BaseDetection/YOLOX/issues",
+ "Source": "https://github.com/ajayvohra2005/YOLOX-x",
+ "Tracker": "https://github.com/ajayvohra2005/YOLOX-x/issues",
},
)
diff --git a/tools/demo.py b/tools/demo.py
index b16598d5f..d1ea45e3f 100644
--- a/tools/demo.py
+++ b/tools/demo.py
@@ -9,6 +9,7 @@
import cv2
+from yolox.utils.device_utils import get_current_device
import torch
from yolox.data.data_augment import ValTransform
@@ -46,12 +47,6 @@ def make_parser():
help="please input your experiment description file",
)
parser.add_argument("-c", "--ckpt", default=None, type=str, help="ckpt for eval")
- parser.add_argument(
- "--device",
- default="cpu",
- type=str,
- help="device to run our model, can either be cpu or gpu",
- )
parser.add_argument("--conf", default=0.3, type=float, help="test conf")
parser.add_argument("--nms", default=0.3, type=float, help="test nms threshold")
parser.add_argument("--tsize", default=None, type=int, help="test img size")
@@ -105,7 +100,6 @@ def __init__(
cls_names=COCO_CLASSES,
trt_file=None,
decoder=None,
- device="cpu",
fp16=False,
legacy=False,
):
@@ -116,7 +110,6 @@ def __init__(
self.confthre = exp.test_conf
self.nmsthre = exp.nmsthre
self.test_size = exp.test_size
- self.device = device
self.fp16 = fp16
self.preproc = ValTransform(legacy=legacy)
if trt_file is not None:
@@ -125,7 +118,7 @@ def __init__(
model_trt = TRTModule()
model_trt.load_state_dict(torch.load(trt_file))
- x = torch.ones(1, 3, exp.test_size[0], exp.test_size[1]).cuda()
+ x = torch.ones(1, 3, exp.test_size[0], exp.test_size[1]).to(device=get_current_device())
self.model(x)
self.model = model_trt
@@ -148,10 +141,10 @@ def inference(self, img):
img, _ = self.preproc(img, None, self.test_size)
img = torch.from_numpy(img).unsqueeze(0)
img = img.float()
- if self.device == "gpu":
- img = img.cuda()
- if self.fp16:
- img = img.half() # to FP16
+
+ img = img.to(device=get_current_device())
+ if self.fp16:
+ img = img.half() # to FP16
with torch.no_grad():
t0 = time.time()
@@ -253,9 +246,6 @@ def main(exp, args):
vis_folder = os.path.join(file_name, "vis_res")
os.makedirs(vis_folder, exist_ok=True)
- if args.trt:
- args.device = "gpu"
-
logger.info("Args: {}".format(args))
if args.conf is not None:
@@ -268,10 +258,10 @@ def main(exp, args):
model = exp.get_model()
logger.info("Model Summary: {}".format(get_model_info(model, exp.test_size)))
- if args.device == "gpu":
- model.cuda()
- if args.fp16:
- model.half() # to FP16
+
+ model.to(device=get_current_device())
+ if args.fp16:
+ model.half() # to FP16
model.eval()
if not args.trt:
@@ -304,7 +294,7 @@ def main(exp, args):
predictor = Predictor(
model, exp, COCO_CLASSES, trt_file, decoder,
- args.device, args.fp16, args.legacy,
+ args.fp16, args.legacy,
)
current_time = time.localtime()
if args.demo == "image":
diff --git a/tools/eval.py b/tools/eval.py
index 83ad76be8..e11b3864f 100644
--- a/tools/eval.py
+++ b/tools/eval.py
@@ -8,8 +8,9 @@
import warnings
from loguru import logger
+from yolox.utils.device_utils import get_current_device, get_xla_model, set_manual_seed
import torch
-import torch.backends.cudnn as cudnn
+
from torch.nn.parallel import DistributedDataParallel as DDP
from yolox.core import launch
@@ -23,6 +24,7 @@
setup_logger
)
+xm = get_xla_model()
def make_parser():
parser = argparse.ArgumentParser("YOLOX Eval")
@@ -30,25 +32,7 @@ def make_parser():
parser.add_argument("-n", "--name", type=str, default=None, help="model name")
# distributed
- parser.add_argument(
- "--dist-backend", default="nccl", type=str, help="distributed backend"
- )
- parser.add_argument(
- "--dist-url",
- default=None,
- type=str,
- help="url used to set up distributed training",
- )
parser.add_argument("-b", "--batch-size", type=int, default=64, help="batch size")
- parser.add_argument(
- "-d", "--devices", default=None, type=int, help="device for training"
- )
- parser.add_argument(
- "--num_machines", default=1, type=int, help="num of node for training"
- )
- parser.add_argument(
- "--machine_rank", default=0, type=int, help="node rank for multi-node training"
- )
parser.add_argument(
"-f",
"--exp_file",
@@ -113,20 +97,23 @@ def make_parser():
@logger.catch
-def main(exp, args, num_gpu):
+def main(exp, args):
+ assert (not args.trt or torch.cuda.is_available()), "--trt requires CUDA"
+
if args.seed is not None:
- random.seed(args.seed)
- torch.manual_seed(args.seed)
- cudnn.deterministic = True
- warnings.warn(
- "You have chosen to seed testing. This will turn on the CUDNN deterministic setting, "
- )
+ set_manual_seed(args.seed)
+ if torch.cuda.is_available():
+ torch.backends.cudnn.deterministic = True
+ warnings.warn(
+ "You have chosen to seed testing. This will turn on the CUDNN deterministic setting, "
+ )
- is_distributed = num_gpu > 1
+ is_distributed = torch.distributed.is_initialized()
- # set environment variables for distributed training
- configure_nccl()
- cudnn.benchmark = True
+ # set environment variables for distributed training for CUDA
+ if torch.cuda.is_available():
+ configure_nccl()
+ torch.backends.cudnn.benchmark = True
rank = get_local_rank()
@@ -153,8 +140,7 @@ def main(exp, args, num_gpu):
evaluator.per_class_AP = True
evaluator.per_class_AR = True
- torch.cuda.set_device(rank)
- model.cuda(rank)
+ model.to(device=get_current_device())
model.eval()
if not args.speed and not args.trt:
@@ -163,13 +149,17 @@ def main(exp, args, num_gpu):
else:
ckpt_file = args.ckpt
logger.info("loading checkpoint from {}".format(ckpt_file))
- loc = "cuda:{}".format(rank)
+ loc = get_current_device()
ckpt = torch.load(ckpt_file, map_location=loc)
model.load_state_dict(ckpt["model"])
logger.info("loaded checkpoint done.")
if is_distributed:
- model = DDP(model, device_ids=[rank])
+ if xm:
+ xm.mark_step()
+ model = DDP(model, gradient_as_bucket_view=True)
+ else:
+ model = DDP(model)
if args.fuse:
logger.info("\tFusing model...")
@@ -205,16 +195,4 @@ def main(exp, args, num_gpu):
if not args.experiment_name:
args.experiment_name = exp.exp_name
- num_gpu = torch.cuda.device_count() if args.devices is None else args.devices
- assert num_gpu <= torch.cuda.device_count()
-
- dist_url = "auto" if args.dist_url is None else args.dist_url
- launch(
- main,
- num_gpu,
- args.num_machines,
- args.machine_rank,
- backend=args.dist_backend,
- dist_url=dist_url,
- args=(exp, args, num_gpu),
- )
+ launch(main,args=(exp, args))
diff --git a/tools/train.py b/tools/train.py
index aa98bba30..8dd39946f 100644
--- a/tools/train.py
+++ b/tools/train.py
@@ -3,16 +3,15 @@
# Copyright (c) Megvii, Inc. and its affiliates.
import argparse
-import random
import warnings
from loguru import logger
import torch
-import torch.backends.cudnn as cudnn
from yolox.core import launch
from yolox.exp import Exp, check_exp_value, get_exp
-from yolox.utils import configure_module, configure_nccl, configure_omp, get_num_devices
+from yolox.utils import configure_module, configure_nccl, configure_omp
+from yolox.utils.device_utils import set_manual_seed
def make_parser():
@@ -21,19 +20,7 @@ def make_parser():
parser.add_argument("-n", "--name", type=str, default=None, help="model name")
# distributed
- parser.add_argument(
- "--dist-backend", default="nccl", type=str, help="distributed backend"
- )
- parser.add_argument(
- "--dist-url",
- default=None,
- type=str,
- help="url used to set up distributed training",
- )
parser.add_argument("-b", "--batch-size", type=int, default=64, help="batch size")
- parser.add_argument(
- "-d", "--devices", default=None, type=int, help="device for training"
- )
parser.add_argument(
"-f",
"--exp_file",
@@ -52,12 +39,6 @@ def make_parser():
type=int,
help="resume training start epoch",
)
- parser.add_argument(
- "--num_machines", default=1, type=int, help="num of node for training"
- )
- parser.add_argument(
- "--machine_rank", default=0, type=int, help="node rank for multi-node training"
- )
parser.add_argument(
"--fp16",
dest="fp16",
@@ -99,20 +80,24 @@ def make_parser():
@logger.catch
def main(exp: Exp, args):
+
+ assert (not args.occupy or torch.cuda.is_available()), "--occupy requires CUDA"
+
if exp.seed is not None:
- random.seed(exp.seed)
- torch.manual_seed(exp.seed)
- cudnn.deterministic = True
- warnings.warn(
- "You have chosen to seed training. This will turn on the CUDNN deterministic setting, "
- "which can slow down your training considerably! You may see unexpected behavior "
- "when restarting from checkpoints."
- )
-
- # set environment variables for distributed training
- configure_nccl()
- configure_omp()
- cudnn.benchmark = True
+ set_manual_seed(exp.seed)
+ if torch.cuda.is_available():
+ torch.backends.cudnn.deterministic = True
+ warnings.warn(
+ "You have chosen to seed training. This will turn on the CUDNN deterministic setting, "
+ "which can slow down your training considerably! You may see unexpected behavior "
+ "when restarting from checkpoints."
+ )
+
+ # set environment variables for distributed training for CUDA
+ if torch.cuda.is_available():
+ configure_nccl()
+ configure_omp()
+ torch.backends.cudnn.benchmark = True
trainer = exp.get_trainer(args)
trainer.train()
@@ -128,19 +113,8 @@ def main(exp: Exp, args):
if not args.experiment_name:
args.experiment_name = exp.exp_name
- num_gpu = get_num_devices() if args.devices is None else args.devices
- assert num_gpu <= get_num_devices()
-
if args.cache is not None:
+ logger.info(f"Dataset cache: {args.cache}; loading dataset before launch")
exp.dataset = exp.get_dataset(cache=True, cache_type=args.cache)
- dist_url = "auto" if args.dist_url is None else args.dist_url
- launch(
- main,
- num_gpu,
- args.num_machines,
- args.machine_rank,
- backend=args.dist_backend,
- dist_url=dist_url,
- args=(exp, args),
- )
+ launch(main,args=(exp, args))
diff --git a/tools/trt.py b/tools/trt.py
index f2f6cee5c..b4b5f1cff 100644
--- a/tools/trt.py
+++ b/tools/trt.py
@@ -7,8 +7,10 @@
import shutil
from loguru import logger
-import tensorrt as trt
import torch
+import tensorrt as trt
+from yolox.utils.device_utils import get_current_device
+
from torch2trt import torch2trt
from yolox.exp import get_exp
@@ -56,9 +58,9 @@ def main():
model.load_state_dict(ckpt["model"])
logger.info("loaded checkpoint done.")
model.eval()
- model.cuda()
+ model.to(device=get_current_device())
model.head.decode_in_inference = False
- x = torch.ones(1, 3, exp.test_size[0], exp.test_size[1]).cuda()
+ x = torch.ones(1, 3, exp.test_size[0], exp.test_size[1]).to(device=get_current_device())
model_trt = torch2trt(
model,
[x],
diff --git a/tools/visualize_assign.py b/tools/visualize_assign.py
index e75a5586b..2b9bdcfc4 100644
--- a/tools/visualize_assign.py
+++ b/tools/visualize_assign.py
@@ -8,6 +8,7 @@
import warnings
from loguru import logger
+from yolox.utils.device_utils import get_current_device_type
import torch
import torch.backends.cudnn as cudnn
@@ -35,7 +36,7 @@ def train_one_iter(self):
inps, targets = self.exp.preprocess(inps, targets, self.input_size)
data_end_time = time.time()
- with torch.cuda.amp.autocast(enabled=self.amp_training):
+ with torch.autocast(get_current_device_type(), enabled=self.amp_training):
path_prefix = os.path.join(self.vis_dir, f"assign_vis_{self.batch_cnt}_")
self.model.visualize(inps, targets, path_prefix)
diff --git a/yolox/core/launch.py b/yolox/core/launch.py
index 9f8eec61e..461d3165e 100644
--- a/yolox/core/launch.py
+++ b/yolox/core/launch.py
@@ -5,143 +5,32 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) Megvii, Inc. and its affiliates.
-import sys
-from datetime import timedelta
-from loguru import logger
+import os
-import torch
-import torch.distributed as dist
-import torch.multiprocessing as mp
+from yolox.utils.dist import barrier, deinit_distributed, init_distributed
-import yolox.utils.dist as comm
__all__ = ["launch"]
-DEFAULT_TIMEOUT = timedelta(minutes=30)
-
-
-def _find_free_port():
- """
- Find an available port of current machine / node.
- """
- import socket
-
- sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
- # Binding to port 0 will cause the OS to find an available port for us
- sock.bind(("", 0))
- port = sock.getsockname()[1]
- sock.close()
- # NOTE: there is still a chance the port could be taken by other processes.
- return port
-
-
def launch(
main_func,
- num_gpus_per_machine,
- num_machines=1,
- machine_rank=0,
- backend="nccl",
- dist_url=None,
- args=(),
- timeout=DEFAULT_TIMEOUT,
+ args=()
):
"""
Args:
main_func: a function that will be called by `main_func(*args)`
- num_machines (int): the total number of machines
- machine_rank (int): the rank of this machine (one per machine)
- dist_url (str): url to connect to for distributed training, including protocol
- e.g. "tcp://127.0.0.1:8686".
- Can be set to auto to automatically select a free port on localhost
args (tuple): arguments passed to main_func
"""
- world_size = num_machines * num_gpus_per_machine
+ world_size = int(os.getenv("WORLD_SIZE", 1))
+ rank = int(os.getenv("RANK", 0))
if world_size > 1:
- # https://github.com/pytorch/pytorch/pull/14391
- # TODO prctl in spawned processes
-
- if dist_url == "auto":
- assert (
- num_machines == 1
- ), "dist_url=auto cannot work with distributed training."
- port = _find_free_port()
- dist_url = f"tcp://127.0.0.1:{port}"
-
- start_method = "spawn"
- cache = vars(args[1]).get("cache", False)
-
- # To use numpy memmap for caching image into RAM, we have to use fork method
- if cache:
- assert sys.platform != "win32", (
- "As Windows platform doesn't support fork method, "
- "do not add --cache in your training command."
- )
- start_method = "fork"
+ init_distributed(world_size=world_size, rank=rank)
- mp.start_processes(
- _distributed_worker,
- nprocs=num_gpus_per_machine,
- args=(
- main_func,
- world_size,
- num_gpus_per_machine,
- machine_rank,
- backend,
- dist_url,
- args,
- ),
- daemon=False,
- start_method=start_method,
- )
- else:
+ barrier()
main_func(*args)
-
-
-def _distributed_worker(
- local_rank,
- main_func,
- world_size,
- num_gpus_per_machine,
- machine_rank,
- backend,
- dist_url,
- args,
- timeout=DEFAULT_TIMEOUT,
-):
- assert (
- torch.cuda.is_available()
- ), "cuda is not available. Please check your installation."
- global_rank = machine_rank * num_gpus_per_machine + local_rank
- logger.info("Rank {} initialization finished.".format(global_rank))
- try:
- dist.init_process_group(
- backend=backend,
- init_method=dist_url,
- world_size=world_size,
- rank=global_rank,
- timeout=timeout,
- )
- except Exception:
- logger.error("Process group URL: {}".format(dist_url))
- raise
-
- # Setup the local process group (which contains ranks within the same machine)
- assert comm._LOCAL_PROCESS_GROUP is None
- num_machines = world_size // num_gpus_per_machine
- for i in range(num_machines):
- ranks_on_i = list(
- range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine)
- )
- pg = dist.new_group(ranks_on_i)
- if i == machine_rank:
- comm._LOCAL_PROCESS_GROUP = pg
-
- # synchronize is needed here to prevent a possible timeout after calling init_process_group
- # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
- comm.synchronize()
-
- assert num_gpus_per_machine <= torch.cuda.device_count()
- torch.cuda.set_device(local_rank)
-
- main_func(*args)
+ barrier()
+
+ deinit_distributed()
+ else:
+ main_func(*args)
\ No newline at end of file
diff --git a/yolox/core/trainer.py b/yolox/core/trainer.py
index 8f8016e57..444ec8fd4 100644
--- a/yolox/core/trainer.py
+++ b/yolox/core/trainer.py
@@ -6,6 +6,7 @@
import time
from loguru import logger
+from yolox.utils.device_utils import get_current_device, get_current_device_type, get_xla_model
import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.tensorboard import SummaryWriter
@@ -34,6 +35,8 @@
)
+xm = get_xla_model()
+
class Trainer:
def __init__(self, exp: Exp, args):
# init function only defines some basic attr, other attrs like model, optimizer are built in
@@ -44,11 +47,20 @@ def __init__(self, exp: Exp, args):
# training related attr
self.max_epoch = exp.max_epoch
self.amp_training = args.fp16
- self.scaler = torch.cuda.amp.GradScaler(enabled=args.fp16)
+ if hasattr(torch, "GradScaler"):
+ self.scaler = torch.GradScaler(get_current_device_type(), enabled=args.fp16)
+ elif xm:
+ from torch_xla.amp import GradScaler
+ self.scaler = GradScaler(enabled=args.fp16)
+ elif torch.cuda.is_available():
+ self.scaler = torch.cuda.amp.GradScaler(enabled=args.fp16)
+ else:
+ self.scaler = torch.cpu.amp.GradScaler(enabled=args.fp16)
+
self.is_distributed = get_world_size() > 1
self.rank = get_rank()
self.local_rank = get_local_rank()
- self.device = "cuda:{}".format(self.local_rank)
+ self.device = get_current_device()
self.use_model_ema = exp.ema
self.save_history_ckpt = exp.save_history_ckpt
@@ -94,25 +106,39 @@ def train_in_iter(self):
self.after_iter()
def train_one_iter(self):
+
iter_start_time = time.time()
-
+ logger.debug(f"iter start: {time.time()}")
inps, targets = self.prefetcher.next()
inps = inps.to(self.data_type)
targets = targets.to(self.data_type)
targets.requires_grad = False
inps, targets = self.exp.preprocess(inps, targets, self.input_size)
data_end_time = time.time()
-
- with torch.cuda.amp.autocast(enabled=self.amp_training):
+ logger.debug(f"input ready: {data_end_time}")
+
+ if xm:
+ inps = inps.to(device=self.device)
+ targets = targets.to(device=self.device)
+ logger.debug(f"input shape: {inps.shape}")
+
+ logger.debug(f"forward: {time.time()}")
+ with torch.autocast(get_current_device_type(), enabled=self.amp_training):
outputs = self.model(inps, targets)
loss = outputs["total_loss"]
-
+ if xm:
+ loss = loss.to(device=self.device)
self.optimizer.zero_grad()
- self.scaler.scale(loss).backward()
+ scaled_loss = self.scaler.scale(loss)
+ logger.debug(f"backward: {time.time()}")
+ scaled_loss.backward()
self.scaler.step(self.optimizer)
self.scaler.update()
-
+ if xm:
+ xm.mark_step()
+ logger.debug(f"optimizer step: {time.time()}")
+
if self.use_model_ema:
self.ema_model.update(self.model)
@@ -127,13 +153,17 @@ def train_one_iter(self):
lr=lr,
**outputs,
)
+ if xm:
+ xm.mark_step()
+ logger.debug(f"iter end: {time.time()}")
+
+
def before_train(self):
logger.info("args: {}".format(self.args))
logger.info("exp value:\n{}".format(self.exp))
# model related init
- torch.cuda.set_device(self.local_rank)
model = self.exp.get_model()
logger.info(
"Model Summary: {}".format(get_model_info(model, self.exp.test_size))
@@ -166,7 +196,11 @@ def before_train(self):
occupy_mem(self.local_rank)
if self.is_distributed:
- model = DDP(model, device_ids=[self.local_rank], broadcast_buffers=False)
+ if xm:
+ xm.mark_step()
+ model = DDP(model, broadcast_buffers=False, gradient_as_bucket_view=True)
+ else:
+ model = DDP(model, broadcast_buffers=False)
if self.use_model_ema:
self.ema_model = ModelEMA(model, 0.9998)
@@ -265,7 +299,10 @@ def after_iter(self):
["{}: {:.3f}s".format(k, v.avg) for k, v in time_meter.items()]
)
- mem_str = "gpu mem: {:.0f}Mb, mem: {:.1f}Gb".format(gpu_mem_usage(), mem_usage())
+ if torch.cuda.is_available():
+ mem_str = "gpu mem: {:.0f}Mb, mem: {:.1f}Gb".format(gpu_mem_usage(), mem_usage())
+ else:
+ mem_str = "mem: {:.1f}Gb".format(mem_usage())
logger.info(
"{}, {}, {}, {}, lr: {:.3e}".format(
@@ -299,7 +336,7 @@ def after_iter(self):
self.meter.clear_meters()
# random resizing
- if (self.progress_in_iter + 1) % 10 == 0:
+ if (self.progress_in_iter + 1) % self.exp.random_size_interval == 0:
self.input_size = self.exp.random_resize(
self.train_loader, self.epoch, self.rank, self.is_distributed
)
@@ -381,6 +418,7 @@ def evaluate_and_save_model(self):
logger.info("\n" + summary)
synchronize()
+ logger.info(f"Save checkpoints start: {time.time()}")
self.save_ckpt("last_epoch", update_best_ckpt, ap=ap50_95)
if self.save_history_ckpt:
self.save_ckpt(f"epoch_{self.epoch + 1}", ap=ap50_95)
@@ -396,6 +434,8 @@ def evaluate_and_save_model(self):
self.mlflow_logger.save_checkpoints(self.args, self.exp, self.file_name, self.epoch,
metadata, update_best_ckpt)
+ logger.info(f"Save checkpoints end: {time.time()}")
+
def save_ckpt(self, ckpt_name, update_best_ckpt=False, ap=None):
if self.rank == 0:
save_model = self.ema_model.ema if self.use_model_ema else self.model
diff --git a/yolox/data/data_prefetcher.py b/yolox/data/data_prefetcher.py
index a118cf4e4..c9b299a18 100644
--- a/yolox/data/data_prefetcher.py
+++ b/yolox/data/data_prefetcher.py
@@ -4,7 +4,6 @@
import torch
-
class DataPrefetcher:
"""
DataPrefetcher is inspired by code of following file:
@@ -15,9 +14,11 @@ class DataPrefetcher:
def __init__(self, loader):
self.loader = iter(loader)
- self.stream = torch.cuda.Stream()
- self.input_cuda = self._input_cuda_for_image
- self.record_stream = DataPrefetcher._record_stream_for_image
+
+ if torch.cuda.is_available():
+ self.stream = torch.cuda.Stream()
+ self.input_cuda = self._input_cuda_for_image
+ self.record_stream = DataPrefetcher._record_stream_for_image
self.preload()
def preload(self):
@@ -28,18 +29,26 @@ def preload(self):
self.next_target = None
return
- with torch.cuda.stream(self.stream):
- self.input_cuda()
- self.next_target = self.next_target.cuda(non_blocking=True)
+ if torch.cuda.is_available():
+ with torch.cuda.stream(self.stream):
+ self.input_cuda()
+ self.next_target = self.next_target.cuda(non_blocking=True)
+ else:
+ self.next_input = self.next_input
+ self.next_target = self.next_target
def next(self):
- torch.cuda.current_stream().wait_stream(self.stream)
- input = self.next_input
- target = self.next_target
- if input is not None:
- self.record_stream(input)
- if target is not None:
- target.record_stream(torch.cuda.current_stream())
+ if torch.cuda.is_available():
+ torch.cuda.current_stream().wait_stream(self.stream)
+ input = self.next_input
+ target = self.next_target
+ if input is not None:
+ self.record_stream(input)
+ if target is not None:
+ target.record_stream(torch.cuda.current_stream())
+ else:
+ input = self.next_input
+ target = self.next_target
self.preload()
return input, target
diff --git a/yolox/evaluators/coco_evaluator.py b/yolox/evaluators/coco_evaluator.py
index e218c7456..f76a25932 100644
--- a/yolox/evaluators/coco_evaluator.py
+++ b/yolox/evaluators/coco_evaluator.py
@@ -26,7 +26,10 @@
time_synchronized,
xyxy2xywh
)
+from yolox.utils.device_utils import get_current_device, get_xla_model
+from yolox.utils.dist import _get_global_gloo_group, get_rank
+xm = get_xla_model()
def per_class_AR_table(coco_eval, class_names=COCO_CLASSES, headers=["class", "AR"], colums=6):
per_class_AR = {}
@@ -132,14 +135,14 @@ def evaluate(
summary (sr): summary info of evaluation.
"""
# TODO half to amp_test
- tensor_type = torch.cuda.HalfTensor if half else torch.cuda.FloatTensor
+ tensor_type = torch.float16 if half else torch.float32
model = model.eval()
if half:
model = model.half()
ids = []
data_list = []
output_data = defaultdict()
- progress_bar = tqdm if is_main_process() else iter
+ progress_bar = tqdm # if is_main_process() else iter
inference_time = 0
nms_time = 0
@@ -155,11 +158,12 @@ def evaluate(
model(x)
model = model_trt
+ model = model.to(device=get_current_device())
for cur_iter, (imgs, _, info_imgs, ids) in enumerate(
progress_bar(self.dataloader)
):
with torch.no_grad():
- imgs = imgs.type(tensor_type)
+ imgs = imgs.type(tensor_type).to(device=get_current_device())
# skip the last iters since batchsize might be not enough for batch inference
is_time_record = cur_iter < len(self.dataloader) - 1
@@ -185,17 +189,23 @@ def evaluate(
outputs, info_imgs, ids, return_outputs=True)
data_list.extend(data_list_elem)
output_data.update(image_wise_data)
-
- statistics = torch.cuda.FloatTensor([inference_time, nms_time, n_samples])
+
+ statistics = torch.tensor([inference_time, nms_time, n_samples],
+ dtype=torch.float32,
+ device=get_current_device())
if distributed:
# different process/device might have different speed,
# to make sure the process will not be stucked, sync func is used here.
- synchronize()
- data_list = gather(data_list, dst=0)
- output_data = gather(output_data, dst=0)
+ group = _get_global_gloo_group()
+ synchronize(group=group)
+ data_list = gather(data_list, dst=0, group=group)
+ output_data = gather(output_data, dst=0, group=group)
data_list = list(itertools.chain(*data_list))
output_data = dict(ChainMap(*output_data))
- torch.distributed.reduce(statistics, dst=0)
+ if xm:
+ torch.distributed.all_reduce(statistics)
+ else:
+ torch.distributed.reduce(statistics, dst=0)
eval_results = self.evaluate_prediction(data_list, statistics)
synchronize()
@@ -256,7 +266,10 @@ def evaluate_prediction(self, data_dict, statistics):
if not is_main_process():
return 0, 0, None
- logger.info("Evaluate in main process...")
+ if xm:
+ xm.mark_step()
+
+ logger.info(f"Evaluate in main process: data_dict length: {len(data_dict)}, statistics: {statistics}")
annType = ["segm", "bbox", "keypoints"]
@@ -278,7 +291,7 @@ def evaluate_prediction(self, data_dict, statistics):
)
info = time_info + "\n"
-
+ logger.info(f"time_info: {info} {time.time()}")
# Evaluate the Dt (detection) json comparing with the ground truth
if len(data_dict) > 0:
cocoGt = self.dataloader.dataset.coco
@@ -289,6 +302,7 @@ def evaluate_prediction(self, data_dict, statistics):
else:
_, tmp = tempfile.mkstemp()
json.dump(data_dict, open(tmp, "w"))
+ logger.info(f"load eval data: {tmp} {time.time()}")
cocoDt = cocoGt.loadRes(tmp)
try:
from yolox.layers import COCOeval_opt as COCOeval
@@ -298,10 +312,13 @@ def evaluate_prediction(self, data_dict, statistics):
logger.warning("Use standard COCOeval.")
cocoEval = COCOeval(cocoGt, cocoDt, annType[1])
+ logger.info(f"evaluate: {time.time()}")
cocoEval.evaluate()
+ logger.info(f"accumulate: {time.time()}")
cocoEval.accumulate()
redirect_string = io.StringIO()
with contextlib.redirect_stdout(redirect_string):
+ logger.info(f"summarize: {time.time()}")
cocoEval.summarize()
info += redirect_string.getvalue()
cat_ids = list(cocoGt.cats.keys())
@@ -312,6 +329,7 @@ def evaluate_prediction(self, data_dict, statistics):
if self.per_class_AR:
AR_table = per_class_AR_table(cocoEval, class_names=cat_names)
info += "per class AR:\n" + AR_table + "\n"
+ logger.info(f"info completed: {time.time()}")
return cocoEval.stats[0], cocoEval.stats[1], info
else:
return 0, 0, info
diff --git a/yolox/evaluators/voc_evaluator.py b/yolox/evaluators/voc_evaluator.py
index 094df3d69..619f35c5b 100644
--- a/yolox/evaluators/voc_evaluator.py
+++ b/yolox/evaluators/voc_evaluator.py
@@ -14,7 +14,9 @@
import torch
from yolox.utils import gather, is_main_process, postprocess, synchronize, time_synchronized
+from yolox.utils.device_utils import get_current_device, get_xla_model
+xm = get_xla_model()
class VOCEvaluator:
"""
@@ -57,7 +59,7 @@ def evaluate(
summary (sr): summary info of evaluation.
"""
# TODO half to amp_test
- tensor_type = torch.cuda.HalfTensor if half else torch.cuda.FloatTensor
+ tensor_type = torch.float16 if half else torch.float32
model = model.eval()
if half:
model = model.half()
@@ -81,7 +83,7 @@ def evaluate(
for cur_iter, (imgs, _, info_imgs, ids) in enumerate(progress_bar(self.dataloader)):
with torch.no_grad():
- imgs = imgs.type(tensor_type)
+ imgs = imgs.type(tensor_type).to(device=get_current_device())
# skip the last iters since batchsize might be not enough for batch inference
is_time_record = cur_iter < len(self.dataloader) - 1
@@ -105,11 +107,16 @@ def evaluate(
data_list.update(self.convert_to_voc_format(outputs, info_imgs, ids))
- statistics = torch.cuda.FloatTensor([inference_time, nms_time, n_samples])
+ statistics = torch.tensor([inference_time, nms_time, n_samples],
+ dtype=torch.float32,
+ device=get_current_device())
if distributed:
data_list = gather(data_list, dst=0)
data_list = ChainMap(*data_list)
- torch.distributed.reduce(statistics, dst=0)
+ if xm:
+ torch.distributed.all_reduce(statistics)
+ else:
+ torch.distributed.reduce(statistics, dst=0)
eval_results = self.evaluate_prediction(data_list, statistics)
synchronize()
diff --git a/yolox/exp/base_exp.py b/yolox/exp/base_exp.py
index 7ccfec5c2..85157750d 100644
--- a/yolox/exp/base_exp.py
+++ b/yolox/exp/base_exp.py
@@ -2,6 +2,7 @@
# Copyright (c) Megvii Inc. All rights reserved.
import ast
+import os
import pprint
from abc import ABCMeta, abstractmethod
from typing import Dict, List, Tuple
@@ -17,8 +18,8 @@ class BaseExp(metaclass=ABCMeta):
"""Basic class for any experiment."""
def __init__(self):
- self.seed = None
- self.output_dir = "./YOLOX_outputs"
+ self.seed = 2024
+ self.output_dir = os.getenv("YOLOX_OUTPUT_DIR", "./YOLOX_outputs")
self.print_interval = 100
self.eval_interval = 10
self.dataset = None
diff --git a/yolox/exp/yolox_base.py b/yolox/exp/yolox_base.py
index 82e93c21b..e9707979f 100644
--- a/yolox/exp/yolox_base.py
+++ b/yolox/exp/yolox_base.py
@@ -4,14 +4,18 @@
import os
import random
+from yolox.utils.device_utils import get_current_device, get_xla_model
import torch
import torch.distributed as dist
import torch.nn as nn
+from yolox.utils.dist import barrier, synchronize
+
from .base_exp import BaseExp
__all__ = ["Exp", "check_exp_value"]
+xm = get_xla_model()
class Exp(BaseExp):
def __init__(self):
@@ -108,6 +112,8 @@ def __init__(self):
# nms threshold
self.nmsthre = 0.65
+ self.random_size_interval = 10
+
def get_model(self):
from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead
@@ -222,7 +228,9 @@ def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img: s
return train_loader
def random_resize(self, data_loader, epoch, rank, is_distributed):
- tensor = torch.LongTensor(2).cuda()
+
+ device = get_current_device()
+ tensor = torch.LongTensor(2).to(device=device)
if rank == 0:
size_factor = self.input_size[1] * 1.0 / self.input_size[0]
@@ -236,7 +244,7 @@ def random_resize(self, data_loader, epoch, rank, is_distributed):
tensor[1] = size[1]
if is_distributed:
- dist.barrier()
+ barrier()
dist.broadcast(tensor, 0)
input_size = (tensor[0].item(), tensor[1].item())
diff --git a/yolox/layers/__init__.py b/yolox/layers/__init__.py
index fc9cf5138..6c51e080d 100644
--- a/yolox/layers/__init__.py
+++ b/yolox/layers/__init__.py
@@ -5,7 +5,10 @@
# import torch first to make jit op work without `ImportError of libc10.so`
import torch # noqa
-from .jit_ops import FastCOCOEvalOp, JitOp
+try:
+ from .jit_ops import FastCOCOEvalOp, JitOp
+except ImportError:
+ pass
try:
from .fast_coco_eval_api import COCOeval_opt
diff --git a/yolox/layers/fast_coco_eval_api.py b/yolox/layers/fast_coco_eval_api.py
index 5f3aeb551..a6b9f6b77 100644
--- a/yolox/layers/fast_coco_eval_api.py
+++ b/yolox/layers/fast_coco_eval_api.py
@@ -11,6 +11,12 @@
import numpy as np
from pycocotools.cocoeval import COCOeval
+import torch
+
+# Refuse to import this module on hosts without CUDA so that callers guarding
+# the import with `except ImportError` can fall back. An explicit check is used
+# instead of `assert`: assertions are stripped under `python -O`, which would
+# silently remove the guard.
+if not torch.cuda.is_available():
+    raise ImportError("yolox.layers.fast_coco_eval_api requires CUDA to be available")
+
from .jit_ops import FastCOCOEvalOp
diff --git a/yolox/layers/jit_ops.py b/yolox/layers/jit_ops.py
index 0fdac4de2..0165640fc 100644
--- a/yolox/layers/jit_ops.py
+++ b/yolox/layers/jit_ops.py
@@ -10,6 +10,11 @@
__all__ = ["JitOp", "FastCOCOEvalOp"]
+import torch
+
+# Refuse to import this module on hosts without CUDA so that callers guarding
+# the import with `except ImportError` can fall back. An explicit check is used
+# instead of `assert`: assertions are stripped under `python -O`, which would
+# silently remove the guard.
+if not torch.cuda.is_available():
+    raise ImportError("yolox.layers.jit_ops requires CUDA to be available")
class JitOp:
"""
diff --git a/yolox/models/build.py b/yolox/models/build.py
index 8edc87de9..5344b4bb0 100644
--- a/yolox/models/build.py
+++ b/yolox/models/build.py
@@ -1,6 +1,7 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
+from yolox.utils.device_utils import get_current_device
import torch
from torch import nn
from torch.hub import load_state_dict_from_url
@@ -50,8 +51,7 @@ def create_yolox_model(name: str, pretrained: bool = True, num_classes: int = 80
from yolox.exp import get_exp, Exp
if device is None:
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
- device = torch.device(device)
+ device = get_current_device()
assert name in _CKPT_FULL_PATH or name == "yolox_custom", \
f"user should use one of value in {_CKPT_FULL_PATH.keys()} or \"yolox_custom\""
@@ -75,7 +75,7 @@ def create_yolox_model(name: str, pretrained: bool = True, num_classes: int = 80
ckpt = ckpt["model"]
yolox_model.load_state_dict(ckpt)
- yolox_model.to(device)
+ yolox_model.to(device=device)
return yolox_model
diff --git a/yolox/models/darknet.py b/yolox/models/darknet.py
index b3e053f16..0c1bfc634 100644
--- a/yolox/models/darknet.py
+++ b/yolox/models/darknet.py
@@ -4,8 +4,11 @@
from torch import nn
+from yolox.utils.device_utils import get_xla_model
+
from .network_blocks import BaseConv, CSPLayer, DWConv, Focus, ResLayer, SPPBottleneck
+xm = get_xla_model()
class Darknet(nn.Module):
# number of blocks from dark2 to dark5.
@@ -80,6 +83,10 @@ def make_spp_block(self, filters_list, in_filters):
return m
def forward(self, x):
+
+ if xm:
+ xm.mark_step()
+
outputs = {}
x = self.stem(x)
outputs["stem"] = x
@@ -91,6 +98,10 @@ def forward(self, x):
outputs["dark4"] = x
x = self.dark5(x)
outputs["dark5"] = x
+
+ if xm:
+ xm.mark_step()
+
return {k: v for k, v in outputs.items() if k in self.out_features}
@@ -165,6 +176,9 @@ def __init__(
)
def forward(self, x):
+ if xm:
+ xm.mark_step()
+
outputs = {}
x = self.stem(x)
outputs["stem"] = x
@@ -176,4 +190,8 @@ def forward(self, x):
outputs["dark4"] = x
x = self.dark5(x)
outputs["dark5"] = x
+
+ if xm:
+ xm.mark_step()
+
return {k: v for k, v in outputs.items() if k in self.out_features}
diff --git a/yolox/models/losses.py b/yolox/models/losses.py
index 77b4d8ef7..7c414c76a 100644
--- a/yolox/models/losses.py
+++ b/yolox/models/losses.py
@@ -2,9 +2,14 @@
# -*- encoding: utf-8 -*-
# Copyright (c) Megvii Inc. All rights reserved.
+import time
+from loguru import logger
import torch
import torch.nn as nn
+from yolox.utils.device_utils import get_xla_model, parse_dtype
+
+xm = get_xla_model()
class IOUloss(nn.Module):
def __init__(self, reduction="none", loss_type="iou"):
@@ -12,8 +17,8 @@ def __init__(self, reduction="none", loss_type="iou"):
self.reduction = reduction
self.loss_type = loss_type
- def forward(self, pred, target):
- assert pred.shape[0] == target.shape[0]
+ def forward(self, pred, target):
+ assert pred.shape[0] == target.shape[0], f"pred shape: {pred.shape} target shape: {target.shape}"
pred = pred.view(-1, 4)
target = target.view(-1, 4)
@@ -27,7 +32,8 @@ def forward(self, pred, target):
area_p = torch.prod(pred[:, 2:], 1)
area_g = torch.prod(target[:, 2:], 1)
- en = (tl < br).type(tl.type()).prod(dim=1)
+ device, dtype = parse_dtype(tl.type())
+ en = (tl < br).to(device=device, dtype=dtype).prod(dim=1)
area_i = torch.prod(br - tl, 1) * en
area_u = area_p + area_g - area_i
iou = (area_i) / (area_u + 1e-16)
diff --git a/yolox/models/yolo_fpn.py b/yolox/models/yolo_fpn.py
index 224271f59..88f06ec84 100644
--- a/yolox/models/yolo_fpn.py
+++ b/yolox/models/yolo_fpn.py
@@ -5,10 +5,14 @@
import torch
import torch.nn as nn
+from yolox.utils.device_utils import get_xla_model
+
from .darknet import Darknet
from .network_blocks import BaseConv
+xm = get_xla_model()
+
class YOLOFPN(nn.Module):
"""
YOLOFPN module. Darknet 53 is the default backbone of this model.
@@ -65,6 +69,9 @@ def forward(self, inputs):
Tuple[Tensor]: FPN output features..
"""
# backbone
+ if xm:
+ xm.mark_step()
+
out_features = self.backbone(inputs)
x2, x1, x0 = [out_features[f] for f in self.in_features]
@@ -81,4 +88,8 @@ def forward(self, inputs):
out_dark3 = self.out2(x2_in)
outputs = (out_dark3, out_dark4, x0)
+
+ if xm:
+ xm.mark_step()
+
return outputs
diff --git a/yolox/models/yolo_head.py b/yolox/models/yolo_head.py
index 3e51768ee..d218266ab 100644
--- a/yolox/models/yolo_head.py
+++ b/yolox/models/yolo_head.py
@@ -3,6 +3,7 @@
# Copyright (c) Megvii Inc. All rights reserved.
import math
+import time
from loguru import logger
import torch
@@ -10,10 +11,12 @@
import torch.nn.functional as F
from yolox.utils import bboxes_iou, cxcywh2xyxy, meshgrid, visualize_assign
+from yolox.utils.device_utils import get_current_device, get_current_device_type, get_xla_model, parse_dtype
from .losses import IOUloss
from .network_blocks import BaseConv, DWConv
+xm = get_xla_model()
class YOLOXHead(nn.Module):
def __init__(
@@ -152,7 +155,7 @@ def forward(self, xin, labels=None, imgs=None):
x = self.stems[k](x)
cls_x = x
reg_x = x
-
+
cls_feat = cls_conv(cls_x)
cls_output = self.cls_preds[k](cls_feat)
@@ -187,12 +190,13 @@ def forward(self, xin, labels=None, imgs=None):
output = torch.cat(
[reg_output, obj_output.sigmoid(), cls_output.sigmoid()], 1
)
+ if xm:
+ xm.mark_step()
outputs.append(output)
-
+
if self.training:
return self.get_losses(
- imgs,
x_shifts,
y_shifts,
expanded_strides,
@@ -213,6 +217,7 @@ def forward(self, xin, labels=None, imgs=None):
return outputs
def get_output_and_grid(self, output, k, stride, dtype):
+ device, dtype = parse_dtype(dtype)
grid = self.grids[k]
batch_size = output.shape[0]
@@ -220,7 +225,7 @@ def get_output_and_grid(self, output, k, stride, dtype):
hsize, wsize = output.shape[-2:]
if grid.shape[2:4] != output.shape[2:4]:
yv, xv = meshgrid([torch.arange(hsize), torch.arange(wsize)])
- grid = torch.stack((xv, yv), 2).view(1, 1, hsize, wsize, 2).type(dtype)
+ grid = torch.stack((xv, yv), 2).view(1, 1, hsize, wsize, 2).to(device=device, dtype=dtype)
self.grids[k] = grid
output = output.view(batch_size, 1, n_ch, hsize, wsize)
@@ -233,6 +238,8 @@ def get_output_and_grid(self, output, k, stride, dtype):
return output, grid
def decode_outputs(self, outputs, dtype):
+ device, dtype = parse_dtype(dtype)
+
grids = []
strides = []
for (hsize, wsize), stride in zip(self.hw, self.strides):
@@ -242,9 +249,9 @@ def decode_outputs(self, outputs, dtype):
shape = grid.shape[:2]
strides.append(torch.full((*shape, 1), stride))
- grids = torch.cat(grids, dim=1).type(dtype)
- strides = torch.cat(strides, dim=1).type(dtype)
-
+ grids = torch.cat(grids, dim=1).to(device=device, dtype=dtype)
+ strides = torch.cat(strides, dim=1).to(device=device, dtype=dtype)
+
outputs = torch.cat([
(outputs[..., 0:2] + grids) * strides,
torch.exp(outputs[..., 2:4]) * strides,
@@ -254,7 +261,6 @@ def decode_outputs(self, outputs, dtype):
def get_losses(
self,
- imgs,
x_shifts,
y_shifts,
expanded_strides,
@@ -263,6 +269,16 @@ def get_losses(
origin_preds,
dtype,
):
+ if xm:
+ xm.mark_step()
+ outputs = outputs.cpu()
+ labels = labels.cpu()
+ x_shifts = [ t.cpu() for t in x_shifts]
+ y_shifts = [ t.cpu() for t in y_shifts]
+ expanded_strides = [ t.cpu() for t in expanded_strides]
+ if self.use_l1:
+ origin_preds = [ t.cpu() for t in origin_preds]
+
bbox_preds = outputs[:, :, :4] # [batch, n_anchors_all, 4]
obj_preds = outputs[:, :, 4:5] # [batch, n_anchors_all, 1]
cls_preds = outputs[:, :, 5:] # [batch, n_anchors_all, n_cls]
@@ -285,7 +301,7 @@ def get_losses(
num_fg = 0.0
num_gts = 0.0
-
+
for batch_idx in range(outputs.shape[0]):
num_gt = int(nlabel[batch_idx])
num_gts += num_gt
@@ -299,7 +315,7 @@ def get_losses(
gt_bboxes_per_image = labels[batch_idx, :num_gt, 1:5]
gt_classes = labels[batch_idx, :num_gt, 0]
bboxes_preds_per_image = bbox_preds[batch_idx]
-
+
try:
(
gt_matched_classes,
@@ -318,10 +334,10 @@ def get_losses(
y_shifts,
cls_preds,
obj_preds,
+ mode=get_current_device_type()
)
- except RuntimeError as e:
- # TODO: the string might change, consider a better way
- if "CUDA out of memory. " not in str(e):
+ except RuntimeError as e:
+ if xm or "CUDA out of memory. " not in str(e):
raise # RuntimeError might not caused by CUDA OOM
logger.error(
@@ -329,7 +345,10 @@ def get_losses(
CPU mode is applied in this batch. If you want to avoid this issue, \
try to reduce the batch size or image size."
)
- torch.cuda.empty_cache()
+
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+
(
gt_matched_classes,
fg_mask,
@@ -350,7 +369,8 @@ def get_losses(
"cpu",
)
- torch.cuda.empty_cache()
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
num_fg += num_fg_img
cls_target = F.one_hot(
@@ -432,11 +452,10 @@ def get_assignments(
y_shifts,
cls_preds,
obj_preds,
- mode="gpu",
- ):
-
- if mode == "cpu":
- print("-----------Using CPU for the Current Batch-------------")
+ mode=None,
+ ):
+
+ if mode == "cpu" or mode =="xla":
gt_bboxes_per_image = gt_bboxes_per_image.cpu().float()
bboxes_preds_per_image = bboxes_preds_per_image.cpu().float()
gt_classes = gt_classes.cpu().float()
@@ -456,10 +475,10 @@ def get_assignments(
obj_preds_ = obj_preds[batch_idx][fg_mask]
num_in_boxes_anchor = bboxes_preds_per_image.shape[0]
- if mode == "cpu":
+ if mode == "cpu" or mode =="xla":
gt_bboxes_per_image = gt_bboxes_per_image.cpu()
bboxes_preds_per_image = bboxes_preds_per_image.cpu()
-
+
pair_wise_ious = bboxes_iou(gt_bboxes_per_image, bboxes_preds_per_image, False)
gt_cls_per_image = (
@@ -468,10 +487,10 @@ def get_assignments(
)
pair_wise_ious_loss = -torch.log(pair_wise_ious + 1e-8)
- if mode == "cpu":
+ if mode == "cpu" or mode =="xla":
cls_preds_, obj_preds_ = cls_preds_.cpu(), obj_preds_.cpu()
- with torch.cuda.amp.autocast(enabled=False):
+ with torch.autocast(get_current_device_type(), enabled=False):
cls_preds_ = (
cls_preds_.float().sigmoid_() * obj_preds_.float().sigmoid_()
).sqrt()
@@ -496,11 +515,13 @@ def get_assignments(
) = self.simota_matching(cost, pair_wise_ious, gt_classes, num_gt, fg_mask)
del pair_wise_cls_loss, cost, pair_wise_ious, pair_wise_ious_loss
+ # do not move back for mode == "xla"
if mode == "cpu":
- gt_matched_classes = gt_matched_classes.cuda()
- fg_mask = fg_mask.cuda()
- pred_ious_this_matching = pred_ious_this_matching.cuda()
- matched_gt_inds = matched_gt_inds.cuda()
+ device = get_current_device()
+ gt_matched_classes = gt_matched_classes.to(device=device)
+ fg_mask = fg_mask.to(device=device)
+ pred_ious_this_matching = pred_ious_this_matching.to(device=device)
+ matched_gt_inds = matched_gt_inds.to(device=device)
return (
gt_matched_classes,
@@ -542,7 +563,7 @@ def get_geometry_constraint(
return anchor_filter, geometry_relation
def simota_matching(self, cost, pair_wise_ious, gt_classes, num_gt, fg_mask):
- matching_matrix = torch.zeros_like(cost, dtype=torch.uint8)
+ matching_matrix = torch.zeros_like(cost, dtype=torch.uint8).to(device=pair_wise_ious.device)
n_candidate_k = min(10, pair_wise_ious.size(1))
topk_ious, _ = torch.topk(pair_wise_ious, n_candidate_k, dim=1)
@@ -556,7 +577,7 @@ def simota_matching(self, cost, pair_wise_ious, gt_classes, num_gt, fg_mask):
del topk_ious, dynamic_ks, pos_idx
anchor_matching_gt = matching_matrix.sum(0)
- # deal with the case that one anchor matches multiple ground-truths
+ # deal with the case that one anchor matches multiple ground-truths
if anchor_matching_gt.max() > 1:
multiple_match_mask = anchor_matching_gt > 1
_, cost_argmin = torch.min(cost[:, multiple_match_mask], dim=0)
diff --git a/yolox/models/yolo_pafpn.py b/yolox/models/yolo_pafpn.py
index 4c4e18a5c..99f5985f1 100644
--- a/yolox/models/yolo_pafpn.py
+++ b/yolox/models/yolo_pafpn.py
@@ -5,9 +5,12 @@
import torch
import torch.nn as nn
+from yolox.utils.device_utils import get_xla_model
+
from .darknet import CSPDarknet
from .network_blocks import BaseConv, CSPLayer, DWConv
+xm = get_xla_model()
class YOLOPAFPN(nn.Module):
"""
@@ -90,6 +93,9 @@ def forward(self, input):
"""
# backbone
+ if xm:
+ xm.mark_step()
+
out_features = self.backbone(input)
features = [out_features[f] for f in self.in_features]
[x2, x1, x0] = features
@@ -113,4 +119,8 @@ def forward(self, input):
pan_out0 = self.C3_n4(p_out0) # 1024->1024/32
outputs = (pan_out2, pan_out1, pan_out0)
+
+ if xm:
+ xm.mark_step()
+
return outputs
diff --git a/yolox/models/yolox.py b/yolox/models/yolox.py
index 744ceea81..42bc0a034 100644
--- a/yolox/models/yolox.py
+++ b/yolox/models/yolox.py
@@ -4,9 +4,12 @@
import torch.nn as nn
+from yolox.utils.device_utils import get_xla_model
+
from .yolo_head import YOLOXHead
from .yolo_pafpn import YOLOPAFPN
+xm = get_xla_model()
class YOLOX(nn.Module):
"""
@@ -27,6 +30,10 @@ def __init__(self, backbone=None, head=None):
def forward(self, x, targets=None):
# fpn output content features of [dark3, dark4, dark5]
+
+ if xm:
+ xm.mark_step()
+
fpn_outs = self.backbone(x)
if self.training:
@@ -45,6 +52,9 @@ def forward(self, x, targets=None):
else:
outputs = self.head(fpn_outs)
+ if xm:
+ xm.mark_step()
+
return outputs
def visualize(self, x, targets, save_prefix="assign_vis_"):
diff --git a/yolox/utils/allreduce_norm.py b/yolox/utils/allreduce_norm.py
index 142c76c78..4adf6dc65 100644
--- a/yolox/utils/allreduce_norm.py
+++ b/yolox/utils/allreduce_norm.py
@@ -5,11 +5,12 @@
import pickle
from collections import OrderedDict
+from yolox.utils.device_utils import get_current_device
import torch
from torch import distributed as dist
from torch import nn
-from .dist import _get_global_gloo_group, get_world_size
+from yolox.utils.dist import _get_global_gloo_group, get_world_size
ASYNC_NORM = (
nn.BatchNorm1d,
@@ -38,7 +39,8 @@ def get_async_norm_states(module):
return async_norm_states
-def pyobj2tensor(pyobj, device="cuda"):
-    """serialize picklable python object to tensor"""
+def pyobj2tensor(pyobj, device=None):
+    """Serialize a picklable Python object into a byte tensor.
+
+    Args:
+        pyobj: any object accepted by ``pickle.dumps``.
+        device: device the resulting tensor is moved to. When ``None``
+            (the default) the device returned by ``get_current_device()``
+            is used.
+
+    Returns:
+        A ``torch.ByteTensor`` holding the pickled bytes of ``pyobj``.
+    """
+    # Resolve the device *after* the docstring so the string above stays the
+    # function's ``__doc__`` instead of becoming a no-op expression statement.
+    if device is None:
+        device = get_current_device()
     storage = torch.ByteStorage.from_buffer(pickle.dumps(pyobj))
     return torch.ByteTensor(storage).to(device=device)
@@ -83,7 +85,7 @@ def all_reduce(py_dict, op="sum", group=None):
tensor_numels = [py_dict[k].numel() for k in py_key]
flatten_tensor = torch.cat([py_dict[k].flatten() for k in py_key])
- dist.all_reduce(flatten_tensor, op=_get_reduce_op(op))
+ dist.all_reduce(flatten_tensor, op=_get_reduce_op(op), group=group)
if op == "mean":
flatten_tensor /= world_size
diff --git a/yolox/utils/boxes.py b/yolox/utils/boxes.py
index f71e8d90b..07b27cf8e 100644
--- a/yolox/utils/boxes.py
+++ b/yolox/utils/boxes.py
@@ -6,6 +6,8 @@
import torch
import torchvision
+from yolox.utils.device_utils import parse_dtype
+
__all__ = [
"filter_box",
"postprocess",
@@ -97,7 +99,8 @@ def bboxes_iou(bboxes_a, bboxes_b, xyxy=True):
area_a = torch.prod(bboxes_a[:, 2:], 1)
area_b = torch.prod(bboxes_b[:, 2:], 1)
- en = (tl < br).type(tl.type()).prod(dim=2)
+ device, dtype = parse_dtype(tl.type())
+ en = (tl < br).to(device=device, dtype=dtype).prod(dim=2)
area_i = torch.prod(br - tl, 2) * en # * ((tl < br).all())
return area_i / (area_a[:, None] + area_b - area_i)
diff --git a/yolox/utils/device_utils.py b/yolox/utils/device_utils.py
new file mode 100644
index 000000000..ab419d3e2
--- /dev/null
+++ b/yolox/utils/device_utils.py
@@ -0,0 +1,156 @@
+import os
+import random
+import warnings
+
+from typing import Union
+import torch
+
+# torch_xla is an optional dependency: when it imports, xm/xr/xb refer to its
+# xla_model, runtime and distributed xla_backend modules; when the import fails
+# all three are set to None and the helpers in this module use CUDA or CPU.
+try:
+    import torch_xla.core.xla_model as xm
+    import torch_xla.runtime as xr
+    import torch_xla.distributed.xla_backend as xb
+
+    # Compilation cache directory; override with the XLA_CACHE_DIR env variable.
+    compiler_cache_path = os.getenv("XLA_CACHE_DIR", "./cache")
+    os.makedirs(compiler_cache_path, exist_ok=True)
+    try:
+        xr.initialize_cache(compiler_cache_path, readonly=False)
+    except AttributeError as e:
+        # Presumably raised by torch_xla builds that lack initialize_cache
+        # (TODO confirm); the cache is optional, so only warn.
+        warnings.warn(f"can not set XLA cache dir: {e}")
+
+except ImportError:
+    xm = None
+    xr = None
+    xb = None
+
+def get_xla_model():
+    """Return the imported ``torch_xla.core.xla_model`` module, or ``None`` if torch_xla could not be imported."""
+    return xm
+
+
+def get_xla_runtime():
+    """Return the imported ``torch_xla.runtime`` module, or ``None`` if torch_xla could not be imported."""
+    return xr
+
+
+def get_current_device() -> torch.device:
+    """Return the device this process places tensors on, resolving it only once.
+
+    On the first call the device is chosen in this order: the XLA device when
+    ``torch_xla`` was imported; ``cuda:<LOCAL_RANK>`` (which is also made the
+    current CUDA device) when CUDA is available; otherwise the device named by
+    the ``DEFAULT_DEVICE`` environment variable (``"cpu"`` when unset). The
+    result is stored in a module-level global and returned by later calls.
+    """
+    global __current_device
+
+    # The global is only created below, so its presence means "already resolved".
+    if "__current_device" in globals():
+        return __current_device
+
+    if xm is not None:
+        __current_device = xm.xla_device()
+        return __current_device
+
+    if torch.cuda.is_available():
+        local_rank = int(os.getenv("LOCAL_RANK", 0))
+        __current_device = torch.device(f'cuda:{local_rank}')
+        torch.cuda.set_device(__current_device)
+        return __current_device
+
+    __current_device = torch.device(os.getenv("DEFAULT_DEVICE", "cpu"))
+    return __current_device
+
+
+def get_current_device_type() -> str:
+    """Return the device family name for this process, resolving it only once.
+
+    The first call yields ``"xla"`` when ``torch_xla`` was imported, ``"cuda"``
+    when CUDA is available, and otherwise the ``DEFAULT_DEVICE_TYPE``
+    environment variable (``"cpu"`` when unset). The answer is kept in a
+    module-level global and reused by later calls.
+    """
+    global __current_device_type
+
+    # Resolve lazily: the global does not exist until the first call creates it.
+    if "__current_device_type" not in globals():
+        if xm is not None:
+            resolved = "xla"
+        elif torch.cuda.is_available():
+            resolved = "cuda"
+        else:
+            resolved = os.getenv("DEFAULT_DEVICE_TYPE", "cpu")
+        __current_device_type = resolved
+
+    return __current_device_type
+
+
+def get_local_device_count() -> int:
+    """Return the number of accelerator devices reported for this process.
+
+    Uses ``xr.global_device_count()`` when the XLA runtime was imported,
+    ``torch.cuda.device_count()`` when CUDA is available, and ``1`` otherwise.
+    """
+    # NOTE(review): the XLA branch asks for the *global* device count although the
+    # function name says "local"; confirm this is intended for multi-node runs.
+    if xr is not None:
+        return xr.global_device_count()
+    if torch.cuda.is_available():
+        return torch.cuda.device_count()
+    return 1
+
+
+def get_distributed_backend(backend=None) -> str:
+    """Choose the ``torch.distributed`` backend name for this process.
+
+    Args:
+        backend: Preferred backend. It is ignored when ``torch_xla`` was
+            imported, in which case ``"xla"`` is always returned.
+
+    Returns:
+        ``"xla"`` under torch_xla; otherwise ``backend`` when given; otherwise
+        ``"nccl"`` if CUDA is available and ``"gloo"`` if it is not.
+    """
+    if xm is not None:
+        return "xla"
+    if backend is not None:
+        return backend
+    return "nccl" if torch.cuda.is_available() else "gloo"
+
+
+def get_distributed_init_method() -> str:
+    """Return the ``init_method`` URL: ``'xla://'`` when torch_xla was imported, else ``"env://"``."""
+    return 'xla://' if xm is not None else "env://"
+
+
+def get_current_rng_state() -> Union[torch.Tensor, int]:
+    """Return the RNG state of the backend that ``get_current_device`` selects.
+
+    Backends are probed in the same order as ``get_current_device`` (XLA, then
+    CUDA, then CPU) so the state always belongs to the device actually in use,
+    even when torch_xla and CUDA are both present.
+
+    Returns:
+        The value of ``xm.get_rng_state`` on XLA (``set_current_rng_state``
+        feeds it back through ``int()``); otherwise the tensor returned by
+        ``torch.cuda.get_rng_state`` or ``torch.get_rng_state``.
+    """
+    if xm is not None:
+        return xm.get_rng_state(device=get_current_device())
+    if torch.cuda.is_available():
+        return torch.cuda.get_rng_state(device=get_current_device())
+    return torch.get_rng_state()
+
+
+def set_manual_seed(seed: int):
+    """Seed Python's ``random`` module and the torch generators with ``seed``.
+
+    Args:
+        seed: Seed value applied to every generator handled here.
+    """
+    random.seed(seed)
+    # torch.manual_seed seeds the CPU generator as well as the CUDA generators;
+    # seeding only the current CUDA device would leave CPU-side sampling unseeded.
+    torch.manual_seed(seed)
+    if xm is not None:
+        # The XLA device keeps its own seed, set through the torch_xla API.
+        xm.set_rng_state(seed, device=get_current_device())
+
+
+def set_current_rng_state(new_state):
+    """Restore an RNG state previously obtained from ``get_current_rng_state``.
+
+    Args:
+        new_state: On XLA, a value convertible with ``int()``; on CUDA/CPU, a
+            tensor that is converted to ``torch.ByteTensor`` before use.
+    """
+    # Probe backends in the same order as get_current_rng_state so that a saved
+    # state is always written back to the generator it was read from.
+    if xm is not None:
+        xm.set_rng_state(int(new_state), device=get_current_device())
+        return
+
+    byte_state = new_state.type(torch.ByteTensor)
+    if torch.cuda.is_available():
+        torch.cuda.set_rng_state(byte_state, device=get_current_device())
+    else:
+        torch.set_rng_state(byte_state)
+
+# When the torch_xla distributed backend module was imported, install
+# channel-id makers on ProcessGroupXla by assigning the two functions below.
+# NOTE(review): the send id is derived from dst_rank (x2), the receive id from
+# src_rank (x3), and `tag` is ignored; confirm this pairing is what the target
+# backend expects for point-to-point send/recv.
+if xb:
+    def make_send_channel_id_impl(self, dst_rank, tag):
+        return int(dst_rank)*2
+
+    def make_recv_channel_id_impl(self, src_rank, tag):
+        return int(src_rank)*3
+
+    xb.ProcessGroupXla.make_send_channel_id = make_send_channel_id_impl
+    xb.ProcessGroupXla.make_recv_channel_id = make_recv_channel_id_impl
+
+def parse_dtype(dtype: str):
+    """Translate a legacy tensor type string into a ``(device, dtype)`` pair.
+
+    Args:
+        dtype: A string such as ``"torch.FloatTensor"`` or
+            ``"torch.cuda.HalfTensor"``: a namespace (``torch``, ``torch.cuda``
+            or ``torch.xla``) followed by a tensor class name.
+
+    Returns:
+        ``(device, dtype)``, where ``device`` is CPU for the plain ``torch``
+        namespace and ``get_current_device()`` otherwise, and ``dtype`` is the
+        matching ``torch.dtype``.
+
+    Raises:
+        ValueError: If the namespace or tensor class name is not supported.
+            Explicit checks are used instead of ``assert`` so validation still
+            runs under ``python -O``.
+    """
+    tensor_dtypes = {
+        "FloatTensor": torch.float32,
+        "DoubleTensor": torch.float64,
+        "HalfTensor": torch.float16,
+        "BFloat16Tensor": torch.bfloat16,
+    }
+
+    # rpartition never fails to unpack; a string without "." yields an empty
+    # namespace, which the check below rejects.
+    namespace, _, tensor_name = dtype.rpartition(".")
+
+    if namespace not in ("torch", "torch.cuda", "torch.xla"):
+        raise ValueError(f"unsupported tensor namespace in type string: {dtype!r}")
+    if tensor_name not in tensor_dtypes:
+        raise ValueError(f"unsupported tensor class in type string: {dtype!r}")
+
+    device = torch.device("cpu") if namespace == "torch" else get_current_device()
+
+    return device, tensor_dtypes[tensor_name]
diff --git a/yolox/utils/dist.py b/yolox/utils/dist.py
index 9e8fea933..4160ec5a0 100644
--- a/yolox/utils/dist.py
+++ b/yolox/utils/dist.py
@@ -20,9 +20,9 @@
import torch
from torch import distributed as dist
+from yolox.utils.device_utils import get_current_device, get_distributed_backend, get_distributed_init_method, get_local_device_count, get_xla_model, xm
__all__ = [
- "get_num_devices",
"wait_for_the_master",
"is_main_process",
"synchronize",
@@ -35,18 +35,8 @@
"all_gather",
]
-_LOCAL_PROCESS_GROUP = None
-
-
-def get_num_devices():
- gpu_list = os.getenv('CUDA_VISIBLE_DEVICES', None)
- if gpu_list is not None:
- return len(gpu_list.split(','))
- else:
- devices_list_info = os.popen("nvidia-smi -L")
- devices_list_info = devices_list_info.read().strip().split("\n")
- return len(devices_list_info)
-
+__DEFAULT_GLOO_GROUP = None
+xm = get_xla_model()
@contextmanager
def wait_for_the_master(local_rank: int = None):
@@ -61,7 +51,7 @@ def wait_for_the_master(local_rank: int = None):
local_rank = get_local_rank()
if local_rank > 0:
- dist.barrier()
+ barrier()
yield
if local_rank == 0:
if not dist.is_available():
@@ -69,10 +59,10 @@ def wait_for_the_master(local_rank: int = None):
if not dist.is_initialized():
return
else:
- dist.barrier()
+ barrier()
-def synchronize():
+def synchronize(group=None):
"""
Helper function to synchronize (barrier) among all processes when using distributed training
"""
@@ -80,10 +70,11 @@ def synchronize():
return
if not dist.is_initialized():
return
- world_size = dist.get_world_size()
+ world_size = dist.get_world_size(group=group)
if world_size == 1:
return
- dist.barrier()
+
+ barrier(group=group)
def get_world_size() -> int:
@@ -105,16 +96,9 @@ def get_rank() -> int:
def get_local_rank() -> int:
"""
Returns:
- The rank of the current process within the local (per-machine) process group.
+ The rank of the current process within the local machine
"""
- if _LOCAL_PROCESS_GROUP is None:
- return get_rank()
-
- if not dist.is_available():
- return 0
- if not dist.is_initialized():
- return 0
- return dist.get_rank(group=_LOCAL_PROCESS_GROUP)
+ return int(os.getenv("LOCAL_RANK", 0))
def get_local_size() -> int:
@@ -122,33 +106,24 @@ def get_local_size() -> int:
Returns:
The size of the per-machine process group, i.e. the number of processes per machine.
"""
- if not dist.is_available():
- return 1
- if not dist.is_initialized():
- return 1
- return dist.get_world_size(group=_LOCAL_PROCESS_GROUP)
+ return get_local_device_count()
def is_main_process() -> bool:
return get_rank() == 0
-
-@functools.lru_cache()
def _get_global_gloo_group():
"""
Return a process group based on gloo backend, containing all the ranks
The result is cached.
"""
- if dist.get_backend() == "nccl":
- return dist.new_group(backend="gloo")
- else:
- return dist.group.WORLD
-
-
+ global __DEFAULT_GLOO_GROUP
+ assert __DEFAULT_GLOO_GROUP is not None, "Gloo group is not initialized"
+ return __DEFAULT_GLOO_GROUP
+
def _serialize_to_tensor(data, group):
backend = dist.get_backend(group)
- assert backend in ["gloo", "nccl"]
- device = torch.device("cpu" if backend == "gloo" else "cuda")
+ device = torch.device("cpu") if backend == "gloo" else get_current_device()
buffer = pickle.dumps(data)
if len(buffer) > 1024 ** 3:
@@ -211,8 +186,9 @@ def all_gather(data, group=None):
return [data]
tensor = _serialize_to_tensor(data, group)
-
+ synchronize(group=group)
size_list, tensor = _pad_to_largest_tensor(tensor, group)
+ synchronize(group=group)
max_size = max(size_list)
# receiving Tensor from all ranks
@@ -246,14 +222,16 @@ def gather(data, dst=0, group=None):
"""
if get_world_size() == 1:
return [data]
- if group is None:
- group = _get_global_gloo_group()
if dist.get_world_size(group=group) == 1:
return [data]
+ if group is None:
+ group = _get_global_gloo_group()
rank = dist.get_rank(group=group)
tensor = _serialize_to_tensor(data, group)
+ synchronize(group=group)
size_list, tensor = _pad_to_largest_tensor(tensor, group)
+ synchronize(group=group)
# receiving Tensor from all ranks
if rank == dst:
@@ -292,3 +270,34 @@ def time_synchronized():
if torch.cuda.is_available():
torch.cuda.synchronize()
return time.time()
+
+
+def barrier(group=None):
+    """Call ``dist.barrier`` for ``group``; ``group`` is forwarded unchanged, including ``None``."""
+    dist.barrier(group=group)
+
+def init_distributed(world_size: int, rank: int):
+    """Initialise ``torch.distributed`` and the module's cached gloo group.
+
+    The default process group is created only if none is initialised yet, using
+    the backend and init method returned by ``get_distributed_backend`` and
+    ``get_distributed_init_method``. A gloo group is then created once and stored
+    in the module-level ``__DEFAULT_GLOO_GROUP``.
+
+    Args:
+        world_size: Total number of processes taking part.
+        rank: Rank of the calling process.
+    """
+    global __DEFAULT_GLOO_GROUP
+
+    if not dist.is_initialized():
+        dist.init_process_group(
+            backend=get_distributed_backend(),
+            init_method=get_distributed_init_method(),
+            world_size=world_size,
+            rank=rank,
+        )
+
+    # Create the gloo group at most once; later calls reuse the stored handle.
+    if __DEFAULT_GLOO_GROUP is None:
+        __DEFAULT_GLOO_GROUP = dist.new_group(backend="gloo")
+
+def deinit_distributed():
+    """Tear down the cached gloo group and the default process group.
+
+    Failures while destroying the gloo group are logged as warnings and do not
+    prevent the default process group from being destroyed. The cached gloo
+    handle is always cleared so that a later ``init_distributed`` creates a
+    fresh group instead of reusing a destroyed one.
+    """
+    global __DEFAULT_GLOO_GROUP
+
+    if not dist.is_initialized():
+        # Without a default process group any stored handle is unusable.
+        __DEFAULT_GLOO_GROUP = None
+        return
+
+    try:
+        if __DEFAULT_GLOO_GROUP is not None:
+            dist.destroy_process_group(group=__DEFAULT_GLOO_GROUP)
+    except Exception as e:
+        # Best-effort cleanup: report the problem and continue with teardown.
+        logger.warning(f"Error: {e}")
+    finally:
+        __DEFAULT_GLOO_GROUP = None
+        dist.destroy_process_group()
+
\ No newline at end of file
diff --git a/yolox/utils/metric.py b/yolox/utils/metric.py
index 506b58281..d925aae68 100644
--- a/yolox/utils/metric.py
+++ b/yolox/utils/metric.py
@@ -11,6 +11,8 @@
import torch
+from yolox.utils.device_utils import get_current_device
+
__all__ = [
"AverageMeter",
"MeterBuffer",
@@ -22,6 +24,8 @@
def get_total_and_free_memory_in_Mb(cuda_device):
+ assert torch.cuda.is_available()
+
devices_info_str = os.popen(
"nvidia-smi --query-gpu=memory.total,memory.used --format=csv,nounits,noheader"
)
@@ -37,10 +41,12 @@ def occupy_mem(cuda_device, mem_ratio=0.9):
"""
pre-allocate gpu memory for training to avoid memory Fragmentation.
"""
+ assert torch.cuda.is_available()
+
total, used = get_total_and_free_memory_in_Mb(cuda_device)
max_mem = int(total * mem_ratio)
block_mem = max_mem - used
- x = torch.cuda.FloatTensor(256, 1024, block_mem)
+    # Reserve an uninitialised float32 buffer of shape (256, 1024, block_mem), i.e.
+    # 1 MiB per unit of block_mem, as torch.cuda.FloatTensor(256, 1024, block_mem) did.
+    # torch.tensor([256, 1024, block_mem]) would only build a 3-element tensor.
+    x = torch.empty((256, 1024, block_mem), dtype=torch.float32, device=get_current_device())
del x
time.sleep(5)
@@ -49,6 +55,8 @@ def gpu_mem_usage():
"""
Compute the GPU memory usage for the current device (MB).
"""
+ assert torch.cuda.is_available()
+
mem_usage_bytes = torch.cuda.max_memory_allocated()
return mem_usage_bytes / (1024 * 1024)