diff --git a/.gitignore b/.gitignore index 9842565a1..f7d827755 100644 --- a/.gitignore +++ b/.gitignore @@ -226,3 +226,7 @@ events.out.tfevents* .Trashes ehthumbs.db Thumbs.db + +cache +*.out +*.txt diff --git a/Dockerfile.cuda b/Dockerfile.cuda new file mode 100644 index 000000000..0bdfc2d97 --- /dev/null +++ b/Dockerfile.cuda @@ -0,0 +1,19 @@ +FROM nvcr.io/nvidia/pytorch:24.09-py3 + +ENV DEBIAN_FRONTEND=noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN=true + +# Keeps Python from generating .pyc files in the container +ENV PYTHONDONTWRITEBYTECODE=1 + +# Turns off buffering for easier container logging +ENV PYTHONUNBUFFERED=1 + +COPY . /yolox-x +RUN pip3 install --upgrade pip +RUN pip3 install -v -e /yolox-x +RUN pip3 install opencv-python==4.8.0.74 +WORKDIR /app + +# During debugging, this entry point will be overridden. For more information, please refer to https://aka.ms/vscode-docker-python-debug +CMD ["python", "tools/train.py"] \ No newline at end of file diff --git a/Dockerfile.neuron b/Dockerfile.neuron new file mode 100644 index 000000000..eeede27fd --- /dev/null +++ b/Dockerfile.neuron @@ -0,0 +1,21 @@ +FROM public.ecr.aws/neuron/pytorch-training-neuronx:2.5.1-neuronx-py310-sdk2.22.0-ubuntu22.04 + +ENV DEBIAN_FRONTEND=noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN=true + +# Keeps Python from generating .pyc files in the container +ENV PYTHONDONTWRITEBYTECODE=1 + +# Turns off buffering for easier container logging +ENV PYTHONUNBUFFERED=1 + +ENV PJRT_DEVICE=NEURON + +RUN apt-get update && apt-get -y install python3-opencv +COPY . /yolox-x +RUN pip3 install -v -e /yolox-x +RUN pip3 install protobuf==3.20.3 +WORKDIR /app + +# During debugging, this entry point will be overridden. 
For more information, please refer to https://aka.ms/vscode-docker-python-debug +CMD ["python", "tools/train.py"] \ No newline at end of file diff --git a/Dockerfile.xla b/Dockerfile.xla new file mode 100644 index 000000000..4599d53da --- /dev/null +++ b/Dockerfile.xla @@ -0,0 +1,20 @@ +FROM us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.5.0_3.10_cuda_12.1 + +ENV DEBIAN_FRONTEND=noninteractive +ENV DEBCONF_NONINTERACTIVE_SEEN=true + +# Keeps Python from generating .pyc files in the container +ENV PYTHONDONTWRITEBYTECODE=1 + +# Turns off buffering for easier container logging +ENV PYTHONUNBUFFERED=1 + +ENV PJRT_DEVICE=CUDA + +RUN apt-get update && apt-get -y install python3-opencv +COPY . /yolox-x +RUN pip3 install -v -e /yolox-x +WORKDIR /app + +# During debugging, this entry point will be overridden. For more information, please refer to https://aka.ms/vscode-docker-python-debug +CMD ["python", "tools/train.py"] \ No newline at end of file diff --git a/README.md b/README.md index f83aa3c00..a4c1d4c87 100644 --- a/README.md +++ b/README.md @@ -66,11 +66,13 @@ This repo is an implementation of PyTorch version YOLOX, there is also a [MegEng
Installation +Install `torch` version 2.4.0 and `torchvision` with Python 3.10 in a `conda` environment or a virtualenv, then activate that environment. + Step1. Install YOLOX from source. ```shell -git clone git@github.com:Megvii-BaseDetection/YOLOX.git -cd YOLOX -pip3 install -v -e . # or python3 setup.py develop +git clone https://github.com/ajayvohra2005/YOLOX-x.git +cd YOLOX-x +pip3 install -v -e . ```
@@ -83,79 +85,34 @@ Step1. Download a pretrained model from the benchmark table. Step2. Use either -n or -f to specify your detector's config. For example: ```shell -python tools/demo.py image -n yolox-s -c /path/to/your/yolox_s.pth --path assets/dog.jpg --conf 0.25 --nms 0.45 --tsize 640 --save_result --device [cpu/gpu] +python tools/demo.py image -n yolox-s -c /path/to/your/yolox_s.pth --path assets/dog.jpg --conf 0.25 --nms 0.45 --tsize 640 --save_result ``` or ```shell -python tools/demo.py image -f exps/default/yolox_s.py -c /path/to/your/yolox_s.pth --path assets/dog.jpg --conf 0.25 --nms 0.45 --tsize 640 --save_result --device [cpu/gpu] +python tools/demo.py image -f exps/default/yolox_s.py -c /path/to/your/yolox_s.pth --path assets/dog.jpg --conf 0.25 --nms 0.45 --tsize 640 --save_result ``` Demo for video: ```shell -python tools/demo.py video -n yolox-s -c /path/to/your/yolox_s.pth --path /path/to/your/video --conf 0.25 --nms 0.45 --tsize 640 --save_result --device [cpu/gpu] +python tools/demo.py video -n yolox-s -c /path/to/your/yolox_s.pth --path /path/to/your/video --conf 0.25 --nms 0.45 --tsize 640 --save_result ```
-Reproduce our results on COCO - -Step1. Prepare COCO dataset -```shell -cd -ln -s /path/to/your/COCO ./datasets/COCO -``` - -Step2. Reproduce our results on COCO by specifying -n: - -```shell -python -m yolox.tools.train -n yolox-s -d 8 -b 64 --fp16 -o [--cache] - yolox-m - yolox-l - yolox-x -``` -* -d: number of gpu devices -* -b: total batch size, the recommended number for -b is num-gpu * 8 -* --fp16: mixed precision training -* --cache: caching imgs into RAM to accelarate training, which need large system RAM. +Train on COCO + cd YOLOX_HOME +Update the `run-cuda.sh` script to set `YOLOX_DATADIR` to your datasets directory, which must contain a `COCO` folder with the COCO2017 dataset. Update the model name (default `yolox-s`) as needed, then run: -When using -f, the above commands are equivalent to: -```shell -python -m yolox.tools.train -f exps/default/yolox_s.py -d 8 -b 64 --fp16 -o [--cache] - exps/default/yolox_m.py - exps/default/yolox_l.py - exps/default/yolox_x.py -``` - -**Multi Machine Training** + ./run-cuda.sh -We also support multi-nodes training. Just add the following args: -* --num\_machines: num of your total training nodes -* --machine\_rank: specify the rank of each node - -Suppose you want to train YOLOX on 2 machines, and your master machines's IP is 123.123.123.123, use port 12312 and TCP. - -On master machine, run -```shell -python tools/train.py -n yolox-s -b 128 --dist-url tcp://123.123.123.123:12312 --num_machines 2 --machine_rank 0 -``` -On the second machine, run -```shell -python tools/train.py -n yolox-s -b 128 --dist-url tcp://123.123.123.123:12312 --num_machines 2 --machine_rank 1 -``` **Logging to Weights & Biases** To log metrics, predictions and model checkpoints to [W&B](https://docs.wandb.ai/guides/integrations/other/yolox) use the command line argument `--logger wandb` and use the prefix "wandb-" to specify arguments for initializing the wandb run. 
-```shell -python tools/train.py -n yolox-s -d 8 -b 64 --fp16 -o [--cache] --logger wandb wandb-project - yolox-m - yolox-l - yolox-x -``` An example wandb dashboard is available [here](https://wandb.ai/manan-goel/yolox-nano/runs/3pzfeom0) @@ -175,7 +132,7 @@ python -m yolox.tools.train --help We support batch testing for fast evaluation: ```shell -python -m yolox.tools.eval -n yolox-s -c yolox_s.pth -b 64 -d 8 --conf 0.001 [--fp16] [--fuse] +torchrun --standalone --nproc_per_node=8 -m yolox.tools.eval -n yolox-s -c yolox_s.pth -b 64 --conf 0.001 [--fp16] [--fuse] yolox-m yolox-l yolox-x @@ -186,7 +143,7 @@ python -m yolox.tools.eval -n yolox-s -c yolox_s.pth -b 64 -d 8 --conf 0.001 [- To reproduce speed test, we use the following command: ```shell -python -m yolox.tools.eval -n yolox-s -c yolox_s.pth -b 1 -d 1 --conf 0.001 --fp16 --fuse +python -m yolox.tools.eval -n yolox-s -c yolox_s.pth -b 1 --conf 0.001 --fp16 --fuse yolox-m yolox-l yolox-x diff --git a/demo/MegEngine/python/models/darknet.py b/demo/MegEngine/python/models/darknet.py index 47469aa68..a896e8610 100644 --- a/demo/MegEngine/python/models/darknet.py +++ b/demo/MegEngine/python/models/darknet.py @@ -3,9 +3,11 @@ # Copyright (c) Megvii Inc. All rights reserved. import megengine.module as M +from yolox.utils.device_utils import get_xla_model from .network_blocks import BaseConv, CSPLayer, DWConv, Focus, ResLayer, SPPBottleneck +xm = get_xla_model() class Darknet(M.Module): # number of blocks from dark2 to dark5. 
@@ -70,6 +72,10 @@ def make_spp_block(self, filters_list, in_filters): return m def forward(self, x): + + if xm: + xm.mark_step() + outputs = {} x = self.stem(x) outputs["stem"] = x @@ -81,6 +87,10 @@ def forward(self, x): outputs["dark4"] = x x = self.dark5(x) outputs["dark5"] = x + + if xm: + xm.mark_step() + return {k: v for k, v in outputs.items() if k in self.out_features} @@ -140,6 +150,10 @@ def __init__( ) def forward(self, x): + + if xm: + xm.mark_step() + outputs = {} x = self.stem(x) outputs["stem"] = x @@ -151,4 +165,8 @@ def forward(self, x): outputs["dark4"] = x x = self.dark5(x) outputs["dark5"] = x + + if xm: + xm.mark_step() + return {k: v for k, v in outputs.items() if k in self.out_features} diff --git a/demo/MegEngine/python/models/yolo_fpn.py b/demo/MegEngine/python/models/yolo_fpn.py index 675a7f6e6..af6c2727c 100644 --- a/demo/MegEngine/python/models/yolo_fpn.py +++ b/demo/MegEngine/python/models/yolo_fpn.py @@ -4,11 +4,14 @@ import megengine.functional as F import megengine.module as M +from yolox.utils.device_utils import get_xla_model from .darknet import Darknet from .network_blocks import BaseConv, UpSample +xm = get_xla_model() + class YOLOFPN(M.Module): """ YOLOFPN module. Darknet 53 is the default backbone of this model. @@ -59,6 +62,9 @@ def forward(self, inputs): Tuple[Tensor]: FPN output features.. 
""" # backbone + if xm: + xm.mark_step() + out_features = self.backbone(inputs) x2, x1, x0 = [out_features[f] for f in self.in_features] @@ -75,4 +81,8 @@ def forward(self, inputs): out_dark3 = self.out2(x2_in) outputs = (out_dark3, out_dark4, x0) + + if xm: + xm.mark_step() + return outputs diff --git a/demo/MegEngine/python/models/yolo_head.py b/demo/MegEngine/python/models/yolo_head.py index 7bba674d5..9c7b37d43 100644 --- a/demo/MegEngine/python/models/yolo_head.py +++ b/demo/MegEngine/python/models/yolo_head.py @@ -4,6 +4,7 @@ import megengine.functional as F import megengine.module as M +from yolox.utils.device_utils import parse_dtype from .network_blocks import BaseConv, DWConv @@ -154,14 +155,16 @@ def forward(self, xin, labels=None, imgs=None): return outputs def get_output_and_grid(self, output, k, stride, dtype): - grid = self.grids[k] + device, dtype = parse_dtype(dtype) + grid = self.grids[k] + batch_size = output.shape[0] n_ch = 5 + self.num_classes hsize, wsize = output.shape[-2:] if grid.shape[2:4] != output.shape[2:4]: yv, xv = meshgrid([F.arange(hsize), F.arange(wsize)]) - grid = F.stack((xv, yv), 2).reshape(1, 1, hsize, wsize, 2).type(dtype) + grid = F.stack((xv, yv), 2).reshape(1, 1, hsize, wsize, 2).to(device=device, dtype=dtype) self.grids[k] = grid output = output.view(batch_size, self.n_anchors, n_ch, hsize, wsize) diff --git a/demo/MegEngine/python/models/yolo_pafpn.py b/demo/MegEngine/python/models/yolo_pafpn.py index 86154bfa9..fb6e591d0 100644 --- a/demo/MegEngine/python/models/yolo_pafpn.py +++ b/demo/MegEngine/python/models/yolo_pafpn.py @@ -4,10 +4,12 @@ import megengine.module as M import megengine.functional as F +from yolox.utils.device_utils import get_xla_model from .darknet import CSPDarknet from .network_blocks import BaseConv, CSPLayer, DWConv, UpSample +xm = get_xla_model() class YOLOPAFPN(M.Module): """ @@ -85,6 +87,10 @@ def forward(self, input): """ # backbone + + if xm: + xm.mark_step() + out_features = 
self.backbone(input) features = [out_features[f] for f in self.in_features] [x2, x1, x0] = features @@ -108,4 +114,8 @@ def forward(self, input): pan_out0 = self.C3_n4(p_out0) # 1024->1024/32 outputs = (pan_out2, pan_out1, pan_out0) + + if xm: + xm.mark_step() + return outputs diff --git a/demo/nebullvm/nebullvm_optimization.py b/demo/nebullvm/nebullvm_optimization.py index b817baf62..b9365dad2 100644 --- a/demo/nebullvm/nebullvm_optimization.py +++ b/demo/nebullvm/nebullvm_optimization.py @@ -1,18 +1,19 @@ +from yolox.utils.device_utils import get_current_device import torch import time from nebullvm.api.functions import optimize_model # Install DL compilers from yolox.exp import get_exp +device = get_current_device() + # Get YOLO model exp = get_exp(None, 'yolox-s') # select model name model = exp.get_model() -model.cuda() +model.to(device=device) model.eval() -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - # Create dummy data for the optimizer -input_data = [((torch.randn(1, 3, 640, 640).to(device), ), 0) for i in range(100)] +input_data = [((torch.randn(1, 3, 640, 640).to(device=device), ), 0) for i in range(100)] # ---------- Optimization ---------- optimized_model = optimize_model(model, input_data=input_data, optimization_time="constrained") # Optimization without performance loss @@ -22,7 +23,7 @@ # Select image to test the latency of the optimized model # Create dummy image -img = torch.randn(1, 3, 640, 640).to(device) +img = torch.randn(1, 3, 640, 640).to(device=device) # Check perfomance warmup_iters = 30 diff --git a/docker-cuda.sh b/docker-cuda.sh new file mode 100755 index 000000000..8272e2e15 --- /dev/null +++ b/docker-cuda.sh @@ -0,0 +1 @@ +docker run -t -d -v /home/ubuntu/efs/datasets:/datasets -v /home/ubuntu/efs/git/YOLOX-x:/app --shm-size=16g --net=host --gpus all docker.io/library/yolox-x-cuda:latest sleep infinity \ No newline at end of file diff --git a/docker-neuron.sh b/docker-neuron.sh new file mode 100755 index 
000000000..52aa59761 --- /dev/null +++ b/docker-neuron.sh @@ -0,0 +1,24 @@ +docker run -t -d \ + -v /home/ubuntu/efs/datasets:/datasets \ + -v /home/ubuntu/efs/git/YOLOX-x:/app \ + -v /tmp:/cache \ + --shm-size=16g \ + --net=host \ + --shm-size=16g \ + --device=/dev/neuron0 \ + --device=/dev/neuron1 \ + --device=/dev/neuron2 \ + --device=/dev/neuron3 \ + --device=/dev/neuron4 \ + --device=/dev/neuron5 \ + --device=/dev/neuron6 \ + --device=/dev/neuron7 \ + --device=/dev/neuron8 \ + --device=/dev/neuron9 \ + --device=/dev/neuron10 \ + --device=/dev/neuron11 \ + --device=/dev/neuron12 \ + --device=/dev/neuron13 \ + --device=/dev/neuron14 \ + --device=/dev/neuron15 \ + docker.io/library/yolox-x-neuron:latest sleep infinity \ No newline at end of file diff --git a/docker-xla-cuda.sh b/docker-xla-cuda.sh new file mode 100755 index 000000000..56cb32a94 --- /dev/null +++ b/docker-xla-cuda.sh @@ -0,0 +1 @@ +docker run -t -d -v /home/ubuntu/efs/datasets:/datasets -v /home/ubuntu/efs/git/YOLOX-x:/app --shm-size=16g --net=host --gpus all docker.io/library/yolox-x-xla:latest sleep infinity \ No newline at end of file diff --git a/run-cuda.sh b/run-cuda.sh new file mode 100755 index 000000000..6ebef065d --- /dev/null +++ b/run-cuda.sh @@ -0,0 +1,5 @@ +export OMP_NUM_THREADS=16 +export LOGURU_LEVEL="INFO" +export YOLOX_DATADIR=/datasets +export YOLOX_OUPUT_DIR="./YOLOX_cuda_outputs" +torchrun --standalone --nproc_per_node=8 tools/train.py -b 32 -n yolox-s diff --git a/run-neuron.sh b/run-neuron.sh new file mode 100755 index 000000000..66d3a4978 --- /dev/null +++ b/run-neuron.sh @@ -0,0 +1,12 @@ +export YOLOX_DATADIR=/datasets +export YOLOX_OUPUT_DIR="./YOLOX_neuron_outputs" +export OMP_NUM_THREADS=16 +export LOGURU_LEVEL="INFO" +export NEURON_CC_FLAGS="--cache_dir=/cache --model-type=generic" +export NEURON_RT_STOCHASTIC_ROUNDING_EN="1" +export XLA_IR_SHAPE_CACHE_SIZE="20480" +export XLA_IR_DEBUG=0 +export XLA_HLO_DEBUG=0 +export PT_XLA_DEBUG=0 +export 
PT_XLA_DEBUG_FILE=./pt_xla_debug.txt +torchrun --standalone --nproc_per_node=32 tools/train.py -b 128 -n yolox-s diff --git a/run-xla-cuda.sh b/run-xla-cuda.sh new file mode 100755 index 000000000..05a857fbd --- /dev/null +++ b/run-xla-cuda.sh @@ -0,0 +1,9 @@ +export YOLOX_DATADIR=/datasets +export YOLOX_OUPUT_DIR="./YOLOX_xla_cuda_outputs" +export OMP_NUM_THREADS=16 +export LOGURU_LEVEL=INFO +#export XLA_IR_DEBUG=1 +#export XLA_HLO_DEBUG=1 +#export PT_XLA_DEBUG=1 +#export PT_XLA_DEBUG_FILE=./pt_xla_debug.txt +torchrun --standalone --nproc_per_node=8 tools/train.py -b 32 -n yolox-s diff --git a/setup.py b/setup.py index 5fec79764..b523dc0d8 100644 --- a/setup.py +++ b/setup.py @@ -46,7 +46,7 @@ def get_long_description(): def get_ext_modules(): ext_module = [] - if sys.platform != "win32": # pre-compile ops on linux + if torch.cuda.is_available() and sys.platform != "win32": # pre-compile ops on linux assert TORCH_AVAILABLE, "torch is required for pre-compiling ops, please install it first." 
# if any other op is added, please also add it here from yolox.layers import FastCOCOEvalOp @@ -64,8 +64,8 @@ def get_cmd_class(): setuptools.setup( name="yolox", version=get_yolox_version(), - author="megvii basedet team", - url="https://github.com/Megvii-BaseDetection/YOLOX", + author="ajayvohra2005", + url="https://github.com/ajayvohra2005/YOLOX-x", package_dir=get_package_dir(), packages=setuptools.find_packages(exclude=("tests", "tools")) + list(get_package_dir().keys()), python_requires=">=3.6", @@ -82,7 +82,7 @@ def get_cmd_class(): ], project_urls={ "Documentation": "https://yolox.readthedocs.io", - "Source": "https://github.com/Megvii-BaseDetection/YOLOX", - "Tracker": "https://github.com/Megvii-BaseDetection/YOLOX/issues", + "Source": "https://github.com/ajayvohra2005/YOLOX-x", + "Tracker": "https://github.com/ajayvohra2005/YOLOX-x/issues", }, ) diff --git a/tools/demo.py b/tools/demo.py index b16598d5f..d1ea45e3f 100644 --- a/tools/demo.py +++ b/tools/demo.py @@ -9,6 +9,7 @@ import cv2 +from yolox.utils.device_utils import get_current_device import torch from yolox.data.data_augment import ValTransform @@ -46,12 +47,6 @@ def make_parser(): help="please input your experiment description file", ) parser.add_argument("-c", "--ckpt", default=None, type=str, help="ckpt for eval") - parser.add_argument( - "--device", - default="cpu", - type=str, - help="device to run our model, can either be cpu or gpu", - ) parser.add_argument("--conf", default=0.3, type=float, help="test conf") parser.add_argument("--nms", default=0.3, type=float, help="test nms threshold") parser.add_argument("--tsize", default=None, type=int, help="test img size") @@ -105,7 +100,6 @@ def __init__( cls_names=COCO_CLASSES, trt_file=None, decoder=None, - device="cpu", fp16=False, legacy=False, ): @@ -116,7 +110,6 @@ def __init__( self.confthre = exp.test_conf self.nmsthre = exp.nmsthre self.test_size = exp.test_size - self.device = device self.fp16 = fp16 self.preproc = 
ValTransform(legacy=legacy) if trt_file is not None: @@ -125,7 +118,7 @@ def __init__( model_trt = TRTModule() model_trt.load_state_dict(torch.load(trt_file)) - x = torch.ones(1, 3, exp.test_size[0], exp.test_size[1]).cuda() + x = torch.ones(1, 3, exp.test_size[0], exp.test_size[1]).to(device=get_current_device()) self.model(x) self.model = model_trt @@ -148,10 +141,10 @@ def inference(self, img): img, _ = self.preproc(img, None, self.test_size) img = torch.from_numpy(img).unsqueeze(0) img = img.float() - if self.device == "gpu": - img = img.cuda() - if self.fp16: - img = img.half() # to FP16 + + img = img.to(device=get_current_device()) + if self.fp16: + img = img.half() # to FP16 with torch.no_grad(): t0 = time.time() @@ -253,9 +246,6 @@ def main(exp, args): vis_folder = os.path.join(file_name, "vis_res") os.makedirs(vis_folder, exist_ok=True) - if args.trt: - args.device = "gpu" - logger.info("Args: {}".format(args)) if args.conf is not None: @@ -268,10 +258,10 @@ def main(exp, args): model = exp.get_model() logger.info("Model Summary: {}".format(get_model_info(model, exp.test_size))) - if args.device == "gpu": - model.cuda() - if args.fp16: - model.half() # to FP16 + + model.to(device=get_current_device()) + if args.fp16: + model.half() # to FP16 model.eval() if not args.trt: @@ -304,7 +294,7 @@ def main(exp, args): predictor = Predictor( model, exp, COCO_CLASSES, trt_file, decoder, - args.device, args.fp16, args.legacy, + args.fp16, args.legacy, ) current_time = time.localtime() if args.demo == "image": diff --git a/tools/eval.py b/tools/eval.py index 83ad76be8..e11b3864f 100644 --- a/tools/eval.py +++ b/tools/eval.py @@ -8,8 +8,9 @@ import warnings from loguru import logger +from yolox.utils.device_utils import get_current_device, get_xla_model, set_manual_seed import torch -import torch.backends.cudnn as cudnn + from torch.nn.parallel import DistributedDataParallel as DDP from yolox.core import launch @@ -23,6 +24,7 @@ setup_logger ) +xm = get_xla_model() 
def make_parser(): parser = argparse.ArgumentParser("YOLOX Eval") @@ -30,25 +32,7 @@ def make_parser(): parser.add_argument("-n", "--name", type=str, default=None, help="model name") # distributed - parser.add_argument( - "--dist-backend", default="nccl", type=str, help="distributed backend" - ) - parser.add_argument( - "--dist-url", - default=None, - type=str, - help="url used to set up distributed training", - ) parser.add_argument("-b", "--batch-size", type=int, default=64, help="batch size") - parser.add_argument( - "-d", "--devices", default=None, type=int, help="device for training" - ) - parser.add_argument( - "--num_machines", default=1, type=int, help="num of node for training" - ) - parser.add_argument( - "--machine_rank", default=0, type=int, help="node rank for multi-node training" - ) parser.add_argument( "-f", "--exp_file", @@ -113,20 +97,23 @@ def make_parser(): @logger.catch -def main(exp, args, num_gpu): +def main(exp, args): + assert (not args.trt or torch.cuda.is_available()), "--trt requires CUDA" + if args.seed is not None: - random.seed(args.seed) - torch.manual_seed(args.seed) - cudnn.deterministic = True - warnings.warn( - "You have chosen to seed testing. This will turn on the CUDNN deterministic setting, " - ) + set_manual_seed(args.seed) + if torch.cuda.is_available(): + torch.backends.cudnn.deterministic = True + warnings.warn( + "You have chosen to seed testing. 
This will turn on the CUDNN deterministic setting, " + ) - is_distributed = num_gpu > 1 + is_distributed = torch.distributed.is_initialized() - # set environment variables for distributed training - configure_nccl() - cudnn.benchmark = True + # set environment variables for distributed training for CUDA + if torch.cuda.is_available(): + configure_nccl() + torch.backends.cudnn.benchmark = True rank = get_local_rank() @@ -153,8 +140,7 @@ def main(exp, args, num_gpu): evaluator.per_class_AP = True evaluator.per_class_AR = True - torch.cuda.set_device(rank) - model.cuda(rank) + model.to(device=get_current_device()) model.eval() if not args.speed and not args.trt: @@ -163,13 +149,17 @@ def main(exp, args, num_gpu): else: ckpt_file = args.ckpt logger.info("loading checkpoint from {}".format(ckpt_file)) - loc = "cuda:{}".format(rank) + loc = get_current_device() ckpt = torch.load(ckpt_file, map_location=loc) model.load_state_dict(ckpt["model"]) logger.info("loaded checkpoint done.") if is_distributed: - model = DDP(model, device_ids=[rank]) + if xm: + xm.mark_step() + model = DDP(model, gradient_as_bucket_view=True) + else: + model = DDP(model) if args.fuse: logger.info("\tFusing model...") @@ -205,16 +195,4 @@ def main(exp, args, num_gpu): if not args.experiment_name: args.experiment_name = exp.exp_name - num_gpu = torch.cuda.device_count() if args.devices is None else args.devices - assert num_gpu <= torch.cuda.device_count() - - dist_url = "auto" if args.dist_url is None else args.dist_url - launch( - main, - num_gpu, - args.num_machines, - args.machine_rank, - backend=args.dist_backend, - dist_url=dist_url, - args=(exp, args, num_gpu), - ) + launch(main,args=(exp, args)) diff --git a/tools/train.py b/tools/train.py index aa98bba30..8dd39946f 100644 --- a/tools/train.py +++ b/tools/train.py @@ -3,16 +3,15 @@ # Copyright (c) Megvii, Inc. and its affiliates. 
import argparse -import random import warnings from loguru import logger import torch -import torch.backends.cudnn as cudnn from yolox.core import launch from yolox.exp import Exp, check_exp_value, get_exp -from yolox.utils import configure_module, configure_nccl, configure_omp, get_num_devices +from yolox.utils import configure_module, configure_nccl, configure_omp +from yolox.utils.device_utils import set_manual_seed def make_parser(): @@ -21,19 +20,7 @@ def make_parser(): parser.add_argument("-n", "--name", type=str, default=None, help="model name") # distributed - parser.add_argument( - "--dist-backend", default="nccl", type=str, help="distributed backend" - ) - parser.add_argument( - "--dist-url", - default=None, - type=str, - help="url used to set up distributed training", - ) parser.add_argument("-b", "--batch-size", type=int, default=64, help="batch size") - parser.add_argument( - "-d", "--devices", default=None, type=int, help="device for training" - ) parser.add_argument( "-f", "--exp_file", @@ -52,12 +39,6 @@ def make_parser(): type=int, help="resume training start epoch", ) - parser.add_argument( - "--num_machines", default=1, type=int, help="num of node for training" - ) - parser.add_argument( - "--machine_rank", default=0, type=int, help="node rank for multi-node training" - ) parser.add_argument( "--fp16", dest="fp16", @@ -99,20 +80,24 @@ def make_parser(): @logger.catch def main(exp: Exp, args): + + assert (not args.occupy or torch.cuda.is_available()), "--occupy requires CUDA" + if exp.seed is not None: - random.seed(exp.seed) - torch.manual_seed(exp.seed) - cudnn.deterministic = True - warnings.warn( - "You have chosen to seed training. This will turn on the CUDNN deterministic setting, " - "which can slow down your training considerably! You may see unexpected behavior " - "when restarting from checkpoints." 
- ) - - # set environment variables for distributed training - configure_nccl() - configure_omp() - cudnn.benchmark = True + set_manual_seed(exp.seed) + if torch.cuda.is_available(): + torch.backends.cudnn.deterministic = True + warnings.warn( + "You have chosen to seed training. This will turn on the CUDNN deterministic setting, " + "which can slow down your training considerably! You may see unexpected behavior " + "when restarting from checkpoints." + ) + + # set environment variables for distributed training for CUDA + if torch.cuda.is_available(): + configure_nccl() + configure_omp() + torch.backends.cudnn.benchmark = True trainer = exp.get_trainer(args) trainer.train() @@ -128,19 +113,8 @@ def main(exp: Exp, args): if not args.experiment_name: args.experiment_name = exp.exp_name - num_gpu = get_num_devices() if args.devices is None else args.devices - assert num_gpu <= get_num_devices() - if args.cache is not None: + logger.info(f"Dataset cache: {args.cache}; loading dataset before launch") exp.dataset = exp.get_dataset(cache=True, cache_type=args.cache) - dist_url = "auto" if args.dist_url is None else args.dist_url - launch( - main, - num_gpu, - args.num_machines, - args.machine_rank, - backend=args.dist_backend, - dist_url=dist_url, - args=(exp, args), - ) + launch(main,args=(exp, args)) diff --git a/tools/trt.py b/tools/trt.py index f2f6cee5c..b4b5f1cff 100644 --- a/tools/trt.py +++ b/tools/trt.py @@ -7,8 +7,10 @@ import shutil from loguru import logger -import tensorrt as trt import torch +import tensorrt as trt +from yolox.utils.device_utils import get_current_device + from torch2trt import torch2trt from yolox.exp import get_exp @@ -56,9 +58,9 @@ def main(): model.load_state_dict(ckpt["model"]) logger.info("loaded checkpoint done.") model.eval() - model.cuda() + model.to(device=get_current_device()) model.head.decode_in_inference = False - x = torch.ones(1, 3, exp.test_size[0], exp.test_size[1]).cuda() + x = torch.ones(1, 3, exp.test_size[0], 
exp.test_size[1]).to(device=get_current_device()) model_trt = torch2trt( model, [x], diff --git a/tools/visualize_assign.py b/tools/visualize_assign.py index e75a5586b..2b9bdcfc4 100644 --- a/tools/visualize_assign.py +++ b/tools/visualize_assign.py @@ -8,6 +8,7 @@ import warnings from loguru import logger +from yolox.utils.device_utils import get_current_device_type import torch import torch.backends.cudnn as cudnn @@ -35,7 +36,7 @@ def train_one_iter(self): inps, targets = self.exp.preprocess(inps, targets, self.input_size) data_end_time = time.time() - with torch.cuda.amp.autocast(enabled=self.amp_training): + with torch.autocast(get_current_device_type(), enabled=self.amp_training): path_prefix = os.path.join(self.vis_dir, f"assign_vis_{self.batch_cnt}_") self.model.visualize(inps, targets, path_prefix) diff --git a/yolox/core/launch.py b/yolox/core/launch.py index 9f8eec61e..461d3165e 100644 --- a/yolox/core/launch.py +++ b/yolox/core/launch.py @@ -5,143 +5,32 @@ # Copyright (c) Facebook, Inc. and its affiliates. # Copyright (c) Megvii, Inc. and its affiliates. -import sys -from datetime import timedelta -from loguru import logger +import os -import torch -import torch.distributed as dist -import torch.multiprocessing as mp +from yolox.utils.dist import barrier, deinit_distributed, init_distributed -import yolox.utils.dist as comm __all__ = ["launch"] -DEFAULT_TIMEOUT = timedelta(minutes=30) - - -def _find_free_port(): - """ - Find an available port of current machine / node. - """ - import socket - - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - # Binding to port 0 will cause the OS to find an available port for us - sock.bind(("", 0)) - port = sock.getsockname()[1] - sock.close() - # NOTE: there is still a chance the port could be taken by other processes. 
- return port - - def launch( main_func, - num_gpus_per_machine, - num_machines=1, - machine_rank=0, - backend="nccl", - dist_url=None, - args=(), - timeout=DEFAULT_TIMEOUT, + args=() ): """ Args: main_func: a function that will be called by `main_func(*args)` - num_machines (int): the total number of machines - machine_rank (int): the rank of this machine (one per machine) - dist_url (str): url to connect to for distributed training, including protocol - e.g. "tcp://127.0.0.1:8686". - Can be set to auto to automatically select a free port on localhost args (tuple): arguments passed to main_func """ - world_size = num_machines * num_gpus_per_machine + world_size = int(os.getenv("WORLD_SIZE", 1)) + rank = int(os.getenv("RANK", 0)) if world_size > 1: - # https://github.com/pytorch/pytorch/pull/14391 - # TODO prctl in spawned processes - - if dist_url == "auto": - assert ( - num_machines == 1 - ), "dist_url=auto cannot work with distributed training." - port = _find_free_port() - dist_url = f"tcp://127.0.0.1:{port}" - - start_method = "spawn" - cache = vars(args[1]).get("cache", False) - - # To use numpy memmap for caching image into RAM, we have to use fork method - if cache: - assert sys.platform != "win32", ( - "As Windows platform doesn't support fork method, " - "do not add --cache in your training command." - ) - start_method = "fork" + init_distributed(world_size=world_size, rank=rank) - mp.start_processes( - _distributed_worker, - nprocs=num_gpus_per_machine, - args=( - main_func, - world_size, - num_gpus_per_machine, - machine_rank, - backend, - dist_url, - args, - ), - daemon=False, - start_method=start_method, - ) - else: + barrier() main_func(*args) - - -def _distributed_worker( - local_rank, - main_func, - world_size, - num_gpus_per_machine, - machine_rank, - backend, - dist_url, - args, - timeout=DEFAULT_TIMEOUT, -): - assert ( - torch.cuda.is_available() - ), "cuda is not available. Please check your installation." 
- global_rank = machine_rank * num_gpus_per_machine + local_rank - logger.info("Rank {} initialization finished.".format(global_rank)) - try: - dist.init_process_group( - backend=backend, - init_method=dist_url, - world_size=world_size, - rank=global_rank, - timeout=timeout, - ) - except Exception: - logger.error("Process group URL: {}".format(dist_url)) - raise - - # Setup the local process group (which contains ranks within the same machine) - assert comm._LOCAL_PROCESS_GROUP is None - num_machines = world_size // num_gpus_per_machine - for i in range(num_machines): - ranks_on_i = list( - range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine) - ) - pg = dist.new_group(ranks_on_i) - if i == machine_rank: - comm._LOCAL_PROCESS_GROUP = pg - - # synchronize is needed here to prevent a possible timeout after calling init_process_group - # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172 - comm.synchronize() - - assert num_gpus_per_machine <= torch.cuda.device_count() - torch.cuda.set_device(local_rank) - - main_func(*args) + barrier() + + deinit_distributed() + else: + main_func(*args) \ No newline at end of file diff --git a/yolox/core/trainer.py b/yolox/core/trainer.py index 8f8016e57..444ec8fd4 100644 --- a/yolox/core/trainer.py +++ b/yolox/core/trainer.py @@ -6,6 +6,7 @@ import time from loguru import logger +from yolox.utils.device_utils import get_current_device, get_current_device_type, get_xla_model import torch from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter @@ -34,6 +35,8 @@ ) +xm = get_xla_model() + class Trainer: def __init__(self, exp: Exp, args): # init function only defines some basic attr, other attrs like model, optimizer are built in @@ -44,11 +47,20 @@ def __init__(self, exp: Exp, args): # training related attr self.max_epoch = exp.max_epoch self.amp_training = args.fp16 - self.scaler = torch.cuda.amp.GradScaler(enabled=args.fp16) + if hasattr(torch, 
"GradScaler"): + self.scaler = torch.GradScaler(get_current_device_type(), enabled=args.fp16) + elif xm: + from torch_xla.amp import GradScaler + self.scaler = GradScaler(enabled=args.fp16) + elif torch.cuda.is_available(): + self.scaler = torch.cuda.amp.GradScaler(enabled=args.fp16) + else: + self.scaler = torch.cpu.amp.GradScaler(enabled=args.fp16) + self.is_distributed = get_world_size() > 1 self.rank = get_rank() self.local_rank = get_local_rank() - self.device = "cuda:{}".format(self.local_rank) + self.device = get_current_device() self.use_model_ema = exp.ema self.save_history_ckpt = exp.save_history_ckpt @@ -94,25 +106,39 @@ def train_in_iter(self): self.after_iter() def train_one_iter(self): + iter_start_time = time.time() - + logger.debug(f"iter start: {time.time()}") inps, targets = self.prefetcher.next() inps = inps.to(self.data_type) targets = targets.to(self.data_type) targets.requires_grad = False inps, targets = self.exp.preprocess(inps, targets, self.input_size) data_end_time = time.time() - - with torch.cuda.amp.autocast(enabled=self.amp_training): + logger.debug(f"input ready: {data_end_time}") + + if xm: + inps = inps.to(device=self.device) + targets = targets.to(device=self.device) + logger.debug(f"input shape: {inps.shape}") + + logger.debug(f"forward: {time.time()}") + with torch.autocast(get_current_device_type(), enabled=self.amp_training): outputs = self.model(inps, targets) loss = outputs["total_loss"] - + if xm: + loss = loss.to(device=self.device) self.optimizer.zero_grad() - self.scaler.scale(loss).backward() + scaled_loss = self.scaler.scale(loss) + logger.debug(f"backward: {time.time()}") + scaled_loss.backward() self.scaler.step(self.optimizer) self.scaler.update() - + if xm: + xm.mark_step() + logger.debug(f"optimizer step: {time.time()}") + if self.use_model_ema: self.ema_model.update(self.model) @@ -127,13 +153,17 @@ def train_one_iter(self): lr=lr, **outputs, ) + if xm: + xm.mark_step() + logger.debug(f"iter end: {time.time()}") 
+ + def before_train(self): logger.info("args: {}".format(self.args)) logger.info("exp value:\n{}".format(self.exp)) # model related init - torch.cuda.set_device(self.local_rank) model = self.exp.get_model() logger.info( "Model Summary: {}".format(get_model_info(model, self.exp.test_size)) @@ -166,7 +196,11 @@ def before_train(self): occupy_mem(self.local_rank) if self.is_distributed: - model = DDP(model, device_ids=[self.local_rank], broadcast_buffers=False) + if xm: + xm.mark_step() + model = DDP(model, broadcast_buffers=False, gradient_as_bucket_view=True) + else: + model = DDP(model, broadcast_buffers=False) if self.use_model_ema: self.ema_model = ModelEMA(model, 0.9998) @@ -265,7 +299,10 @@ def after_iter(self): ["{}: {:.3f}s".format(k, v.avg) for k, v in time_meter.items()] ) - mem_str = "gpu mem: {:.0f}Mb, mem: {:.1f}Gb".format(gpu_mem_usage(), mem_usage()) + if torch.cuda.is_available(): + mem_str = "gpu mem: {:.0f}Mb, mem: {:.1f}Gb".format(gpu_mem_usage(), mem_usage()) + else: + mem_str = "mem: {:.1f}Gb".format(mem_usage()) logger.info( "{}, {}, {}, {}, lr: {:.3e}".format( @@ -299,7 +336,7 @@ def after_iter(self): self.meter.clear_meters() # random resizing - if (self.progress_in_iter + 1) % 10 == 0: + if (self.progress_in_iter + 1) % self.exp.random_size_interval == 0: self.input_size = self.exp.random_resize( self.train_loader, self.epoch, self.rank, self.is_distributed ) @@ -381,6 +418,7 @@ def evaluate_and_save_model(self): logger.info("\n" + summary) synchronize() + logger.info(f"Save checkpoints start: {time.time()}") self.save_ckpt("last_epoch", update_best_ckpt, ap=ap50_95) if self.save_history_ckpt: self.save_ckpt(f"epoch_{self.epoch + 1}", ap=ap50_95) @@ -396,6 +434,8 @@ def evaluate_and_save_model(self): self.mlflow_logger.save_checkpoints(self.args, self.exp, self.file_name, self.epoch, metadata, update_best_ckpt) + logger.info(f"Save checkpoints end: {time.time()}") + def save_ckpt(self, ckpt_name, update_best_ckpt=False, ap=None): if 
self.rank == 0: save_model = self.ema_model.ema if self.use_model_ema else self.model diff --git a/yolox/data/data_prefetcher.py b/yolox/data/data_prefetcher.py index a118cf4e4..c9b299a18 100644 --- a/yolox/data/data_prefetcher.py +++ b/yolox/data/data_prefetcher.py @@ -4,7 +4,6 @@ import torch - class DataPrefetcher: """ DataPrefetcher is inspired by code of following file: @@ -15,9 +14,11 @@ class DataPrefetcher: def __init__(self, loader): self.loader = iter(loader) - self.stream = torch.cuda.Stream() - self.input_cuda = self._input_cuda_for_image - self.record_stream = DataPrefetcher._record_stream_for_image + + if torch.cuda.is_available(): + self.stream = torch.cuda.Stream() + self.input_cuda = self._input_cuda_for_image + self.record_stream = DataPrefetcher._record_stream_for_image self.preload() def preload(self): @@ -28,18 +29,26 @@ def preload(self): self.next_target = None return - with torch.cuda.stream(self.stream): - self.input_cuda() - self.next_target = self.next_target.cuda(non_blocking=True) + if torch.cuda.is_available(): + with torch.cuda.stream(self.stream): + self.input_cuda() + self.next_target = self.next_target.cuda(non_blocking=True) + else: + self.next_input = self.next_input + self.next_target = self.next_target def next(self): - torch.cuda.current_stream().wait_stream(self.stream) - input = self.next_input - target = self.next_target - if input is not None: - self.record_stream(input) - if target is not None: - target.record_stream(torch.cuda.current_stream()) + if torch.cuda.is_available(): + torch.cuda.current_stream().wait_stream(self.stream) + input = self.next_input + target = self.next_target + if input is not None: + self.record_stream(input) + if target is not None: + target.record_stream(torch.cuda.current_stream()) + else: + input = self.next_input + target = self.next_target self.preload() return input, target diff --git a/yolox/evaluators/coco_evaluator.py b/yolox/evaluators/coco_evaluator.py index e218c7456..f76a25932 
100644 --- a/yolox/evaluators/coco_evaluator.py +++ b/yolox/evaluators/coco_evaluator.py @@ -26,7 +26,10 @@ time_synchronized, xyxy2xywh ) +from yolox.utils.device_utils import get_current_device, get_xla_model +from yolox.utils.dist import _get_global_gloo_group, get_rank +xm = get_xla_model() def per_class_AR_table(coco_eval, class_names=COCO_CLASSES, headers=["class", "AR"], colums=6): per_class_AR = {} @@ -132,14 +135,14 @@ def evaluate( summary (sr): summary info of evaluation. """ # TODO half to amp_test - tensor_type = torch.cuda.HalfTensor if half else torch.cuda.FloatTensor + tensor_type = torch.float16 if half else torch.float32 model = model.eval() if half: model = model.half() ids = [] data_list = [] output_data = defaultdict() - progress_bar = tqdm if is_main_process() else iter + progress_bar = tqdm # if is_main_process() else iter inference_time = 0 nms_time = 0 @@ -155,11 +158,12 @@ def evaluate( model(x) model = model_trt + model = model.to(device=get_current_device()) for cur_iter, (imgs, _, info_imgs, ids) in enumerate( progress_bar(self.dataloader) ): with torch.no_grad(): - imgs = imgs.type(tensor_type) + imgs = imgs.type(tensor_type).to(device=get_current_device()) # skip the last iters since batchsize might be not enough for batch inference is_time_record = cur_iter < len(self.dataloader) - 1 @@ -185,17 +189,23 @@ def evaluate( outputs, info_imgs, ids, return_outputs=True) data_list.extend(data_list_elem) output_data.update(image_wise_data) - - statistics = torch.cuda.FloatTensor([inference_time, nms_time, n_samples]) + + statistics = torch.tensor([inference_time, nms_time, n_samples], + dtype=torch.float32, + device=get_current_device()) if distributed: # different process/device might have different speed, # to make sure the process will not be stucked, sync func is used here. 
- synchronize() - data_list = gather(data_list, dst=0) - output_data = gather(output_data, dst=0) + group = _get_global_gloo_group() + synchronize(group=group) + data_list = gather(data_list, dst=0, group=group) + output_data = gather(output_data, dst=0, group=group) data_list = list(itertools.chain(*data_list)) output_data = dict(ChainMap(*output_data)) - torch.distributed.reduce(statistics, dst=0) + if xm: + torch.distributed.all_reduce(statistics) + else: + torch.distributed.reduce(statistics, dst=0) eval_results = self.evaluate_prediction(data_list, statistics) synchronize() @@ -256,7 +266,10 @@ def evaluate_prediction(self, data_dict, statistics): if not is_main_process(): return 0, 0, None - logger.info("Evaluate in main process...") + if xm: + xm.mark_step() + + logger.info(f"Evaluate in main process: data_dict length: {len(data_dict)}, statistics: {statistics}") annType = ["segm", "bbox", "keypoints"] @@ -278,7 +291,7 @@ def evaluate_prediction(self, data_dict, statistics): ) info = time_info + "\n" - + logger.info(f"time_info: {info} {time.time()}") # Evaluate the Dt (detection) json comparing with the ground truth if len(data_dict) > 0: cocoGt = self.dataloader.dataset.coco @@ -289,6 +302,7 @@ def evaluate_prediction(self, data_dict, statistics): else: _, tmp = tempfile.mkstemp() json.dump(data_dict, open(tmp, "w")) + logger.info(f"load eval data: {tmp} {time.time()}") cocoDt = cocoGt.loadRes(tmp) try: from yolox.layers import COCOeval_opt as COCOeval @@ -298,10 +312,13 @@ def evaluate_prediction(self, data_dict, statistics): logger.warning("Use standard COCOeval.") cocoEval = COCOeval(cocoGt, cocoDt, annType[1]) + logger.info(f"evaluate: {time.time()}") cocoEval.evaluate() + logger.info(f"accumulate: {time.time()}") cocoEval.accumulate() redirect_string = io.StringIO() with contextlib.redirect_stdout(redirect_string): + logger.info(f"summarize: {time.time()}") cocoEval.summarize() info += redirect_string.getvalue() cat_ids = list(cocoGt.cats.keys()) @@ 
-312,6 +329,7 @@ def evaluate_prediction(self, data_dict, statistics): if self.per_class_AR: AR_table = per_class_AR_table(cocoEval, class_names=cat_names) info += "per class AR:\n" + AR_table + "\n" + logger.info(f"info completed: {time.time()}") return cocoEval.stats[0], cocoEval.stats[1], info else: return 0, 0, info diff --git a/yolox/evaluators/voc_evaluator.py b/yolox/evaluators/voc_evaluator.py index 094df3d69..619f35c5b 100644 --- a/yolox/evaluators/voc_evaluator.py +++ b/yolox/evaluators/voc_evaluator.py @@ -14,7 +14,9 @@ import torch from yolox.utils import gather, is_main_process, postprocess, synchronize, time_synchronized +from yolox.utils.device_utils import get_current_device, get_xla_model +xm = get_xla_model() class VOCEvaluator: """ @@ -57,7 +59,7 @@ def evaluate( summary (sr): summary info of evaluation. """ # TODO half to amp_test - tensor_type = torch.cuda.HalfTensor if half else torch.cuda.FloatTensor + tensor_type = torch.float16 if half else torch.float32 model = model.eval() if half: model = model.half() @@ -81,7 +83,7 @@ def evaluate( for cur_iter, (imgs, _, info_imgs, ids) in enumerate(progress_bar(self.dataloader)): with torch.no_grad(): - imgs = imgs.type(tensor_type) + imgs = imgs.type(tensor_type).to(device=get_current_device()) # skip the last iters since batchsize might be not enough for batch inference is_time_record = cur_iter < len(self.dataloader) - 1 @@ -105,11 +107,16 @@ def evaluate( data_list.update(self.convert_to_voc_format(outputs, info_imgs, ids)) - statistics = torch.cuda.FloatTensor([inference_time, nms_time, n_samples]) + statistics = torch.tensor([inference_time, nms_time, n_samples], + dtype=torch.float32, + device=get_current_device()) if distributed: data_list = gather(data_list, dst=0) data_list = ChainMap(*data_list) - torch.distributed.reduce(statistics, dst=0) + if xm: + torch.distributed.all_reduce(statistics) + else: + torch.distributed.reduce(statistics, dst=0) eval_results = 
self.evaluate_prediction(data_list, statistics) synchronize() diff --git a/yolox/exp/base_exp.py b/yolox/exp/base_exp.py index 7ccfec5c2..85157750d 100644 --- a/yolox/exp/base_exp.py +++ b/yolox/exp/base_exp.py @@ -2,6 +2,7 @@ # Copyright (c) Megvii Inc. All rights reserved. import ast +import os import pprint from abc import ABCMeta, abstractmethod from typing import Dict, List, Tuple @@ -17,8 +18,8 @@ class BaseExp(metaclass=ABCMeta): """Basic class for any experiment.""" def __init__(self): - self.seed = None - self.output_dir = "./YOLOX_outputs" + self.seed = 2024 + self.output_dir = os.getenv("YOLOX_OUTPUT_DIR", "./YOLOX_outputs") self.print_interval = 100 self.eval_interval = 10 self.dataset = None diff --git a/yolox/exp/yolox_base.py b/yolox/exp/yolox_base.py index 82e93c21b..e9707979f 100644 --- a/yolox/exp/yolox_base.py +++ b/yolox/exp/yolox_base.py @@ -4,14 +4,18 @@ import os import random +from yolox.utils.device_utils import get_current_device, get_xla_model import torch import torch.distributed as dist import torch.nn as nn +from yolox.utils.dist import barrier, synchronize + from .base_exp import BaseExp __all__ = ["Exp", "check_exp_value"] +xm = get_xla_model() class Exp(BaseExp): def __init__(self): @@ -108,6 +112,8 @@ def __init__(self): # nms threshold self.nmsthre = 0.65 + self.random_size_interval = 10 + def get_model(self): from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead @@ -222,7 +228,9 @@ def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img: s return train_loader def random_resize(self, data_loader, epoch, rank, is_distributed): - tensor = torch.LongTensor(2).cuda() + + device = get_current_device() + tensor = torch.LongTensor(2).to(device=device) if rank == 0: size_factor = self.input_size[1] * 1.0 / self.input_size[0] @@ -236,7 +244,7 @@ def random_resize(self, data_loader, epoch, rank, is_distributed): tensor[1] = size[1] if is_distributed: - dist.barrier() + barrier() dist.broadcast(tensor, 0) input_size = 
(tensor[0].item(), tensor[1].item()) diff --git a/yolox/layers/__init__.py b/yolox/layers/__init__.py index fc9cf5138..6c51e080d 100644 --- a/yolox/layers/__init__.py +++ b/yolox/layers/__init__.py @@ -5,7 +5,10 @@ # import torch first to make jit op work without `ImportError of libc10.so` import torch # noqa -from .jit_ops import FastCOCOEvalOp, JitOp +try: + from .jit_ops import FastCOCOEvalOp, JitOp +except ImportError: + pass try: from .fast_coco_eval_api import COCOeval_opt diff --git a/yolox/layers/fast_coco_eval_api.py b/yolox/layers/fast_coco_eval_api.py index 5f3aeb551..a6b9f6b77 100644 --- a/yolox/layers/fast_coco_eval_api.py +++ b/yolox/layers/fast_coco_eval_api.py @@ -11,6 +11,12 @@ import numpy as np from pycocotools.cocoeval import COCOeval +try: + import torch + assert torch.cuda.is_available() +except AssertionError as e: + raise ImportError() + from .jit_ops import FastCOCOEvalOp diff --git a/yolox/layers/jit_ops.py b/yolox/layers/jit_ops.py index 0fdac4de2..0165640fc 100644 --- a/yolox/layers/jit_ops.py +++ b/yolox/layers/jit_ops.py @@ -10,6 +10,11 @@ __all__ = ["JitOp", "FastCOCOEvalOp"] +try: + import torch + assert torch.cuda.is_available() +except AssertionError as e: + raise ImportError() class JitOp: """ diff --git a/yolox/models/build.py b/yolox/models/build.py index 8edc87de9..5344b4bb0 100644 --- a/yolox/models/build.py +++ b/yolox/models/build.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # -*- coding:utf-8 -*- +from yolox.utils.device_utils import get_current_device import torch from torch import nn from torch.hub import load_state_dict_from_url @@ -50,8 +51,7 @@ def create_yolox_model(name: str, pretrained: bool = True, num_classes: int = 80 from yolox.exp import get_exp, Exp if device is None: - device = "cuda:0" if torch.cuda.is_available() else "cpu" - device = torch.device(device) + device = get_current_device() assert name in _CKPT_FULL_PATH or name == "yolox_custom", \ f"user should use one of value in {_CKPT_FULL_PATH.keys()} or 
\"yolox_custom\"" @@ -75,7 +75,7 @@ def create_yolox_model(name: str, pretrained: bool = True, num_classes: int = 80 ckpt = ckpt["model"] yolox_model.load_state_dict(ckpt) - yolox_model.to(device) + yolox_model.to(device=device) return yolox_model diff --git a/yolox/models/darknet.py b/yolox/models/darknet.py index b3e053f16..0c1bfc634 100644 --- a/yolox/models/darknet.py +++ b/yolox/models/darknet.py @@ -4,8 +4,11 @@ from torch import nn +from yolox.utils.device_utils import get_xla_model + from .network_blocks import BaseConv, CSPLayer, DWConv, Focus, ResLayer, SPPBottleneck +xm = get_xla_model() class Darknet(nn.Module): # number of blocks from dark2 to dark5. @@ -80,6 +83,10 @@ def make_spp_block(self, filters_list, in_filters): return m def forward(self, x): + + if xm: + xm.mark_step() + outputs = {} x = self.stem(x) outputs["stem"] = x @@ -91,6 +98,10 @@ def forward(self, x): outputs["dark4"] = x x = self.dark5(x) outputs["dark5"] = x + + if xm: + xm.mark_step() + return {k: v for k, v in outputs.items() if k in self.out_features} @@ -165,6 +176,9 @@ def __init__( ) def forward(self, x): + if xm: + xm.mark_step() + outputs = {} x = self.stem(x) outputs["stem"] = x @@ -176,4 +190,8 @@ def forward(self, x): outputs["dark4"] = x x = self.dark5(x) outputs["dark5"] = x + + if xm: + xm.mark_step() + return {k: v for k, v in outputs.items() if k in self.out_features} diff --git a/yolox/models/losses.py b/yolox/models/losses.py index 77b4d8ef7..7c414c76a 100644 --- a/yolox/models/losses.py +++ b/yolox/models/losses.py @@ -2,9 +2,14 @@ # -*- encoding: utf-8 -*- # Copyright (c) Megvii Inc. All rights reserved. 
+import time +from loguru import logger import torch import torch.nn as nn +from yolox.utils.device_utils import get_xla_model, parse_dtype + +xm = get_xla_model() class IOUloss(nn.Module): def __init__(self, reduction="none", loss_type="iou"): @@ -12,8 +17,8 @@ def __init__(self, reduction="none", loss_type="iou"): self.reduction = reduction self.loss_type = loss_type - def forward(self, pred, target): - assert pred.shape[0] == target.shape[0] + def forward(self, pred, target): + assert pred.shape[0] == target.shape[0], f"pred shape: {pred.shape} target shape: {target.shape}" pred = pred.view(-1, 4) target = target.view(-1, 4) @@ -27,7 +32,8 @@ def forward(self, pred, target): area_p = torch.prod(pred[:, 2:], 1) area_g = torch.prod(target[:, 2:], 1) - en = (tl < br).type(tl.type()).prod(dim=1) + device, dtype = parse_dtype(tl.type()) + en = (tl < br).to(device=device, dtype=dtype).prod(dim=1) area_i = torch.prod(br - tl, 1) * en area_u = area_p + area_g - area_i iou = (area_i) / (area_u + 1e-16) diff --git a/yolox/models/yolo_fpn.py b/yolox/models/yolo_fpn.py index 224271f59..88f06ec84 100644 --- a/yolox/models/yolo_fpn.py +++ b/yolox/models/yolo_fpn.py @@ -5,10 +5,14 @@ import torch import torch.nn as nn +from yolox.utils.device_utils import get_xla_model + from .darknet import Darknet from .network_blocks import BaseConv +xm = get_xla_model() + class YOLOFPN(nn.Module): """ YOLOFPN module. Darknet 53 is the default backbone of this model. @@ -65,6 +69,9 @@ def forward(self, inputs): Tuple[Tensor]: FPN output features.. 
""" # backbone + if xm: + xm.mark_step() + out_features = self.backbone(inputs) x2, x1, x0 = [out_features[f] for f in self.in_features] @@ -81,4 +88,8 @@ def forward(self, inputs): out_dark3 = self.out2(x2_in) outputs = (out_dark3, out_dark4, x0) + + if xm: + xm.mark_step() + return outputs diff --git a/yolox/models/yolo_head.py b/yolox/models/yolo_head.py index 3e51768ee..d218266ab 100644 --- a/yolox/models/yolo_head.py +++ b/yolox/models/yolo_head.py @@ -3,6 +3,7 @@ # Copyright (c) Megvii Inc. All rights reserved. import math +import time from loguru import logger import torch @@ -10,10 +11,12 @@ import torch.nn.functional as F from yolox.utils import bboxes_iou, cxcywh2xyxy, meshgrid, visualize_assign +from yolox.utils.device_utils import get_current_device, get_current_device_type, get_xla_model, parse_dtype from .losses import IOUloss from .network_blocks import BaseConv, DWConv +xm = get_xla_model() class YOLOXHead(nn.Module): def __init__( @@ -152,7 +155,7 @@ def forward(self, xin, labels=None, imgs=None): x = self.stems[k](x) cls_x = x reg_x = x - + cls_feat = cls_conv(cls_x) cls_output = self.cls_preds[k](cls_feat) @@ -187,12 +190,13 @@ def forward(self, xin, labels=None, imgs=None): output = torch.cat( [reg_output, obj_output.sigmoid(), cls_output.sigmoid()], 1 ) + if xm: + xm.mark_step() outputs.append(output) - + if self.training: return self.get_losses( - imgs, x_shifts, y_shifts, expanded_strides, @@ -213,6 +217,7 @@ def forward(self, xin, labels=None, imgs=None): return outputs def get_output_and_grid(self, output, k, stride, dtype): + device, dtype = parse_dtype(dtype) grid = self.grids[k] batch_size = output.shape[0] @@ -220,7 +225,7 @@ def get_output_and_grid(self, output, k, stride, dtype): hsize, wsize = output.shape[-2:] if grid.shape[2:4] != output.shape[2:4]: yv, xv = meshgrid([torch.arange(hsize), torch.arange(wsize)]) - grid = torch.stack((xv, yv), 2).view(1, 1, hsize, wsize, 2).type(dtype) + grid = torch.stack((xv, yv), 2).view(1, 1, 
hsize, wsize, 2).to(device=device, dtype=dtype) self.grids[k] = grid output = output.view(batch_size, 1, n_ch, hsize, wsize) @@ -233,6 +238,8 @@ def get_output_and_grid(self, output, k, stride, dtype): return output, grid def decode_outputs(self, outputs, dtype): + device, dtype = parse_dtype(dtype) + grids = [] strides = [] for (hsize, wsize), stride in zip(self.hw, self.strides): @@ -242,9 +249,9 @@ def decode_outputs(self, outputs, dtype): shape = grid.shape[:2] strides.append(torch.full((*shape, 1), stride)) - grids = torch.cat(grids, dim=1).type(dtype) - strides = torch.cat(strides, dim=1).type(dtype) - + grids = torch.cat(grids, dim=1).to(device=device, dtype=dtype) + strides = torch.cat(strides, dim=1).to(device=device, dtype=dtype) + outputs = torch.cat([ (outputs[..., 0:2] + grids) * strides, torch.exp(outputs[..., 2:4]) * strides, @@ -254,7 +261,6 @@ def decode_outputs(self, outputs, dtype): def get_losses( self, - imgs, x_shifts, y_shifts, expanded_strides, @@ -263,6 +269,16 @@ def get_losses( origin_preds, dtype, ): + if xm: + xm.mark_step() + outputs = outputs.cpu() + labels = labels.cpu() + x_shifts = [ t.cpu() for t in x_shifts] + y_shifts = [ t.cpu() for t in y_shifts] + expanded_strides = [ t.cpu() for t in expanded_strides] + if self.use_l1: + origin_preds = [ t.cpu() for t in origin_preds] + bbox_preds = outputs[:, :, :4] # [batch, n_anchors_all, 4] obj_preds = outputs[:, :, 4:5] # [batch, n_anchors_all, 1] cls_preds = outputs[:, :, 5:] # [batch, n_anchors_all, n_cls] @@ -285,7 +301,7 @@ def get_losses( num_fg = 0.0 num_gts = 0.0 - + for batch_idx in range(outputs.shape[0]): num_gt = int(nlabel[batch_idx]) num_gts += num_gt @@ -299,7 +315,7 @@ def get_losses( gt_bboxes_per_image = labels[batch_idx, :num_gt, 1:5] gt_classes = labels[batch_idx, :num_gt, 0] bboxes_preds_per_image = bbox_preds[batch_idx] - + try: ( gt_matched_classes, @@ -318,10 +334,10 @@ def get_losses( y_shifts, cls_preds, obj_preds, + mode=get_current_device_type() ) - except 
RuntimeError as e: - # TODO: the string might change, consider a better way - if "CUDA out of memory. " not in str(e): + except RuntimeError as e: + if xm or "CUDA out of memory. " not in str(e): raise # RuntimeError might not caused by CUDA OOM logger.error( @@ -329,7 +345,10 @@ def get_losses( CPU mode is applied in this batch. If you want to avoid this issue, \ try to reduce the batch size or image size." ) - torch.cuda.empty_cache() + + if torch.cuda.is_available(): + torch.cuda.empty_cache() + ( gt_matched_classes, fg_mask, @@ -350,7 +369,8 @@ def get_losses( "cpu", ) - torch.cuda.empty_cache() + if torch.cuda.is_available(): + torch.cuda.empty_cache() num_fg += num_fg_img cls_target = F.one_hot( @@ -432,11 +452,10 @@ def get_assignments( y_shifts, cls_preds, obj_preds, - mode="gpu", - ): - - if mode == "cpu": - print("-----------Using CPU for the Current Batch-------------") + mode=None, + ): + + if mode == "cpu" or mode =="xla": gt_bboxes_per_image = gt_bboxes_per_image.cpu().float() bboxes_preds_per_image = bboxes_preds_per_image.cpu().float() gt_classes = gt_classes.cpu().float() @@ -456,10 +475,10 @@ def get_assignments( obj_preds_ = obj_preds[batch_idx][fg_mask] num_in_boxes_anchor = bboxes_preds_per_image.shape[0] - if mode == "cpu": + if mode == "cpu" or mode =="xla": gt_bboxes_per_image = gt_bboxes_per_image.cpu() bboxes_preds_per_image = bboxes_preds_per_image.cpu() - + pair_wise_ious = bboxes_iou(gt_bboxes_per_image, bboxes_preds_per_image, False) gt_cls_per_image = ( @@ -468,10 +487,10 @@ def get_assignments( ) pair_wise_ious_loss = -torch.log(pair_wise_ious + 1e-8) - if mode == "cpu": + if mode == "cpu" or mode =="xla": cls_preds_, obj_preds_ = cls_preds_.cpu(), obj_preds_.cpu() - with torch.cuda.amp.autocast(enabled=False): + with torch.autocast(get_current_device_type(), enabled=False): cls_preds_ = ( cls_preds_.float().sigmoid_() * obj_preds_.float().sigmoid_() ).sqrt() @@ -496,11 +515,13 @@ def get_assignments( ) = self.simota_matching(cost, 
pair_wise_ious, gt_classes, num_gt, fg_mask) del pair_wise_cls_loss, cost, pair_wise_ious, pair_wise_ious_loss + # do not move back for mode == "xla" if mode == "cpu": - gt_matched_classes = gt_matched_classes.cuda() - fg_mask = fg_mask.cuda() - pred_ious_this_matching = pred_ious_this_matching.cuda() - matched_gt_inds = matched_gt_inds.cuda() + device = get_current_device() + gt_matched_classes = gt_matched_classes.to(device=device) + fg_mask = fg_mask.to(device=device) + pred_ious_this_matching = pred_ious_this_matching.to(device=device) + matched_gt_inds = matched_gt_inds.to(device=device) return ( gt_matched_classes, @@ -542,7 +563,7 @@ def get_geometry_constraint( return anchor_filter, geometry_relation def simota_matching(self, cost, pair_wise_ious, gt_classes, num_gt, fg_mask): - matching_matrix = torch.zeros_like(cost, dtype=torch.uint8) + matching_matrix = torch.zeros_like(cost, dtype=torch.uint8).to(device=pair_wise_ious.device) n_candidate_k = min(10, pair_wise_ious.size(1)) topk_ious, _ = torch.topk(pair_wise_ious, n_candidate_k, dim=1) @@ -556,7 +577,7 @@ def simota_matching(self, cost, pair_wise_ious, gt_classes, num_gt, fg_mask): del topk_ious, dynamic_ks, pos_idx anchor_matching_gt = matching_matrix.sum(0) - # deal with the case that one anchor matches multiple ground-truths + # deal with the case that one anchor matches multiple ground-truths if anchor_matching_gt.max() > 1: multiple_match_mask = anchor_matching_gt > 1 _, cost_argmin = torch.min(cost[:, multiple_match_mask], dim=0) diff --git a/yolox/models/yolo_pafpn.py b/yolox/models/yolo_pafpn.py index 4c4e18a5c..99f5985f1 100644 --- a/yolox/models/yolo_pafpn.py +++ b/yolox/models/yolo_pafpn.py @@ -5,9 +5,12 @@ import torch import torch.nn as nn +from yolox.utils.device_utils import get_xla_model + from .darknet import CSPDarknet from .network_blocks import BaseConv, CSPLayer, DWConv +xm = get_xla_model() class YOLOPAFPN(nn.Module): """ @@ -90,6 +93,9 @@ def forward(self, input): """ # backbone 
+ if xm: + xm.mark_step() + out_features = self.backbone(input) features = [out_features[f] for f in self.in_features] [x2, x1, x0] = features @@ -113,4 +119,8 @@ def forward(self, input): pan_out0 = self.C3_n4(p_out0) # 1024->1024/32 outputs = (pan_out2, pan_out1, pan_out0) + + if xm: + xm.mark_step() + return outputs diff --git a/yolox/models/yolox.py b/yolox/models/yolox.py index 744ceea81..42bc0a034 100644 --- a/yolox/models/yolox.py +++ b/yolox/models/yolox.py @@ -4,9 +4,12 @@ import torch.nn as nn +from yolox.utils.device_utils import get_xla_model + from .yolo_head import YOLOXHead from .yolo_pafpn import YOLOPAFPN +xm = get_xla_model() class YOLOX(nn.Module): """ @@ -27,6 +30,10 @@ def __init__(self, backbone=None, head=None): def forward(self, x, targets=None): # fpn output content features of [dark3, dark4, dark5] + + if xm: + xm.mark_step() + fpn_outs = self.backbone(x) if self.training: @@ -45,6 +52,9 @@ def forward(self, x, targets=None): else: outputs = self.head(fpn_outs) + if xm: + xm.mark_step() + return outputs def visualize(self, x, targets, save_prefix="assign_vis_"): diff --git a/yolox/utils/allreduce_norm.py b/yolox/utils/allreduce_norm.py index 142c76c78..4adf6dc65 100644 --- a/yolox/utils/allreduce_norm.py +++ b/yolox/utils/allreduce_norm.py @@ -5,11 +5,12 @@ import pickle from collections import OrderedDict +from yolox.utils.device_utils import get_current_device import torch from torch import distributed as dist from torch import nn -from .dist import _get_global_gloo_group, get_world_size +from yolox.utils.dist import _get_global_gloo_group, get_world_size ASYNC_NORM = ( nn.BatchNorm1d, @@ -38,7 +39,8 @@ def get_async_norm_states(module): return async_norm_states -def pyobj2tensor(pyobj, device="cuda"): +def pyobj2tensor(pyobj): + device = get_current_device() """serialize picklable python object to tensor""" storage = torch.ByteStorage.from_buffer(pickle.dumps(pyobj)) return torch.ByteTensor(storage).to(device=device) @@ -83,7 +85,7 @@ 
def all_reduce(py_dict, op="sum", group=None): tensor_numels = [py_dict[k].numel() for k in py_key] flatten_tensor = torch.cat([py_dict[k].flatten() for k in py_key]) - dist.all_reduce(flatten_tensor, op=_get_reduce_op(op)) + dist.all_reduce(flatten_tensor, op=_get_reduce_op(op), group=group) if op == "mean": flatten_tensor /= world_size diff --git a/yolox/utils/boxes.py b/yolox/utils/boxes.py index f71e8d90b..07b27cf8e 100644 --- a/yolox/utils/boxes.py +++ b/yolox/utils/boxes.py @@ -6,6 +6,8 @@ import torch import torchvision +from yolox.utils.device_utils import parse_dtype + __all__ = [ "filter_box", "postprocess", @@ -97,7 +99,8 @@ def bboxes_iou(bboxes_a, bboxes_b, xyxy=True): area_a = torch.prod(bboxes_a[:, 2:], 1) area_b = torch.prod(bboxes_b[:, 2:], 1) - en = (tl < br).type(tl.type()).prod(dim=2) + device, dtype = parse_dtype(tl.type()) + en = (tl < br).to(device=device, dtype=dtype).prod(dim=2) area_i = torch.prod(br - tl, 2) * en # * ((tl < br).all()) return area_i / (area_a[:, None] + area_b - area_i) diff --git a/yolox/utils/device_utils.py b/yolox/utils/device_utils.py new file mode 100644 index 000000000..ab419d3e2 --- /dev/null +++ b/yolox/utils/device_utils.py @@ -0,0 +1,156 @@ +import os +import random +import warnings + +from typing import Union +import torch + +try: + import torch_xla.core.xla_model as xm + import torch_xla.runtime as xr + import torch_xla.distributed.xla_backend as xb + + compiler_cache_path = os.getenv("XLA_CACHE_DIR", "./cache") + os.makedirs(compiler_cache_path, exist_ok=True) + try: + xr.initialize_cache(compiler_cache_path, readonly=False) + except AttributeError as e: + warnings.warn(f"can not set XLA cache dir: {e}") + +except ImportError: + xm = None + xr = None + xb = None + +def get_xla_model(): + return xm + + +def get_xla_runtime(): + return xr + + +def get_current_device() -> torch.device: + global __current_device + + try: + return __current_device + except NameError: + if xm is not None: + __current_device = 
xm.xla_device() + elif torch.cuda.is_available(): + local_rank = int(os.getenv("LOCAL_RANK", 0)) + __current_device = torch.device(f'cuda:{local_rank}') + torch.cuda.set_device(__current_device) + else: + device = os.getenv("DEFAULT_DEVICE", "cpu") + __current_device = torch.device(device) + + return __current_device + + +def get_current_device_type() -> str: + global __current_device_type + + try: + return __current_device_type + except NameError: + if xm is not None: + __current_device_type = "xla" + elif torch.cuda.is_available(): + __current_device_type = "cuda" + else: + __current_device_type = os.getenv("DEFAULT_DEVICE_TYPE", "cpu") + + return __current_device_type + + +def get_local_device_count() -> int: + device_count = 1 + + if xr is not None: + device_count = xr.global_device_count() + elif torch.cuda.is_available(): + device_count = torch.cuda.device_count() + + return device_count + + +def get_distributed_backend(backend=None) -> str: + if xm is not None: + backend = "xla" + elif torch.cuda.is_available(): + backend = backend if backend is not None else "nccl" + else: + backend = backend if backend is not None else "gloo" + + return backend + + +def get_distributed_init_method() -> str: + if xm is not None: + init_method = 'xla://' + else: + init_method = "env://" + + return init_method + + +def get_current_rng_state() -> Union[torch.Tensor, int]: + if torch.cuda.is_available(): + rng_state = torch.cuda.get_rng_state(device=get_current_device()) + elif xm: + rng_state = xm.get_rng_state(device=get_current_device()) + else: + rng_state = torch.get_rng_state() + + return rng_state + + +def set_manual_seed(seed: int): + random.seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + elif xm is not None: + xm.set_rng_state(seed, device=get_current_device()) + else: + torch.manual_seed(seed) + + +def set_current_rng_state(new_state): + if torch.cuda.is_available(): + new_state = new_state.type(torch.ByteTensor) + 
torch.cuda.set_rng_state(new_state, device=get_current_device()) + elif xm is not None: + new_state = int(new_state) + xm.set_rng_state(new_state, device=get_current_device()) + else: + new_state = new_state.type(torch.ByteTensor) + torch.set_rng_state(new_state) + +if xb: + def make_send_channel_id_impl(self, dst_rank, tag): + return int(dst_rank)*2 + + def make_recv_channel_id_impl(self, src_rank, tag): + return int(src_rank)*3 + + xb.ProcessGroupXla.make_send_channel_id = make_send_channel_id_impl + xb.ProcessGroupXla.make_recv_channel_id = make_recv_channel_id_impl + +def parse_dtype(dtype: str): + d, t = dtype.rsplit(".", 1) + + assert d in ['torch', 'torch.cuda', 'torch.xla'] + assert t in [ 'FloatTensor', 'HalfTensor', 'BFloat16Tensor'] + + if t == "FloatTensor": + dtype = torch.float32 + elif t == "HalfTensor": + dtype = torch.float16 + elif t == "BFloat16Tensor": + dtype = torch.bfloat16 + + device = torch.device("cpu") if d == "torch" else get_current_device() + + return device, dtype diff --git a/yolox/utils/dist.py b/yolox/utils/dist.py index 9e8fea933..4160ec5a0 100644 --- a/yolox/utils/dist.py +++ b/yolox/utils/dist.py @@ -20,9 +20,9 @@ import torch from torch import distributed as dist +from yolox.utils.device_utils import get_current_device, get_distributed_backend, get_distributed_init_method, get_local_device_count, get_xla_model, xm __all__ = [ - "get_num_devices", "wait_for_the_master", "is_main_process", "synchronize", @@ -35,18 +35,8 @@ "all_gather", ] -_LOCAL_PROCESS_GROUP = None - - -def get_num_devices(): - gpu_list = os.getenv('CUDA_VISIBLE_DEVICES', None) - if gpu_list is not None: - return len(gpu_list.split(',')) - else: - devices_list_info = os.popen("nvidia-smi -L") - devices_list_info = devices_list_info.read().strip().split("\n") - return len(devices_list_info) - +__DEFAULT_GLOO_GROUP = None +xm = get_xla_model() @contextmanager def wait_for_the_master(local_rank: int = None): @@ -61,7 +51,7 @@ def wait_for_the_master(local_rank: 
int = None):
         local_rank = get_local_rank()
 
     if local_rank > 0:
-        dist.barrier()
+        barrier()
     yield
     if local_rank == 0:
         if not dist.is_available():
@@ -69,10 +59,10 @@ def wait_for_the_master(local_rank: int = None):
         if not dist.is_initialized():
             return
         else:
-            dist.barrier()
+            barrier()
 
 
-def synchronize():
+def synchronize(group=None):
     """
     Helper function to synchronize (barrier) among all processes when using distributed training
     """
@@ -80,10 +70,11 @@ def synchronize():
         return
     if not dist.is_initialized():
         return
-    world_size = dist.get_world_size()
+    world_size = dist.get_world_size(group=group)
     if world_size == 1:
         return
-    dist.barrier()
+
+    barrier(group=group)
 
 
 def get_world_size() -> int:
@@ -105,16 +96,9 @@ def get_rank() -> int:
 def get_local_rank() -> int:
     """
     Returns:
-        The rank of the current process within the local (per-machine) process group.
+        The rank of the current process within the local machine
     """
-    if _LOCAL_PROCESS_GROUP is None:
-        return get_rank()
-
-    if not dist.is_available():
-        return 0
-    if not dist.is_initialized():
-        return 0
-    return dist.get_rank(group=_LOCAL_PROCESS_GROUP)
+    return int(os.getenv("LOCAL_RANK", 0))
 
 
 def get_local_size() -> int:
@@ -122,33 +106,24 @@ def get_local_size() -> int:
     Returns:
         The size of the per-machine process group, i.e. the number of processes per machine.
     """
-    if not dist.is_available():
-        return 1
-    if not dist.is_initialized():
-        return 1
-    return dist.get_world_size(group=_LOCAL_PROCESS_GROUP)
+    return get_local_device_count()
 
 
 def is_main_process() -> bool:
     return get_rank() == 0
 
-
-@functools.lru_cache()
 def _get_global_gloo_group():
     """
     Return a process group based on gloo backend, containing all the ranks
     The result is cached.
     """
-    if dist.get_backend() == "nccl":
-        return dist.new_group(backend="gloo")
-    else:
-        return dist.group.WORLD
-
-
+    global __DEFAULT_GLOO_GROUP
+    assert __DEFAULT_GLOO_GROUP is not None, "Gloo group is not initialized"
+    return __DEFAULT_GLOO_GROUP
+
 def _serialize_to_tensor(data, group):
     backend = dist.get_backend(group)
-    assert backend in ["gloo", "nccl"]
-    device = torch.device("cpu" if backend == "gloo" else "cuda")
+    device = torch.device("cpu") if backend == "gloo" else get_current_device()
 
     buffer = pickle.dumps(data)
     if len(buffer) > 1024 ** 3:
@@ -211,8 +186,9 @@ def all_gather(data, group=None):
         return [data]
 
     tensor = _serialize_to_tensor(data, group)
-
+    synchronize(group=group)
     size_list, tensor = _pad_to_largest_tensor(tensor, group)
+    synchronize(group=group)
     max_size = max(size_list)
 
     # receiving Tensor from all ranks
@@ -246,14 +222,16 @@ def gather(data, dst=0, group=None):
     """
     if get_world_size() == 1:
         return [data]
-    if group is None:
-        group = _get_global_gloo_group()
     if dist.get_world_size(group=group) == 1:
         return [data]
+    if group is None:
+        group = _get_global_gloo_group()
     rank = dist.get_rank(group=group)
 
     tensor = _serialize_to_tensor(data, group)
+    synchronize(group=group)
     size_list, tensor = _pad_to_largest_tensor(tensor, group)
+    synchronize(group=group)
 
     # receiving Tensor from all ranks
     if rank == dst:
@@ -292,3 +270,34 @@ def time_synchronized():
     if torch.cuda.is_available():
         torch.cuda.synchronize()
     return time.time()
+
+
+def barrier(group=None):
+    dist.barrier(group=group)
+
+def init_distributed(world_size: int, rank: int):
+
+    if not dist.is_initialized():
+        init_method = get_distributed_init_method()
+        backend = get_distributed_backend()
+
+        dist.init_process_group(backend=backend,
+                                world_size=world_size,
+                                rank=rank,
+                                init_method=init_method)
+
+    global __DEFAULT_GLOO_GROUP
+    if __DEFAULT_GLOO_GROUP is None:
+        __DEFAULT_GLOO_GROUP = dist.new_group(backend="gloo")
+
+def deinit_distributed():
+    if dist.is_initialized():
+        global __DEFAULT_GLOO_GROUP
+        try:
+            if __DEFAULT_GLOO_GROUP is not None:
+                dist.destroy_process_group(group=__DEFAULT_GLOO_GROUP)
+        except Exception as e:
+            logger.warning(f"Error: {e}")
+        finally:
+            __DEFAULT_GLOO_GROUP = None  # drop the stale handle so init_distributed() creates a fresh gloo group
+            dist.destroy_process_group()
\ No newline at end of file
diff --git a/yolox/utils/metric.py b/yolox/utils/metric.py
index 506b58281..d925aae68 100644
--- a/yolox/utils/metric.py
+++ b/yolox/utils/metric.py
@@ -11,6 +11,8 @@
 
 import torch
 
+from yolox.utils.device_utils import get_current_device
+
 __all__ = [
     "AverageMeter",
     "MeterBuffer",
@@ -22,6 +24,8 @@
 
 
 def get_total_and_free_memory_in_Mb(cuda_device):
+    assert torch.cuda.is_available()
+
     devices_info_str = os.popen(
         "nvidia-smi --query-gpu=memory.total,memory.used --format=csv,nounits,noheader"
     )
@@ -37,10 +41,12 @@ def occupy_mem(cuda_device, mem_ratio=0.9):
     """
     pre-allocate gpu memory for training to avoid memory Fragmentation.
     """
+    assert torch.cuda.is_available()
+
     total, used = get_total_and_free_memory_in_Mb(cuda_device)
     max_mem = int(total * mem_ratio)
     block_mem = max_mem - used
-    x = torch.cuda.FloatTensor(256, 1024, block_mem)
+    x = torch.empty((256, 1024, block_mem), dtype=torch.float32, device=get_current_device())  # 256*1024 float32 = 1 MiB per unit of block_mem, same shape as the removed FloatTensor call
     del x
     time.sleep(5)
 
@@ -49,6 +55,8 @@ def gpu_mem_usage():
     """
     Compute the GPU memory usage for the current device (MB).
     """
+    assert torch.cuda.is_available()
+
     mem_usage_bytes = torch.cuda.max_memory_allocated()
     return mem_usage_bytes / (1024 * 1024)