diff --git a/.gitignore b/.gitignore
index 9842565a1..f7d827755 100644
--- a/.gitignore
+++ b/.gitignore
@@ -226,3 +226,7 @@ events.out.tfevents*
 .Trashes
 ehthumbs.db
 Thumbs.db
+
+cache
+*.out
+*.txt
diff --git a/Dockerfile.cuda b/Dockerfile.cuda
new file mode 100644
index 000000000..0bdfc2d97
--- /dev/null
+++ b/Dockerfile.cuda
@@ -0,0 +1,19 @@
+FROM nvcr.io/nvidia/pytorch:24.09-py3
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV DEBCONF_NONINTERACTIVE_SEEN=true
+
+# Keeps Python from generating .pyc files in the container
+ENV PYTHONDONTWRITEBYTECODE=1
+
+# Turns off buffering for easier container logging
+ENV PYTHONUNBUFFERED=1
+
+COPY . /yolox-x
+RUN pip3 install --upgrade pip
+RUN pip3 install -v -e /yolox-x
+RUN pip3 install opencv-python==4.8.0.74
+WORKDIR /app
+
+# During debugging, this entry point will be overridden. For more information, please refer to https://aka.ms/vscode-docker-python-debug
+CMD ["python", "tools/train.py"]
\ No newline at end of file
diff --git a/Dockerfile.neuron b/Dockerfile.neuron
new file mode 100644
index 000000000..eeede27fd
--- /dev/null
+++ b/Dockerfile.neuron
@@ -0,0 +1,21 @@
+FROM public.ecr.aws/neuron/pytorch-training-neuronx:2.5.1-neuronx-py310-sdk2.22.0-ubuntu22.04
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV DEBCONF_NONINTERACTIVE_SEEN=true
+
+# Keeps Python from generating .pyc files in the container
+ENV PYTHONDONTWRITEBYTECODE=1
+
+# Turns off buffering for easier container logging
+ENV PYTHONUNBUFFERED=1
+
+ENV PJRT_DEVICE=NEURON
+
+RUN apt-get update && apt-get -y install python3-opencv
+COPY . /yolox-x
+RUN pip3 install -v -e /yolox-x
+RUN pip3 install protobuf==3.20.3
+WORKDIR /app
+
+# During debugging, this entry point will be overridden. For more information, please refer to https://aka.ms/vscode-docker-python-debug
+CMD ["python", "tools/train.py"]
\ No newline at end of file
diff --git a/Dockerfile.xla b/Dockerfile.xla
new file mode 100644
index 000000000..4599d53da
--- /dev/null
+++ b/Dockerfile.xla
@@ -0,0 +1,20 @@
+FROM us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.5.0_3.10_cuda_12.1
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV DEBCONF_NONINTERACTIVE_SEEN=true
+
+# Keeps Python from generating .pyc files in the container
+ENV PYTHONDONTWRITEBYTECODE=1
+
+# Turns off buffering for easier container logging
+ENV PYTHONUNBUFFERED=1
+
+ENV PJRT_DEVICE=CUDA
+
+RUN apt-get update && apt-get -y install python3-opencv
+COPY . /yolox-x
+RUN pip3 install -v -e /yolox-x
+WORKDIR /app
+
+# During debugging, this entry point will be overridden. For more information, please refer to https://aka.ms/vscode-docker-python-debug
+CMD ["python", "tools/train.py"]
\ No newline at end of file
diff --git a/README.md b/README.md
index f83aa3c00..a4c1d4c87 100644
--- a/README.md
+++ b/README.md
@@ -66,11 +66,13 @@ This repo is an implementation of PyTorch version YOLOX, there is also a [MegEng
 <details>
 <summary>Installation</summary>
 
+Create and activate a `conda` environment or `virtualenv` with Python 3.10, then install `torch` 2.4.0 and a matching `torchvision` in it.
+
 Step1. Install YOLOX from source.
 ```shell
-git clone git@github.com:Megvii-BaseDetection/YOLOX.git
-cd YOLOX
-pip3 install -v -e .  # or  python3 setup.py develop
+git clone https://github.com/ajayvohra2005/YOLOX-x.git
+cd YOLOX-x
+pip3 install -v -e . 
 ```
 
 </details>
@@ -83,79 +85,34 @@ Step1. Download a pretrained model from the benchmark table.
 Step2. Use either -n or -f to specify your detector's config. For example:
 
 ```shell
-python tools/demo.py image -n yolox-s -c /path/to/your/yolox_s.pth --path assets/dog.jpg --conf 0.25 --nms 0.45 --tsize 640 --save_result --device [cpu/gpu]
+python tools/demo.py image -n yolox-s -c /path/to/your/yolox_s.pth --path assets/dog.jpg --conf 0.25 --nms 0.45 --tsize 640 --save_result
 ```
 or
 ```shell
-python tools/demo.py image -f exps/default/yolox_s.py -c /path/to/your/yolox_s.pth --path assets/dog.jpg --conf 0.25 --nms 0.45 --tsize 640 --save_result --device [cpu/gpu]
+python tools/demo.py image -f exps/default/yolox_s.py -c /path/to/your/yolox_s.pth --path assets/dog.jpg --conf 0.25 --nms 0.45 --tsize 640 --save_result
 ```
 Demo for video:
 ```shell
-python tools/demo.py video -n yolox-s -c /path/to/your/yolox_s.pth --path /path/to/your/video --conf 0.25 --nms 0.45 --tsize 640 --save_result --device [cpu/gpu]
+python tools/demo.py video -n yolox-s -c /path/to/your/yolox_s.pth --path /path/to/your/video --conf 0.25 --nms 0.45 --tsize 640 --save_result
 ```
 
 
 </details>
 
 <details>
-<summary>Reproduce our results on COCO</summary>
-
-Step1. Prepare COCO dataset
-```shell
-cd <YOLOX_HOME>
-ln -s /path/to/your/COCO ./datasets/COCO
-```
-
-Step2. Reproduce our results on COCO by specifying -n:
-
-```shell
-python -m yolox.tools.train -n yolox-s -d 8 -b 64 --fp16 -o [--cache]
-                               yolox-m
-                               yolox-l
-                               yolox-x
-```
-* -d: number of gpu devices
-* -b: total batch size, the recommended number for -b is num-gpu * 8
-* --fp16: mixed precision training
-* --cache: caching imgs into RAM to accelarate training, which need large system RAM.
+<summary>Train on COCO</summary>
 
+    cd /path/to/YOLOX-x
 
+Edit the `run-cuda.sh` script to set `YOLOX_DATADIR` to your datasets directory, which must contain a `COCO` folder with the COCO2017 dataset. Change the model name (default `yolox-s`) as needed, then run:
 
-When using -f, the above commands are equivalent to:
-```shell
-python -m yolox.tools.train -f exps/default/yolox_s.py -d 8 -b 64 --fp16 -o [--cache]
-                               exps/default/yolox_m.py
-                               exps/default/yolox_l.py
-                               exps/default/yolox_x.py
-```
-
-**Multi Machine Training**
+    ./run-cuda.sh
 
-We also support multi-nodes training. Just add the following args:
-* --num\_machines: num of your total training nodes
-* --machine\_rank: specify the rank of each node
-
-Suppose you want to train YOLOX on 2 machines, and your master machines's IP is 123.123.123.123, use port 12312 and TCP.
-
-On master machine, run
-```shell
-python tools/train.py -n yolox-s -b 128 --dist-url tcp://123.123.123.123:12312 --num_machines 2 --machine_rank 0
-```
-On the second machine, run
-```shell
-python tools/train.py -n yolox-s -b 128 --dist-url tcp://123.123.123.123:12312 --num_machines 2 --machine_rank 1
-```
 
 **Logging to Weights & Biases**
 
 To log metrics, predictions and model checkpoints to [W&B](https://docs.wandb.ai/guides/integrations/other/yolox) use the command line argument `--logger wandb` and use the prefix "wandb-" to specify arguments for initializing the wandb run.
 
-```shell
-python tools/train.py -n yolox-s -d 8 -b 64 --fp16 -o [--cache] --logger wandb wandb-project <project name>
-                         yolox-m
-                         yolox-l
-                         yolox-x
-```
 
 An example wandb dashboard is available [here](https://wandb.ai/manan-goel/yolox-nano/runs/3pzfeom0)
 
@@ -175,7 +132,7 @@ python -m yolox.tools.train --help
 We support batch testing for fast evaluation:
 
 ```shell
-python -m yolox.tools.eval -n  yolox-s -c yolox_s.pth -b 64 -d 8 --conf 0.001 [--fp16] [--fuse]
+torchrun --standalone --nproc_per_node=8 -m yolox.tools.eval -n yolox-s -c yolox_s.pth -b 64 --conf 0.001 [--fp16] [--fuse]
                                yolox-m
                                yolox-l
                                yolox-x
@@ -186,7 +143,7 @@ python -m yolox.tools.eval -n  yolox-s -c yolox_s.pth -b 64 -d 8 --conf 0.001 [-
 
 To reproduce speed test, we use the following command:
 ```shell
-python -m yolox.tools.eval -n  yolox-s -c yolox_s.pth -b 1 -d 1 --conf 0.001 --fp16 --fuse
+python -m yolox.tools.eval -n  yolox-s -c yolox_s.pth -b 1  --conf 0.001 --fp16 --fuse
                                yolox-m
                                yolox-l
                                yolox-x
diff --git a/demo/MegEngine/python/models/darknet.py b/demo/MegEngine/python/models/darknet.py
index 47469aa68..a896e8610 100644
--- a/demo/MegEngine/python/models/darknet.py
+++ b/demo/MegEngine/python/models/darknet.py
@@ -3,9 +3,11 @@
 # Copyright (c) Megvii Inc. All rights reserved.
 
 import megengine.module as M
+from yolox.utils.device_utils import get_xla_model
 
 from .network_blocks import BaseConv, CSPLayer, DWConv, Focus, ResLayer, SPPBottleneck
 
+xm = get_xla_model()
 
 class Darknet(M.Module):
     # number of blocks from dark2 to dark5.
@@ -70,6 +72,10 @@ def make_spp_block(self, filters_list, in_filters):
         return m
 
     def forward(self, x):
+
+        if xm:
+            xm.mark_step()
+
         outputs = {}
         x = self.stem(x)
         outputs["stem"] = x
@@ -81,6 +87,10 @@ def forward(self, x):
         outputs["dark4"] = x
         x = self.dark5(x)
         outputs["dark5"] = x
+
+        if xm:
+            xm.mark_step()
+
         return {k: v for k, v in outputs.items() if k in self.out_features}
 
 
@@ -140,6 +150,10 @@ def __init__(
         )
 
     def forward(self, x):
+
+        if xm:
+            xm.mark_step()
+
         outputs = {}
         x = self.stem(x)
         outputs["stem"] = x
@@ -151,4 +165,8 @@ def forward(self, x):
         outputs["dark4"] = x
         x = self.dark5(x)
         outputs["dark5"] = x
+
+        if xm:
+            xm.mark_step()
+            
         return {k: v for k, v in outputs.items() if k in self.out_features}
diff --git a/demo/MegEngine/python/models/yolo_fpn.py b/demo/MegEngine/python/models/yolo_fpn.py
index 675a7f6e6..af6c2727c 100644
--- a/demo/MegEngine/python/models/yolo_fpn.py
+++ b/demo/MegEngine/python/models/yolo_fpn.py
@@ -4,11 +4,14 @@
 
 import megengine.functional as F
 import megengine.module as M
+from yolox.utils.device_utils import get_xla_model
 
 from .darknet import Darknet
 from .network_blocks import BaseConv, UpSample
 
 
+xm = get_xla_model()
+
 class YOLOFPN(M.Module):
     """
     YOLOFPN module. Darknet 53 is the default backbone of this model.
@@ -59,6 +62,9 @@ def forward(self, inputs):
             Tuple[Tensor]: FPN output features..
         """
         #  backbone
+        if xm:
+            xm.mark_step()
+            
         out_features = self.backbone(inputs)
         x2, x1, x0 = [out_features[f] for f in self.in_features]
 
@@ -75,4 +81,8 @@ def forward(self, inputs):
         out_dark3 = self.out2(x2_in)
 
         outputs = (out_dark3, out_dark4, x0)
+
+        if xm:
+            xm.mark_step()
+
         return outputs
diff --git a/demo/MegEngine/python/models/yolo_head.py b/demo/MegEngine/python/models/yolo_head.py
index 7bba674d5..9c7b37d43 100644
--- a/demo/MegEngine/python/models/yolo_head.py
+++ b/demo/MegEngine/python/models/yolo_head.py
@@ -4,6 +4,7 @@
 
 import megengine.functional as F
 import megengine.module as M
+from yolox.utils.device_utils import parse_dtype
 
 from .network_blocks import BaseConv, DWConv
 
@@ -154,14 +155,16 @@ def forward(self, xin, labels=None, imgs=None):
             return outputs
 
     def get_output_and_grid(self, output, k, stride, dtype):
-        grid = self.grids[k]
 
+        device, dtype = parse_dtype(dtype)
+        grid = self.grids[k]
+       
         batch_size = output.shape[0]
         n_ch = 5 + self.num_classes
         hsize, wsize = output.shape[-2:]
         if grid.shape[2:4] != output.shape[2:4]:
             yv, xv = meshgrid([F.arange(hsize), F.arange(wsize)])
-            grid = F.stack((xv, yv), 2).reshape(1, 1, hsize, wsize, 2).type(dtype)
+            grid = F.stack((xv, yv), 2).reshape(1, 1, hsize, wsize, 2).to(device=device, dtype=dtype)
             self.grids[k] = grid
 
         output = output.view(batch_size, self.n_anchors, n_ch, hsize, wsize)
diff --git a/demo/MegEngine/python/models/yolo_pafpn.py b/demo/MegEngine/python/models/yolo_pafpn.py
index 86154bfa9..fb6e591d0 100644
--- a/demo/MegEngine/python/models/yolo_pafpn.py
+++ b/demo/MegEngine/python/models/yolo_pafpn.py
@@ -4,10 +4,12 @@
 
 import megengine.module as M
 import megengine.functional as F
+from yolox.utils.device_utils import get_xla_model
 
 from .darknet import CSPDarknet
 from .network_blocks import BaseConv, CSPLayer, DWConv, UpSample
 
+xm = get_xla_model()
 
 class YOLOPAFPN(M.Module):
     """
@@ -85,6 +87,10 @@ def forward(self, input):
         """
 
         #  backbone
+
+        if xm:
+            xm.mark_step()
+
         out_features = self.backbone(input)
         features = [out_features[f] for f in self.in_features]
         [x2, x1, x0] = features
@@ -108,4 +114,8 @@ def forward(self, input):
         pan_out0 = self.C3_n4(p_out0)  # 1024->1024/32
 
         outputs = (pan_out2, pan_out1, pan_out0)
+
+        if xm:
+            xm.mark_step()
+
         return outputs
diff --git a/demo/nebullvm/nebullvm_optimization.py b/demo/nebullvm/nebullvm_optimization.py
index b817baf62..b9365dad2 100644
--- a/demo/nebullvm/nebullvm_optimization.py
+++ b/demo/nebullvm/nebullvm_optimization.py
@@ -1,18 +1,19 @@
+from yolox.utils.device_utils import get_current_device
 import torch
 import time
 from nebullvm.api.functions import optimize_model # Install DL compilers
 from yolox.exp import get_exp
 
+device = get_current_device()
+
 # Get YOLO model
 exp = get_exp(None, 'yolox-s') # select model name
 model = exp.get_model()
-model.cuda()
+model.to(device=device)
 model.eval()
 
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
 # Create dummy data for the optimizer
-input_data =  [((torch.randn(1, 3, 640, 640).to(device), ), 0) for i in range(100)] 
+input_data =  [((torch.randn(1, 3, 640, 640).to(device=device), ), 0) for i in range(100)] 
 
 # ---------- Optimization ---------- 
 optimized_model = optimize_model(model, input_data=input_data, optimization_time="constrained")  # Optimization without performance loss
@@ -22,7 +23,7 @@
 # Select image to test the latency of the optimized model
 
 # Create dummy image
-img = torch.randn(1, 3, 640, 640).to(device)
+img = torch.randn(1, 3, 640, 640).to(device=device)
 
 # Check perfomance
 warmup_iters = 30
diff --git a/docker-cuda.sh b/docker-cuda.sh
new file mode 100755
index 000000000..8272e2e15
--- /dev/null
+++ b/docker-cuda.sh
@@ -0,0 +1 @@
+docker run -t -d -v /home/ubuntu/efs/datasets:/datasets -v /home/ubuntu/efs/git/YOLOX-x:/app --shm-size=16g --net=host --gpus all docker.io/library/yolox-x-cuda:latest  sleep infinity
\ No newline at end of file
diff --git a/docker-neuron.sh b/docker-neuron.sh
new file mode 100755
index 000000000..52aa59761
--- /dev/null
+++ b/docker-neuron.sh
@@ -0,0 +1,24 @@
+docker run -t -d \
+    -v /home/ubuntu/efs/datasets:/datasets \
+    -v /home/ubuntu/efs/git/YOLOX-x:/app \
+    -v /tmp:/cache \
+    --shm-size=16g \
+    --net=host \
+    --shm-size=16g \
+    --device=/dev/neuron0 \
+    --device=/dev/neuron1 \
+    --device=/dev/neuron2 \
+    --device=/dev/neuron3 \
+    --device=/dev/neuron4 \
+    --device=/dev/neuron5 \
+    --device=/dev/neuron6 \
+    --device=/dev/neuron7 \
+    --device=/dev/neuron8 \
+    --device=/dev/neuron9 \
+    --device=/dev/neuron10 \
+    --device=/dev/neuron11 \
+    --device=/dev/neuron12 \
+    --device=/dev/neuron13 \
+    --device=/dev/neuron14 \
+    --device=/dev/neuron15 \
+    docker.io/library/yolox-x-neuron:latest  sleep infinity
\ No newline at end of file
diff --git a/docker-xla-cuda.sh b/docker-xla-cuda.sh
new file mode 100755
index 000000000..56cb32a94
--- /dev/null
+++ b/docker-xla-cuda.sh
@@ -0,0 +1 @@
+docker run -t -d -v /home/ubuntu/efs/datasets:/datasets -v /home/ubuntu/efs/git/YOLOX-x:/app --shm-size=16g --net=host --gpus all docker.io/library/yolox-x-xla:latest  sleep infinity
\ No newline at end of file
diff --git a/run-cuda.sh b/run-cuda.sh
new file mode 100755
index 000000000..6ebef065d
--- /dev/null
+++ b/run-cuda.sh
@@ -0,0 +1,5 @@
+export OMP_NUM_THREADS=16 
+export LOGURU_LEVEL="INFO"
+export YOLOX_DATADIR=/datasets
+export YOLOX_OUPUT_DIR="./YOLOX_cuda_outputs"
+torchrun --standalone --nproc_per_node=8 tools/train.py -b 32 -n yolox-s
diff --git a/run-neuron.sh b/run-neuron.sh
new file mode 100755
index 000000000..66d3a4978
--- /dev/null
+++ b/run-neuron.sh
@@ -0,0 +1,12 @@
+export YOLOX_DATADIR=/datasets
+export YOLOX_OUPUT_DIR="./YOLOX_neuron_outputs"
+export OMP_NUM_THREADS=16
+export LOGURU_LEVEL="INFO"
+export NEURON_CC_FLAGS="--cache_dir=/cache --model-type=generic"
+export NEURON_RT_STOCHASTIC_ROUNDING_EN="1"
+export XLA_IR_SHAPE_CACHE_SIZE="20480"
+export XLA_IR_DEBUG=0
+export XLA_HLO_DEBUG=0
+export PT_XLA_DEBUG=0
+export PT_XLA_DEBUG_FILE=./pt_xla_debug.txt
+torchrun --standalone --nproc_per_node=32 tools/train.py -b 128 -n yolox-s
diff --git a/run-xla-cuda.sh b/run-xla-cuda.sh
new file mode 100755
index 000000000..05a857fbd
--- /dev/null
+++ b/run-xla-cuda.sh
@@ -0,0 +1,9 @@
+export YOLOX_DATADIR=/datasets
+export YOLOX_OUPUT_DIR="./YOLOX_xla_cuda_outputs"
+export OMP_NUM_THREADS=16
+export LOGURU_LEVEL=INFO
+#export XLA_IR_DEBUG=1
+#export XLA_HLO_DEBUG=1
+#export PT_XLA_DEBUG=1
+#export PT_XLA_DEBUG_FILE=./pt_xla_debug.txt
+torchrun --standalone --nproc_per_node=8 tools/train.py -b 32 -n yolox-s
diff --git a/setup.py b/setup.py
index 5fec79764..b523dc0d8 100644
--- a/setup.py
+++ b/setup.py
@@ -46,7 +46,7 @@ def get_long_description():
 
 def get_ext_modules():
     ext_module = []
-    if sys.platform != "win32":  # pre-compile ops on linux
+    if TORCH_AVAILABLE and torch.cuda.is_available() and sys.platform != "win32":  # pre-compile ops on linux
         assert TORCH_AVAILABLE, "torch is required for pre-compiling ops, please install it first."
         # if any other op is added, please also add it here
         from yolox.layers import FastCOCOEvalOp
@@ -64,8 +64,8 @@ def get_cmd_class():
 setuptools.setup(
     name="yolox",
     version=get_yolox_version(),
-    author="megvii basedet team",
-    url="https://github.com/Megvii-BaseDetection/YOLOX",
+    author="ajayvohra2005",
+    url="https://github.com/ajayvohra2005/YOLOX-x",
     package_dir=get_package_dir(),
     packages=setuptools.find_packages(exclude=("tests", "tools")) + list(get_package_dir().keys()),
     python_requires=">=3.6",
@@ -82,7 +82,7 @@ def get_cmd_class():
     ],
     project_urls={
         "Documentation": "https://yolox.readthedocs.io",
-        "Source": "https://github.com/Megvii-BaseDetection/YOLOX",
-        "Tracker": "https://github.com/Megvii-BaseDetection/YOLOX/issues",
+        "Source": "https://github.com/ajayvohra2005/YOLOX-x",
+        "Tracker": "https://github.com/ajayvohra2005/YOLOX-x/issues",
     },
 )
diff --git a/tools/demo.py b/tools/demo.py
index b16598d5f..d1ea45e3f 100644
--- a/tools/demo.py
+++ b/tools/demo.py
@@ -9,6 +9,7 @@
 
 import cv2
 
+from yolox.utils.device_utils import get_current_device
 import torch
 
 from yolox.data.data_augment import ValTransform
@@ -46,12 +47,6 @@ def make_parser():
         help="please input your experiment description file",
     )
     parser.add_argument("-c", "--ckpt", default=None, type=str, help="ckpt for eval")
-    parser.add_argument(
-        "--device",
-        default="cpu",
-        type=str,
-        help="device to run our model, can either be cpu or gpu",
-    )
     parser.add_argument("--conf", default=0.3, type=float, help="test conf")
     parser.add_argument("--nms", default=0.3, type=float, help="test nms threshold")
     parser.add_argument("--tsize", default=None, type=int, help="test img size")
@@ -105,7 +100,6 @@ def __init__(
         cls_names=COCO_CLASSES,
         trt_file=None,
         decoder=None,
-        device="cpu",
         fp16=False,
         legacy=False,
     ):
@@ -116,7 +110,6 @@ def __init__(
         self.confthre = exp.test_conf
         self.nmsthre = exp.nmsthre
         self.test_size = exp.test_size
-        self.device = device
         self.fp16 = fp16
         self.preproc = ValTransform(legacy=legacy)
         if trt_file is not None:
@@ -125,7 +118,7 @@ def __init__(
             model_trt = TRTModule()
             model_trt.load_state_dict(torch.load(trt_file))
 
-            x = torch.ones(1, 3, exp.test_size[0], exp.test_size[1]).cuda()
+            x = torch.ones(1, 3, exp.test_size[0], exp.test_size[1]).to(device=get_current_device())
             self.model(x)
             self.model = model_trt
 
@@ -148,10 +141,10 @@ def inference(self, img):
         img, _ = self.preproc(img, None, self.test_size)
         img = torch.from_numpy(img).unsqueeze(0)
         img = img.float()
-        if self.device == "gpu":
-            img = img.cuda()
-            if self.fp16:
-                img = img.half()  # to FP16
+        
+        img = img.to(device=get_current_device())
+        if self.fp16:
+            img = img.half()  # to FP16
 
         with torch.no_grad():
             t0 = time.time()
@@ -253,9 +246,6 @@ def main(exp, args):
         vis_folder = os.path.join(file_name, "vis_res")
         os.makedirs(vis_folder, exist_ok=True)
 
-    if args.trt:
-        args.device = "gpu"
-
     logger.info("Args: {}".format(args))
 
     if args.conf is not None:
@@ -268,10 +258,10 @@ def main(exp, args):
     model = exp.get_model()
     logger.info("Model Summary: {}".format(get_model_info(model, exp.test_size)))
 
-    if args.device == "gpu":
-        model.cuda()
-        if args.fp16:
-            model.half()  # to FP16
+    
+    model.to(device=get_current_device())
+    if args.fp16:
+        model.half()  # to FP16
     model.eval()
 
     if not args.trt:
@@ -304,7 +294,7 @@ def main(exp, args):
 
     predictor = Predictor(
         model, exp, COCO_CLASSES, trt_file, decoder,
-        args.device, args.fp16, args.legacy,
+        args.fp16, args.legacy,
     )
     current_time = time.localtime()
     if args.demo == "image":
diff --git a/tools/eval.py b/tools/eval.py
index 83ad76be8..e11b3864f 100644
--- a/tools/eval.py
+++ b/tools/eval.py
@@ -8,8 +8,9 @@
 import warnings
 from loguru import logger
 
+from yolox.utils.device_utils import get_current_device, get_xla_model, set_manual_seed
 import torch
-import torch.backends.cudnn as cudnn
+    
 from torch.nn.parallel import DistributedDataParallel as DDP
 
 from yolox.core import launch
@@ -23,6 +24,7 @@
     setup_logger
 )
 
+xm = get_xla_model()
 
 def make_parser():
     parser = argparse.ArgumentParser("YOLOX Eval")
@@ -30,25 +32,7 @@ def make_parser():
     parser.add_argument("-n", "--name", type=str, default=None, help="model name")
 
     # distributed
-    parser.add_argument(
-        "--dist-backend", default="nccl", type=str, help="distributed backend"
-    )
-    parser.add_argument(
-        "--dist-url",
-        default=None,
-        type=str,
-        help="url used to set up distributed training",
-    )
     parser.add_argument("-b", "--batch-size", type=int, default=64, help="batch size")
-    parser.add_argument(
-        "-d", "--devices", default=None, type=int, help="device for training"
-    )
-    parser.add_argument(
-        "--num_machines", default=1, type=int, help="num of node for training"
-    )
-    parser.add_argument(
-        "--machine_rank", default=0, type=int, help="node rank for multi-node training"
-    )
     parser.add_argument(
         "-f",
         "--exp_file",
@@ -113,20 +97,23 @@ def make_parser():
 
 
 @logger.catch
-def main(exp, args, num_gpu):
+def main(exp, args):
+    assert (not args.trt or torch.cuda.is_available()), "--trt requires CUDA"
+
     if args.seed is not None:
-        random.seed(args.seed)
-        torch.manual_seed(args.seed)
-        cudnn.deterministic = True
-        warnings.warn(
-            "You have chosen to seed testing. This will turn on the CUDNN deterministic setting, "
-        )
+        set_manual_seed(args.seed)
+        if torch.cuda.is_available():
+            torch.backends.cudnn.deterministic = True
+            warnings.warn(
+                "You have chosen to seed testing. This will turn on the CUDNN deterministic setting, "
+            )
 
-    is_distributed = num_gpu > 1
+    is_distributed = torch.distributed.is_initialized() 
 
-    # set environment variables for distributed training
-    configure_nccl()
-    cudnn.benchmark = True
+    # NCCL configuration and cuDNN benchmark mode apply only when CUDA is available
+    if torch.cuda.is_available():
+        configure_nccl()
+        torch.backends.cudnn.benchmark = True
 
     rank = get_local_rank()
 
@@ -153,8 +140,7 @@ def main(exp, args, num_gpu):
     evaluator.per_class_AP = True
     evaluator.per_class_AR = True
 
-    torch.cuda.set_device(rank)
-    model.cuda(rank)
+    model.to(device=get_current_device())
     model.eval()
 
     if not args.speed and not args.trt:
@@ -163,13 +149,17 @@ def main(exp, args, num_gpu):
         else:
             ckpt_file = args.ckpt
         logger.info("loading checkpoint from {}".format(ckpt_file))
-        loc = "cuda:{}".format(rank)
+        loc = get_current_device()
         ckpt = torch.load(ckpt_file, map_location=loc)
         model.load_state_dict(ckpt["model"])
         logger.info("loaded checkpoint done.")
 
     if is_distributed:
-        model = DDP(model, device_ids=[rank])
+        if xm:
+            xm.mark_step()
+            model = DDP(model, gradient_as_bucket_view=True)
+        else:
+            model = DDP(model)
 
     if args.fuse:
         logger.info("\tFusing model...")
@@ -205,16 +195,4 @@ def main(exp, args, num_gpu):
     if not args.experiment_name:
         args.experiment_name = exp.exp_name
 
-    num_gpu = torch.cuda.device_count() if args.devices is None else args.devices
-    assert num_gpu <= torch.cuda.device_count()
-
-    dist_url = "auto" if args.dist_url is None else args.dist_url
-    launch(
-        main,
-        num_gpu,
-        args.num_machines,
-        args.machine_rank,
-        backend=args.dist_backend,
-        dist_url=dist_url,
-        args=(exp, args, num_gpu),
-    )
+    launch(main,args=(exp, args))
diff --git a/tools/train.py b/tools/train.py
index aa98bba30..8dd39946f 100644
--- a/tools/train.py
+++ b/tools/train.py
@@ -3,16 +3,15 @@
 # Copyright (c) Megvii, Inc. and its affiliates.
 
 import argparse
-import random
 import warnings
 from loguru import logger
 
 import torch
-import torch.backends.cudnn as cudnn
 
 from yolox.core import launch
 from yolox.exp import Exp, check_exp_value, get_exp
-from yolox.utils import configure_module, configure_nccl, configure_omp, get_num_devices
+from yolox.utils import configure_module, configure_nccl, configure_omp
+from yolox.utils.device_utils import set_manual_seed
 
 
 def make_parser():
@@ -21,19 +20,7 @@ def make_parser():
     parser.add_argument("-n", "--name", type=str, default=None, help="model name")
 
     # distributed
-    parser.add_argument(
-        "--dist-backend", default="nccl", type=str, help="distributed backend"
-    )
-    parser.add_argument(
-        "--dist-url",
-        default=None,
-        type=str,
-        help="url used to set up distributed training",
-    )
     parser.add_argument("-b", "--batch-size", type=int, default=64, help="batch size")
-    parser.add_argument(
-        "-d", "--devices", default=None, type=int, help="device for training"
-    )
     parser.add_argument(
         "-f",
         "--exp_file",
@@ -52,12 +39,6 @@ def make_parser():
         type=int,
         help="resume training start epoch",
     )
-    parser.add_argument(
-        "--num_machines", default=1, type=int, help="num of node for training"
-    )
-    parser.add_argument(
-        "--machine_rank", default=0, type=int, help="node rank for multi-node training"
-    )
     parser.add_argument(
         "--fp16",
         dest="fp16",
@@ -99,20 +80,24 @@ def make_parser():
 
 @logger.catch
 def main(exp: Exp, args):
+
+    assert (not args.occupy or torch.cuda.is_available()), "--occupy requires CUDA"
+
     if exp.seed is not None:
-        random.seed(exp.seed)
-        torch.manual_seed(exp.seed)
-        cudnn.deterministic = True
-        warnings.warn(
-            "You have chosen to seed training. This will turn on the CUDNN deterministic setting, "
-            "which can slow down your training considerably! You may see unexpected behavior "
-            "when restarting from checkpoints."
-        )
-
-    # set environment variables for distributed training
-    configure_nccl()
-    configure_omp()
-    cudnn.benchmark = True
+        set_manual_seed(exp.seed)
+        if torch.cuda.is_available():
+            torch.backends.cudnn.deterministic = True
+            warnings.warn(
+                "You have chosen to seed training. This will turn on the CUDNN deterministic setting, "
+                "which can slow down your training considerably! You may see unexpected behavior "
+                "when restarting from checkpoints."
+            )
+
+    # NCCL/OpenMP configuration and cuDNN benchmark mode are applied only when CUDA is available
+    if torch.cuda.is_available():
+        configure_nccl()
+        configure_omp()
+        torch.backends.cudnn.benchmark = True
 
     trainer = exp.get_trainer(args)
     trainer.train()
@@ -128,19 +113,8 @@ def main(exp: Exp, args):
     if not args.experiment_name:
         args.experiment_name = exp.exp_name
 
-    num_gpu = get_num_devices() if args.devices is None else args.devices
-    assert num_gpu <= get_num_devices()
-
     if args.cache is not None:
+        logger.info(f"Dataset cache: {args.cache}; loading dataset before launch")
         exp.dataset = exp.get_dataset(cache=True, cache_type=args.cache)
 
-    dist_url = "auto" if args.dist_url is None else args.dist_url
-    launch(
-        main,
-        num_gpu,
-        args.num_machines,
-        args.machine_rank,
-        backend=args.dist_backend,
-        dist_url=dist_url,
-        args=(exp, args),
-    )
+    launch(main,args=(exp, args))
diff --git a/tools/trt.py b/tools/trt.py
index f2f6cee5c..b4b5f1cff 100644
--- a/tools/trt.py
+++ b/tools/trt.py
@@ -7,8 +7,10 @@
 import shutil
 from loguru import logger
 
-import tensorrt as trt
 import torch
+import tensorrt as trt
+from yolox.utils.device_utils import get_current_device
+
 from torch2trt import torch2trt
 
 from yolox.exp import get_exp
@@ -56,9 +58,9 @@ def main():
     model.load_state_dict(ckpt["model"])
     logger.info("loaded checkpoint done.")
     model.eval()
-    model.cuda()
+    model.to(device=get_current_device())
     model.head.decode_in_inference = False
-    x = torch.ones(1, 3, exp.test_size[0], exp.test_size[1]).cuda()
+    x = torch.ones(1, 3, exp.test_size[0], exp.test_size[1]).to(device=get_current_device())
     model_trt = torch2trt(
         model,
         [x],
diff --git a/tools/visualize_assign.py b/tools/visualize_assign.py
index e75a5586b..2b9bdcfc4 100644
--- a/tools/visualize_assign.py
+++ b/tools/visualize_assign.py
@@ -8,6 +8,7 @@
 import warnings
 from loguru import logger
 
+from yolox.utils.device_utils import get_current_device_type
 import torch
 import torch.backends.cudnn as cudnn
 
@@ -35,7 +36,7 @@ def train_one_iter(self):
         inps, targets = self.exp.preprocess(inps, targets, self.input_size)
         data_end_time = time.time()
 
-        with torch.cuda.amp.autocast(enabled=self.amp_training):
+        with torch.autocast(get_current_device_type(), enabled=self.amp_training):
             path_prefix = os.path.join(self.vis_dir, f"assign_vis_{self.batch_cnt}_")
             self.model.visualize(inps, targets, path_prefix)
 
diff --git a/yolox/core/launch.py b/yolox/core/launch.py
index 9f8eec61e..461d3165e 100644
--- a/yolox/core/launch.py
+++ b/yolox/core/launch.py
@@ -5,143 +5,39 @@
 # Copyright (c) Facebook, Inc. and its affiliates.
 # Copyright (c) Megvii, Inc. and its affiliates.
 
-import sys
-from datetime import timedelta
-from loguru import logger
+import os
 
-import torch
-import torch.distributed as dist
-import torch.multiprocessing as mp
+from yolox.utils.dist import barrier, deinit_distributed, init_distributed
 
-import yolox.utils.dist as comm
 
 __all__ = ["launch"]
 
 
-DEFAULT_TIMEOUT = timedelta(minutes=30)
-
-
-def _find_free_port():
-    """
-    Find an available port of current machine / node.
-    """
-    import socket
-
-    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    # Binding to port 0 will cause the OS to find an available port for us
-    sock.bind(("", 0))
-    port = sock.getsockname()[1]
-    sock.close()
-    # NOTE: there is still a chance the port could be taken by other processes.
-    return port
-
-
 def launch(
     main_func,
-    num_gpus_per_machine,
-    num_machines=1,
-    machine_rank=0,
-    backend="nccl",
-    dist_url=None,
-    args=(),
-    timeout=DEFAULT_TIMEOUT,
+    args=()
 ):
     """
+    Call ``main_func(*args)`` in this process, wrapping it in distributed
+    setup/teardown when the ``WORLD_SIZE`` environment variable is > 1.
+    ``WORLD_SIZE`` (default 1) and ``RANK`` (default 0) are read from the
+    environment.
+
     Args:
         main_func: a function that will be called by `main_func(*args)`
-        num_machines (int): the total number of machines
-        machine_rank (int): the rank of this machine (one per machine)
-        dist_url (str): url to connect to for distributed training, including protocol
-                       e.g. "tcp://127.0.0.1:8686".
-                       Can be set to auto to automatically select a free port on localhost
         args (tuple): arguments passed to main_func
     """
-    world_size = num_machines * num_gpus_per_machine
+    world_size = int(os.getenv("WORLD_SIZE", 1))
+    rank = int(os.getenv("RANK", 0))
     if world_size > 1:
-        # https://github.com/pytorch/pytorch/pull/14391
-        # TODO prctl in spawned processes
-
-        if dist_url == "auto":
-            assert (
-                num_machines == 1
-            ), "dist_url=auto cannot work with distributed training."
-            port = _find_free_port()
-            dist_url = f"tcp://127.0.0.1:{port}"
-
-        start_method = "spawn"
-        cache = vars(args[1]).get("cache", False)
-
-        # To use numpy memmap for caching image into RAM, we have to use fork method
-        if cache:
-            assert sys.platform != "win32", (
-                "As Windows platform doesn't support fork method, "
-                "do not add --cache in your training command."
-            )
-            start_method = "fork"
+        init_distributed(world_size=world_size, rank=rank)
 
-        mp.start_processes(
-            _distributed_worker,
-            nprocs=num_gpus_per_machine,
-            args=(
-                main_func,
-                world_size,
-                num_gpus_per_machine,
-                machine_rank,
-                backend,
-                dist_url,
-                args,
-            ),
-            daemon=False,
-            start_method=start_method,
-        )
+        try:
+            barrier()
+            main_func(*args)
+            barrier()
+        finally:
+            # Always release the distributed state, even if main_func raises.
+            deinit_distributed()
     else:
         main_func(*args)
-
-
-def _distributed_worker(
-    local_rank,
-    main_func,
-    world_size,
-    num_gpus_per_machine,
-    machine_rank,
-    backend,
-    dist_url,
-    args,
-    timeout=DEFAULT_TIMEOUT,
-):
-    assert (
-        torch.cuda.is_available()
-    ), "cuda is not available. Please check your installation."
-    global_rank = machine_rank * num_gpus_per_machine + local_rank
-    logger.info("Rank {} initialization finished.".format(global_rank))
-    try:
-        dist.init_process_group(
-            backend=backend,
-            init_method=dist_url,
-            world_size=world_size,
-            rank=global_rank,
-            timeout=timeout,
-        )
-    except Exception:
-        logger.error("Process group URL: {}".format(dist_url))
-        raise
-
-    # Setup the local process group (which contains ranks within the same machine)
-    assert comm._LOCAL_PROCESS_GROUP is None
-    num_machines = world_size // num_gpus_per_machine
-    for i in range(num_machines):
-        ranks_on_i = list(
-            range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine)
-        )
-        pg = dist.new_group(ranks_on_i)
-        if i == machine_rank:
-            comm._LOCAL_PROCESS_GROUP = pg
-
-    # synchronize is needed here to prevent a possible timeout after calling init_process_group
-    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
-    comm.synchronize()
-
-    assert num_gpus_per_machine <= torch.cuda.device_count()
-    torch.cuda.set_device(local_rank)
-
-    main_func(*args)
diff --git a/yolox/core/trainer.py b/yolox/core/trainer.py
index 8f8016e57..444ec8fd4 100644
--- a/yolox/core/trainer.py
+++ b/yolox/core/trainer.py
@@ -6,6 +6,7 @@
 import time
 from loguru import logger
 
+from yolox.utils.device_utils import get_current_device, get_current_device_type, get_xla_model
 import torch
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.utils.tensorboard import SummaryWriter
@@ -34,6 +35,8 @@
 )
 
 
+xm = get_xla_model()
+
 class Trainer:
     def __init__(self, exp: Exp, args):
         # init function only defines some basic attr, other attrs like model, optimizer are built in
@@ -44,11 +47,20 @@ def __init__(self, exp: Exp, args):
         # training related attr
         self.max_epoch = exp.max_epoch
         self.amp_training = args.fp16
-        self.scaler = torch.cuda.amp.GradScaler(enabled=args.fp16)
+        if hasattr(torch, "GradScaler"):
+            self.scaler = torch.GradScaler(get_current_device_type(), enabled=args.fp16)
+        elif xm:
+            from torch_xla.amp import GradScaler
+            self.scaler = GradScaler(enabled=args.fp16)
+        elif torch.cuda.is_available():
+            self.scaler = torch.cuda.amp.GradScaler(enabled=args.fp16)
+        else:
+            self.scaler = torch.cpu.amp.GradScaler(enabled=args.fp16)
+
         self.is_distributed = get_world_size() > 1
         self.rank = get_rank()
         self.local_rank = get_local_rank()
-        self.device = "cuda:{}".format(self.local_rank)
+        self.device = get_current_device()
         self.use_model_ema = exp.ema
         self.save_history_ckpt = exp.save_history_ckpt
 
@@ -94,25 +106,39 @@ def train_in_iter(self):
             self.after_iter()
 
     def train_one_iter(self):
+        
         iter_start_time = time.time()
-
+        logger.debug(f"iter start: {time.time()}")
         inps, targets = self.prefetcher.next()
         inps = inps.to(self.data_type)
         targets = targets.to(self.data_type)
         targets.requires_grad = False
         inps, targets = self.exp.preprocess(inps, targets, self.input_size)
         data_end_time = time.time()
-
-        with torch.cuda.amp.autocast(enabled=self.amp_training):
+        logger.debug(f"input ready: {data_end_time}")
+        
+        if xm:
+            inps = inps.to(device=self.device)
+            targets = targets.to(device=self.device)
+            logger.debug(f"input shape: {inps.shape}")
+
+        logger.debug(f"forward: {time.time()}")
+        with torch.autocast(get_current_device_type(), enabled=self.amp_training):
             outputs = self.model(inps, targets)
 
         loss = outputs["total_loss"]
-
+        if xm:
+            loss = loss.to(device=self.device)
         self.optimizer.zero_grad()
-        self.scaler.scale(loss).backward()
+        scaled_loss = self.scaler.scale(loss)
+        logger.debug(f"backward: {time.time()}")
+        scaled_loss.backward()
         self.scaler.step(self.optimizer)
         self.scaler.update()
-
+        if xm:
+            xm.mark_step()
+        logger.debug(f"optimizer step: {time.time()}")
+        
         if self.use_model_ema:
             self.ema_model.update(self.model)
 
@@ -127,13 +153,17 @@ def train_one_iter(self):
             lr=lr,
             **outputs,
         )
+        if xm:
+            xm.mark_step()
+        logger.debug(f"iter end: {time.time()}")
+
+      
 
     def before_train(self):
         logger.info("args: {}".format(self.args))
         logger.info("exp value:\n{}".format(self.exp))
 
         # model related init
-        torch.cuda.set_device(self.local_rank)
         model = self.exp.get_model()
         logger.info(
             "Model Summary: {}".format(get_model_info(model, self.exp.test_size))
@@ -166,7 +196,11 @@ def before_train(self):
             occupy_mem(self.local_rank)
 
         if self.is_distributed:
-            model = DDP(model, device_ids=[self.local_rank], broadcast_buffers=False)
+            if xm:
+                xm.mark_step()
+                model = DDP(model, broadcast_buffers=False, gradient_as_bucket_view=True)
+            else:
+                model = DDP(model, broadcast_buffers=False)
 
         if self.use_model_ema:
             self.ema_model = ModelEMA(model, 0.9998)
@@ -265,7 +299,10 @@ def after_iter(self):
                 ["{}: {:.3f}s".format(k, v.avg) for k, v in time_meter.items()]
             )
 
-            mem_str = "gpu mem: {:.0f}Mb, mem: {:.1f}Gb".format(gpu_mem_usage(), mem_usage())
+            if torch.cuda.is_available():
+                mem_str = "gpu mem: {:.0f}Mb, mem: {:.1f}Gb".format(gpu_mem_usage(), mem_usage())
+            else:
+                mem_str = "mem: {:.1f}Gb".format(mem_usage())
 
             logger.info(
                 "{}, {}, {}, {}, lr: {:.3e}".format(
@@ -299,7 +336,7 @@ def after_iter(self):
             self.meter.clear_meters()
 
         # random resizing
-        if (self.progress_in_iter + 1) % 10 == 0:
+        if (self.progress_in_iter + 1) % self.exp.random_size_interval == 0:
             self.input_size = self.exp.random_resize(
                 self.train_loader, self.epoch, self.rank, self.is_distributed
             )
@@ -381,6 +418,7 @@ def evaluate_and_save_model(self):
             logger.info("\n" + summary)
         synchronize()
 
+        logger.info(f"Save checkpoints start: {time.time()}")
         self.save_ckpt("last_epoch", update_best_ckpt, ap=ap50_95)
         if self.save_history_ckpt:
             self.save_ckpt(f"epoch_{self.epoch + 1}", ap=ap50_95)
@@ -396,6 +434,8 @@ def evaluate_and_save_model(self):
             self.mlflow_logger.save_checkpoints(self.args, self.exp, self.file_name, self.epoch,
                                                 metadata, update_best_ckpt)
 
+        logger.info(f"Save checkpoints end: {time.time()}")
+
     def save_ckpt(self, ckpt_name, update_best_ckpt=False, ap=None):
         if self.rank == 0:
             save_model = self.ema_model.ema if self.use_model_ema else self.model
diff --git a/yolox/data/data_prefetcher.py b/yolox/data/data_prefetcher.py
index a118cf4e4..c9b299a18 100644
--- a/yolox/data/data_prefetcher.py
+++ b/yolox/data/data_prefetcher.py
@@ -4,7 +4,6 @@
 
 import torch
 
-
 class DataPrefetcher:
     """
     DataPrefetcher is inspired by code of following file:
@@ -15,9 +14,11 @@ class DataPrefetcher:
 
     def __init__(self, loader):
         self.loader = iter(loader)
-        self.stream = torch.cuda.Stream()
-        self.input_cuda = self._input_cuda_for_image
-        self.record_stream = DataPrefetcher._record_stream_for_image
+        # The side stream and CUDA copy/record helpers are only set up when CUDA is available.
+        if torch.cuda.is_available():
+            self.stream = torch.cuda.Stream()
+            self.input_cuda = self._input_cuda_for_image
+            self.record_stream = DataPrefetcher._record_stream_for_image
         self.preload()
 
     def preload(self):
@@ -28,18 +29,26 @@ def preload(self):
             self.next_target = None
             return
 
-        with torch.cuda.stream(self.stream):
-            self.input_cuda()
-            self.next_target = self.next_target.cuda(non_blocking=True)
+        if torch.cuda.is_available():
+            with torch.cuda.stream(self.stream):
+                self.input_cuda()
+                self.next_target = self.next_target.cuda(non_blocking=True)
+        else:
+            self.next_input = self.next_input
+            self.next_target = self.next_target
 
     def next(self):
-        torch.cuda.current_stream().wait_stream(self.stream)
-        input = self.next_input
-        target = self.next_target
-        if input is not None:
-            self.record_stream(input)
-        if target is not None:
-            target.record_stream(torch.cuda.current_stream())
+        # Hand out the batch staged by the previous preload() call.
+        input = self.next_input
+        target = self.next_target
+        if torch.cuda.is_available():
+            # Sync with the side stream used in preload() before the tensors are consumed.
+            current = torch.cuda.current_stream()
+            current.wait_stream(self.stream)
+            if input is not None:
+                self.record_stream(input)
+            if target is not None:
+                target.record_stream(current)
         self.preload()
         return input, target
 
diff --git a/yolox/evaluators/coco_evaluator.py b/yolox/evaluators/coco_evaluator.py
index e218c7456..f76a25932 100644
--- a/yolox/evaluators/coco_evaluator.py
+++ b/yolox/evaluators/coco_evaluator.py
@@ -26,7 +26,10 @@
     time_synchronized,
     xyxy2xywh
 )
+from yolox.utils.device_utils import get_current_device, get_xla_model
+from yolox.utils.dist import _get_global_gloo_group, get_rank
 
+xm = get_xla_model()
 
 def per_class_AR_table(coco_eval, class_names=COCO_CLASSES, headers=["class", "AR"], colums=6):
     per_class_AR = {}
@@ -132,14 +135,14 @@ def evaluate(
             summary (sr): summary info of evaluation.
         """
         # TODO half to amp_test
-        tensor_type = torch.cuda.HalfTensor if half else torch.cuda.FloatTensor
+        tensor_type = torch.float16 if half else torch.float32
         model = model.eval()
         if half:
             model = model.half()
         ids = []
         data_list = []
         output_data = defaultdict()
-        progress_bar = tqdm if is_main_process() else iter
+        progress_bar = tqdm # if is_main_process() else iter
 
         inference_time = 0
         nms_time = 0
@@ -155,11 +158,12 @@ def evaluate(
             model(x)
             model = model_trt
 
+        model = model.to(device=get_current_device())
         for cur_iter, (imgs, _, info_imgs, ids) in enumerate(
             progress_bar(self.dataloader)
         ):
             with torch.no_grad():
-                imgs = imgs.type(tensor_type)
+                imgs = imgs.type(tensor_type).to(device=get_current_device())
 
                 # skip the last iters since batchsize might be not enough for batch inference
                 is_time_record = cur_iter < len(self.dataloader) - 1
@@ -185,17 +189,23 @@ def evaluate(
                 outputs, info_imgs, ids, return_outputs=True)
             data_list.extend(data_list_elem)
             output_data.update(image_wise_data)
-
-        statistics = torch.cuda.FloatTensor([inference_time, nms_time, n_samples])
+  
+        statistics = torch.tensor([inference_time, nms_time, n_samples], 
+                                  dtype=torch.float32, 
+                                  device=get_current_device())
         if distributed:
             # different process/device might have different speed,
             # to make sure the process will not be stucked, sync func is used here.
-            synchronize()
-            data_list = gather(data_list, dst=0)
-            output_data = gather(output_data, dst=0)
+            group = _get_global_gloo_group()
+            synchronize(group=group)
+            data_list = gather(data_list, dst=0, group=group)
+            output_data = gather(output_data, dst=0, group=group)
             data_list = list(itertools.chain(*data_list))
             output_data = dict(ChainMap(*output_data))
-            torch.distributed.reduce(statistics, dst=0)
+            if xm:
+                torch.distributed.all_reduce(statistics)
+            else:
+                torch.distributed.reduce(statistics, dst=0)
 
         eval_results = self.evaluate_prediction(data_list, statistics)
         synchronize()
@@ -256,7 +266,10 @@ def evaluate_prediction(self, data_dict, statistics):
         if not is_main_process():
             return 0, 0, None
 
-        logger.info("Evaluate in main process...")
+        if xm:
+            xm.mark_step()
+
+        logger.info(f"Evaluate in main process: data_dict length: {len(data_dict)}, statistics: {statistics}")
 
         annType = ["segm", "bbox", "keypoints"]
 
@@ -278,7 +291,7 @@ def evaluate_prediction(self, data_dict, statistics):
         )
 
         info = time_info + "\n"
-
+        logger.info(f"time_info: {info} {time.time()}")
         # Evaluate the Dt (detection) json comparing with the ground truth
         if len(data_dict) > 0:
             cocoGt = self.dataloader.dataset.coco
@@ -289,6 +302,7 @@ def evaluate_prediction(self, data_dict, statistics):
             else:
                 _, tmp = tempfile.mkstemp()
                 json.dump(data_dict, open(tmp, "w"))
+                logger.info(f"load eval data: {tmp} {time.time()}")
                 cocoDt = cocoGt.loadRes(tmp)
             try:
                 from yolox.layers import COCOeval_opt as COCOeval
@@ -298,10 +312,13 @@ def evaluate_prediction(self, data_dict, statistics):
                 logger.warning("Use standard COCOeval.")
 
             cocoEval = COCOeval(cocoGt, cocoDt, annType[1])
+            logger.info(f"evaluate: {time.time()}")
             cocoEval.evaluate()
+            logger.info(f"accumulate: {time.time()}")
             cocoEval.accumulate()
             redirect_string = io.StringIO()
             with contextlib.redirect_stdout(redirect_string):
+                logger.info(f"summarize: {time.time()}")
                 cocoEval.summarize()
             info += redirect_string.getvalue()
             cat_ids = list(cocoGt.cats.keys())
@@ -312,6 +329,7 @@ def evaluate_prediction(self, data_dict, statistics):
             if self.per_class_AR:
                 AR_table = per_class_AR_table(cocoEval, class_names=cat_names)
                 info += "per class AR:\n" + AR_table + "\n"
+            logger.info(f"info completed: {time.time()}")
             return cocoEval.stats[0], cocoEval.stats[1], info
         else:
             return 0, 0, info
diff --git a/yolox/evaluators/voc_evaluator.py b/yolox/evaluators/voc_evaluator.py
index 094df3d69..619f35c5b 100644
--- a/yolox/evaluators/voc_evaluator.py
+++ b/yolox/evaluators/voc_evaluator.py
@@ -14,7 +14,9 @@
 import torch
 
 from yolox.utils import gather, is_main_process, postprocess, synchronize, time_synchronized
+from yolox.utils.device_utils import get_current_device, get_xla_model
 
+xm = get_xla_model()
 
 class VOCEvaluator:
     """
@@ -57,7 +59,7 @@ def evaluate(
             summary (sr): summary info of evaluation.
         """
         # TODO half to amp_test
-        tensor_type = torch.cuda.HalfTensor if half else torch.cuda.FloatTensor
+        tensor_type = torch.float16 if half else torch.float32
         model = model.eval()
         if half:
             model = model.half()
@@ -81,7 +83,7 @@ def evaluate(
 
         for cur_iter, (imgs, _, info_imgs, ids) in enumerate(progress_bar(self.dataloader)):
             with torch.no_grad():
-                imgs = imgs.type(tensor_type)
+                imgs = imgs.type(tensor_type).to(device=get_current_device())
 
                 # skip the last iters since batchsize might be not enough for batch inference
                 is_time_record = cur_iter < len(self.dataloader) - 1
@@ -105,11 +107,16 @@ def evaluate(
 
             data_list.update(self.convert_to_voc_format(outputs, info_imgs, ids))
 
-        statistics = torch.cuda.FloatTensor([inference_time, nms_time, n_samples])
+        statistics = torch.tensor([inference_time, nms_time, n_samples], 
+                                  dtype=torch.float32, 
+                                  device=get_current_device())
         if distributed:
             data_list = gather(data_list, dst=0)
             data_list = ChainMap(*data_list)
-            torch.distributed.reduce(statistics, dst=0)
+            if xm:
+                torch.distributed.all_reduce(statistics)
+            else:
+                torch.distributed.reduce(statistics, dst=0)
 
         eval_results = self.evaluate_prediction(data_list, statistics)
         synchronize()
diff --git a/yolox/exp/base_exp.py b/yolox/exp/base_exp.py
index 7ccfec5c2..85157750d 100644
--- a/yolox/exp/base_exp.py
+++ b/yolox/exp/base_exp.py
@@ -2,6 +2,7 @@
 # Copyright (c) Megvii Inc. All rights reserved.
 
 import ast
+import os
 import pprint
 from abc import ABCMeta, abstractmethod
 from typing import Dict, List, Tuple
@@ -17,8 +18,8 @@ class BaseExp(metaclass=ABCMeta):
     """Basic class for any experiment."""
 
     def __init__(self):
-        self.seed = None
-        self.output_dir = "./YOLOX_outputs"
+        self.seed = 2024
+        self.output_dir = os.getenv("YOLOX_OUTPUT_DIR", "./YOLOX_outputs")
         self.print_interval = 100
         self.eval_interval = 10
         self.dataset = None
diff --git a/yolox/exp/yolox_base.py b/yolox/exp/yolox_base.py
index 82e93c21b..e9707979f 100644
--- a/yolox/exp/yolox_base.py
+++ b/yolox/exp/yolox_base.py
@@ -4,14 +4,18 @@
 import os
 import random
 
+from yolox.utils.device_utils import get_current_device, get_xla_model
 import torch
 import torch.distributed as dist
 import torch.nn as nn
 
+from yolox.utils.dist import barrier, synchronize
+
 from .base_exp import BaseExp
 
 __all__ = ["Exp", "check_exp_value"]
 
+xm = get_xla_model()
 
 class Exp(BaseExp):
     def __init__(self):
@@ -108,6 +112,8 @@ def __init__(self):
         # nms threshold
         self.nmsthre = 0.65
 
+        self.random_size_interval = 10
+
     def get_model(self):
         from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead
 
@@ -222,7 +228,9 @@ def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img: s
         return train_loader
 
     def random_resize(self, data_loader, epoch, rank, is_distributed):
-        tensor = torch.LongTensor(2).cuda()
+
+        device = get_current_device()
+        tensor = torch.LongTensor(2).to(device=device)
 
         if rank == 0:
             size_factor = self.input_size[1] * 1.0 / self.input_size[0]
@@ -236,7 +244,7 @@ def random_resize(self, data_loader, epoch, rank, is_distributed):
             tensor[1] = size[1]
 
         if is_distributed:
-            dist.barrier()
+            barrier()
             dist.broadcast(tensor, 0)
 
         input_size = (tensor[0].item(), tensor[1].item())
diff --git a/yolox/layers/__init__.py b/yolox/layers/__init__.py
index fc9cf5138..6c51e080d 100644
--- a/yolox/layers/__init__.py
+++ b/yolox/layers/__init__.py
@@ -5,7 +5,10 @@
 # import torch first to make jit op work without `ImportError of libc10.so`
 import torch  # noqa
 
-from .jit_ops import FastCOCOEvalOp, JitOp
+try:
+    from .jit_ops import FastCOCOEvalOp, JitOp
+except ImportError:
+    pass
 
 try:
     from .fast_coco_eval_api import COCOeval_opt
diff --git a/yolox/layers/fast_coco_eval_api.py b/yolox/layers/fast_coco_eval_api.py
index 5f3aeb551..a6b9f6b77 100644
--- a/yolox/layers/fast_coco_eval_api.py
+++ b/yolox/layers/fast_coco_eval_api.py
@@ -11,6 +11,12 @@
 import numpy as np
 from pycocotools.cocoeval import COCOeval
 
+import torch
+
+# Explicit check instead of `assert` (asserts are stripped under `python -O`).
+if not torch.cuda.is_available():
+    raise ImportError("yolox.layers.fast_coco_eval_api requires CUDA to be available")
+
 from .jit_ops import FastCOCOEvalOp
 
 
diff --git a/yolox/layers/jit_ops.py b/yolox/layers/jit_ops.py
index 0fdac4de2..0165640fc 100644
--- a/yolox/layers/jit_ops.py
+++ b/yolox/layers/jit_ops.py
@@ -10,6 +10,11 @@
 
 __all__ = ["JitOp", "FastCOCOEvalOp"]
 
+import torch
+
+# Explicit check instead of `assert` (asserts are stripped under `python -O`).
+if not torch.cuda.is_available():
+    raise ImportError("yolox.layers.jit_ops requires CUDA to be available")
 
 class JitOp:
     """
diff --git a/yolox/models/build.py b/yolox/models/build.py
index 8edc87de9..5344b4bb0 100644
--- a/yolox/models/build.py
+++ b/yolox/models/build.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding:utf-8 -*-
 
+from yolox.utils.device_utils import get_current_device
 import torch
 from torch import nn
 from torch.hub import load_state_dict_from_url
@@ -50,8 +51,7 @@ def create_yolox_model(name: str, pretrained: bool = True, num_classes: int = 80
     from yolox.exp import get_exp, Exp
 
     if device is None:
-        device = "cuda:0" if torch.cuda.is_available() else "cpu"
-    device = torch.device(device)
+        device = get_current_device()
 
     assert name in _CKPT_FULL_PATH or name == "yolox_custom", \
         f"user should use one of value in {_CKPT_FULL_PATH.keys()} or \"yolox_custom\""
@@ -75,7 +75,7 @@ def create_yolox_model(name: str, pretrained: bool = True, num_classes: int = 80
                 ckpt = ckpt["model"]
             yolox_model.load_state_dict(ckpt)
 
-    yolox_model.to(device)
+    yolox_model.to(device=device)
     return yolox_model
 
 
diff --git a/yolox/models/darknet.py b/yolox/models/darknet.py
index b3e053f16..0c1bfc634 100644
--- a/yolox/models/darknet.py
+++ b/yolox/models/darknet.py
@@ -4,8 +4,11 @@
 
 from torch import nn
 
+from yolox.utils.device_utils import get_xla_model
+
 from .network_blocks import BaseConv, CSPLayer, DWConv, Focus, ResLayer, SPPBottleneck
 
+xm = get_xla_model()
 
 class Darknet(nn.Module):
     # number of blocks from dark2 to dark5.
@@ -80,6 +83,10 @@ def make_spp_block(self, filters_list, in_filters):
         return m
 
     def forward(self, x):
+
+        if xm:
+            xm.mark_step()
+            
         outputs = {}
         x = self.stem(x)
         outputs["stem"] = x
@@ -91,6 +98,10 @@ def forward(self, x):
         outputs["dark4"] = x
         x = self.dark5(x)
         outputs["dark5"] = x
+
+        if xm:
+            xm.mark_step()
+
         return {k: v for k, v in outputs.items() if k in self.out_features}
 
 
@@ -165,6 +176,9 @@ def __init__(
         )
 
     def forward(self, x):
+        if xm:
+            xm.mark_step()
+
         outputs = {}
         x = self.stem(x)
         outputs["stem"] = x
@@ -176,4 +190,8 @@ def forward(self, x):
         outputs["dark4"] = x
         x = self.dark5(x)
         outputs["dark5"] = x
+
+        if xm:
+            xm.mark_step()
+
         return {k: v for k, v in outputs.items() if k in self.out_features}
diff --git a/yolox/models/losses.py b/yolox/models/losses.py
index 77b4d8ef7..7c414c76a 100644
--- a/yolox/models/losses.py
+++ b/yolox/models/losses.py
@@ -2,9 +2,14 @@
 # -*- encoding: utf-8 -*-
 # Copyright (c) Megvii Inc. All rights reserved.
 
+import time
+from loguru import logger
 import torch
 import torch.nn as nn
 
+from yolox.utils.device_utils import get_xla_model, parse_dtype
+
+xm = get_xla_model()
 
 class IOUloss(nn.Module):
     def __init__(self, reduction="none", loss_type="iou"):
@@ -12,8 +17,8 @@ def __init__(self, reduction="none", loss_type="iou"):
         self.reduction = reduction
         self.loss_type = loss_type
 
-    def forward(self, pred, target):
-        assert pred.shape[0] == target.shape[0]
+    def forward(self, pred, target):      
+        assert pred.shape[0] == target.shape[0], f"pred shape: {pred.shape} target shape: {target.shape}"
 
         pred = pred.view(-1, 4)
         target = target.view(-1, 4)
@@ -27,7 +32,8 @@ def forward(self, pred, target):
         area_p = torch.prod(pred[:, 2:], 1)
         area_g = torch.prod(target[:, 2:], 1)
 
-        en = (tl < br).type(tl.type()).prod(dim=1)
+        device, dtype = parse_dtype(tl.type())
+        en = (tl < br).to(device=device, dtype=dtype).prod(dim=1)
         area_i = torch.prod(br - tl, 1) * en
         area_u = area_p + area_g - area_i
         iou = (area_i) / (area_u + 1e-16)
diff --git a/yolox/models/yolo_fpn.py b/yolox/models/yolo_fpn.py
index 224271f59..88f06ec84 100644
--- a/yolox/models/yolo_fpn.py
+++ b/yolox/models/yolo_fpn.py
@@ -5,10 +5,14 @@
 import torch
 import torch.nn as nn
 
+from yolox.utils.device_utils import get_xla_model
+
 from .darknet import Darknet
 from .network_blocks import BaseConv
 
 
+xm = get_xla_model()
+
 class YOLOFPN(nn.Module):
     """
     YOLOFPN module. Darknet 53 is the default backbone of this model.
@@ -65,6 +69,9 @@ def forward(self, inputs):
             Tuple[Tensor]: FPN output features..
         """
         #  backbone
+        if xm:
+            xm.mark_step()
+
         out_features = self.backbone(inputs)
         x2, x1, x0 = [out_features[f] for f in self.in_features]
 
@@ -81,4 +88,8 @@ def forward(self, inputs):
         out_dark3 = self.out2(x2_in)
 
         outputs = (out_dark3, out_dark4, x0)
+
+        if xm:
+            xm.mark_step()
+
         return outputs
diff --git a/yolox/models/yolo_head.py b/yolox/models/yolo_head.py
index 3e51768ee..d218266ab 100644
--- a/yolox/models/yolo_head.py
+++ b/yolox/models/yolo_head.py
@@ -3,6 +3,7 @@
 # Copyright (c) Megvii Inc. All rights reserved.
 
 import math
+import time
 from loguru import logger
 
 import torch
@@ -10,10 +11,12 @@
 import torch.nn.functional as F
 
 from yolox.utils import bboxes_iou, cxcywh2xyxy, meshgrid, visualize_assign
+from yolox.utils.device_utils import get_current_device, get_current_device_type, get_xla_model, parse_dtype
 
 from .losses import IOUloss
 from .network_blocks import BaseConv, DWConv
 
+xm = get_xla_model()
 
 class YOLOXHead(nn.Module):
     def __init__(
@@ -152,7 +155,7 @@ def forward(self, xin, labels=None, imgs=None):
             x = self.stems[k](x)
             cls_x = x
             reg_x = x
-
+            
             cls_feat = cls_conv(cls_x)
             cls_output = self.cls_preds[k](cls_feat)
 
@@ -187,12 +190,13 @@ def forward(self, xin, labels=None, imgs=None):
                 output = torch.cat(
                     [reg_output, obj_output.sigmoid(), cls_output.sigmoid()], 1
                 )
+            if xm:
+                xm.mark_step()
 
             outputs.append(output)
-
+            
         if self.training:
             return self.get_losses(
-                imgs,
                 x_shifts,
                 y_shifts,
                 expanded_strides,
@@ -213,6 +217,7 @@ def forward(self, xin, labels=None, imgs=None):
                 return outputs
 
     def get_output_and_grid(self, output, k, stride, dtype):
+        device, dtype = parse_dtype(dtype)
         grid = self.grids[k]
 
         batch_size = output.shape[0]
@@ -220,7 +225,7 @@ def get_output_and_grid(self, output, k, stride, dtype):
         hsize, wsize = output.shape[-2:]
         if grid.shape[2:4] != output.shape[2:4]:
             yv, xv = meshgrid([torch.arange(hsize), torch.arange(wsize)])
-            grid = torch.stack((xv, yv), 2).view(1, 1, hsize, wsize, 2).type(dtype)
+            grid = torch.stack((xv, yv), 2).view(1, 1, hsize, wsize, 2).to(device=device, dtype=dtype)
             self.grids[k] = grid
 
         output = output.view(batch_size, 1, n_ch, hsize, wsize)
@@ -233,6 +238,8 @@ def get_output_and_grid(self, output, k, stride, dtype):
         return output, grid
 
     def decode_outputs(self, outputs, dtype):
+        device, dtype = parse_dtype(dtype)
+
         grids = []
         strides = []
         for (hsize, wsize), stride in zip(self.hw, self.strides):
@@ -242,9 +249,9 @@ def decode_outputs(self, outputs, dtype):
             shape = grid.shape[:2]
             strides.append(torch.full((*shape, 1), stride))
 
-        grids = torch.cat(grids, dim=1).type(dtype)
-        strides = torch.cat(strides, dim=1).type(dtype)
-
+        grids = torch.cat(grids, dim=1).to(device=device, dtype=dtype)
+        strides = torch.cat(strides, dim=1).to(device=device, dtype=dtype)
+        
         outputs = torch.cat([
             (outputs[..., 0:2] + grids) * strides,
             torch.exp(outputs[..., 2:4]) * strides,
@@ -254,7 +261,6 @@ def decode_outputs(self, outputs, dtype):
 
     def get_losses(
         self,
-        imgs,
         x_shifts,
         y_shifts,
         expanded_strides,
@@ -263,6 +269,16 @@ def get_losses(
         origin_preds,
         dtype,
     ):
+        if xm:
+            xm.mark_step()
+            outputs = outputs.cpu()
+            labels = labels.cpu()
+            x_shifts = [ t.cpu() for t in x_shifts]
+            y_shifts = [ t.cpu() for t in y_shifts]
+            expanded_strides = [ t.cpu() for t in expanded_strides]
+            if self.use_l1:
+                origin_preds = [ t.cpu() for t in origin_preds]
+
         bbox_preds = outputs[:, :, :4]  # [batch, n_anchors_all, 4]
         obj_preds = outputs[:, :, 4:5]  # [batch, n_anchors_all, 1]
         cls_preds = outputs[:, :, 5:]  # [batch, n_anchors_all, n_cls]
@@ -285,7 +301,7 @@ def get_losses(
 
         num_fg = 0.0
         num_gts = 0.0
-
+        
         for batch_idx in range(outputs.shape[0]):
             num_gt = int(nlabel[batch_idx])
             num_gts += num_gt
@@ -299,7 +315,7 @@ def get_losses(
                 gt_bboxes_per_image = labels[batch_idx, :num_gt, 1:5]
                 gt_classes = labels[batch_idx, :num_gt, 0]
                 bboxes_preds_per_image = bbox_preds[batch_idx]
-
+                
                 try:
                     (
                         gt_matched_classes,
@@ -318,10 +334,10 @@ def get_losses(
                         y_shifts,
                         cls_preds,
                         obj_preds,
+                        mode=get_current_device_type()
                     )
-                except RuntimeError as e:
-                    # TODO: the string might change, consider a better way
-                    if "CUDA out of memory. " not in str(e):
+                except RuntimeError as e: 
+                    if xm or "CUDA out of memory. " not in str(e):
                         raise  # RuntimeError might not caused by CUDA OOM
 
                     logger.error(
@@ -329,7 +345,10 @@ def get_losses(
                            CPU mode is applied in this batch. If you want to avoid this issue, \
                            try to reduce the batch size or image size."
                     )
-                    torch.cuda.empty_cache()
+
+                    if torch.cuda.is_available():
+                        torch.cuda.empty_cache()
+
                     (
                         gt_matched_classes,
                         fg_mask,
@@ -350,7 +369,8 @@ def get_losses(
                         "cpu",
                     )
 
-                torch.cuda.empty_cache()
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
                 num_fg += num_fg_img
 
                 cls_target = F.one_hot(
@@ -432,11 +452,10 @@ def get_assignments(
         y_shifts,
         cls_preds,
         obj_preds,
-        mode="gpu",
-    ):
-
-        if mode == "cpu":
-            print("-----------Using CPU for the Current Batch-------------")
+        mode=None,
+    ):  
+    
+        if mode == "cpu" or mode =="xla":
             gt_bboxes_per_image = gt_bboxes_per_image.cpu().float()
             bboxes_preds_per_image = bboxes_preds_per_image.cpu().float()
             gt_classes = gt_classes.cpu().float()
@@ -456,10 +475,10 @@ def get_assignments(
         obj_preds_ = obj_preds[batch_idx][fg_mask]
         num_in_boxes_anchor = bboxes_preds_per_image.shape[0]
 
-        if mode == "cpu":
+        if mode == "cpu" or mode =="xla":
             gt_bboxes_per_image = gt_bboxes_per_image.cpu()
             bboxes_preds_per_image = bboxes_preds_per_image.cpu()
-
+            
         pair_wise_ious = bboxes_iou(gt_bboxes_per_image, bboxes_preds_per_image, False)
 
         gt_cls_per_image = (
@@ -468,10 +487,10 @@ def get_assignments(
         )
         pair_wise_ious_loss = -torch.log(pair_wise_ious + 1e-8)
 
-        if mode == "cpu":
+        if mode == "cpu" or mode =="xla":
             cls_preds_, obj_preds_ = cls_preds_.cpu(), obj_preds_.cpu()
 
-        with torch.cuda.amp.autocast(enabled=False):
+        with torch.autocast(get_current_device_type(), enabled=False):
             cls_preds_ = (
                 cls_preds_.float().sigmoid_() * obj_preds_.float().sigmoid_()
             ).sqrt()
@@ -496,11 +515,13 @@ def get_assignments(
         ) = self.simota_matching(cost, pair_wise_ious, gt_classes, num_gt, fg_mask)
         del pair_wise_cls_loss, cost, pair_wise_ious, pair_wise_ious_loss
 
+        # do not move back for mode == "xla"
         if mode == "cpu":
-            gt_matched_classes = gt_matched_classes.cuda()
-            fg_mask = fg_mask.cuda()
-            pred_ious_this_matching = pred_ious_this_matching.cuda()
-            matched_gt_inds = matched_gt_inds.cuda()
+            device = get_current_device()
+            gt_matched_classes = gt_matched_classes.to(device=device)
+            fg_mask = fg_mask.to(device=device)
+            pred_ious_this_matching = pred_ious_this_matching.to(device=device)
+            matched_gt_inds = matched_gt_inds.to(device=device)
 
         return (
             gt_matched_classes,
@@ -542,7 +563,7 @@ def get_geometry_constraint(
         return anchor_filter, geometry_relation
 
     def simota_matching(self, cost, pair_wise_ious, gt_classes, num_gt, fg_mask):
-        matching_matrix = torch.zeros_like(cost, dtype=torch.uint8)
+        matching_matrix = torch.zeros_like(cost, dtype=torch.uint8).to(device=pair_wise_ious.device)
 
         n_candidate_k = min(10, pair_wise_ious.size(1))
         topk_ious, _ = torch.topk(pair_wise_ious, n_candidate_k, dim=1)
@@ -556,7 +577,7 @@ def simota_matching(self, cost, pair_wise_ious, gt_classes, num_gt, fg_mask):
         del topk_ious, dynamic_ks, pos_idx
 
         anchor_matching_gt = matching_matrix.sum(0)
-        # deal with the case that one anchor matches multiple ground-truths
+        # deal with the case that one anchor matches multiple ground-truths  
         if anchor_matching_gt.max() > 1:
             multiple_match_mask = anchor_matching_gt > 1
             _, cost_argmin = torch.min(cost[:, multiple_match_mask], dim=0)
diff --git a/yolox/models/yolo_pafpn.py b/yolox/models/yolo_pafpn.py
index 4c4e18a5c..99f5985f1 100644
--- a/yolox/models/yolo_pafpn.py
+++ b/yolox/models/yolo_pafpn.py
@@ -5,9 +5,12 @@
 import torch
 import torch.nn as nn
 
+from yolox.utils.device_utils import get_xla_model
+
 from .darknet import CSPDarknet
 from .network_blocks import BaseConv, CSPLayer, DWConv
 
+xm = get_xla_model()
 
 class YOLOPAFPN(nn.Module):
     """
@@ -90,6 +93,9 @@ def forward(self, input):
         """
 
         #  backbone
+        if xm:
+            xm.mark_step()
+
         out_features = self.backbone(input)
         features = [out_features[f] for f in self.in_features]
         [x2, x1, x0] = features
@@ -113,4 +119,8 @@ def forward(self, input):
         pan_out0 = self.C3_n4(p_out0)  # 1024->1024/32
 
         outputs = (pan_out2, pan_out1, pan_out0)
+
+        if xm:
+            xm.mark_step()
+
         return outputs
diff --git a/yolox/models/yolox.py b/yolox/models/yolox.py
index 744ceea81..42bc0a034 100644
--- a/yolox/models/yolox.py
+++ b/yolox/models/yolox.py
@@ -4,9 +4,12 @@
 
 import torch.nn as nn
 
+from yolox.utils.device_utils import get_xla_model
+
 from .yolo_head import YOLOXHead
 from .yolo_pafpn import YOLOPAFPN
 
+xm = get_xla_model()
 
 class YOLOX(nn.Module):
     """
@@ -27,6 +30,10 @@ def __init__(self, backbone=None, head=None):
 
     def forward(self, x, targets=None):
         # fpn output content features of [dark3, dark4, dark5]
+
+        if xm:
+            xm.mark_step()
+
         fpn_outs = self.backbone(x)
 
         if self.training:
@@ -45,6 +52,9 @@ def forward(self, x, targets=None):
         else:
             outputs = self.head(fpn_outs)
 
+        if xm:
+            xm.mark_step()
+
         return outputs
 
     def visualize(self, x, targets, save_prefix="assign_vis_"):
diff --git a/yolox/utils/allreduce_norm.py b/yolox/utils/allreduce_norm.py
index 142c76c78..4adf6dc65 100644
--- a/yolox/utils/allreduce_norm.py
+++ b/yolox/utils/allreduce_norm.py
@@ -5,11 +5,12 @@
 import pickle
 from collections import OrderedDict
 
+from yolox.utils.device_utils import get_current_device
 import torch
 from torch import distributed as dist
 from torch import nn
 
-from .dist import _get_global_gloo_group, get_world_size
+from yolox.utils.dist import _get_global_gloo_group, get_world_size
 
 ASYNC_NORM = (
     nn.BatchNorm1d,
@@ -38,7 +39,8 @@ def get_async_norm_states(module):
     return async_norm_states
 
 
-def pyobj2tensor(pyobj, device="cuda"):
+def pyobj2tensor(pyobj, device=None):
     """serialize picklable python object to tensor"""
+    device = get_current_device() if device is None else device  # default: this process's device
     storage = torch.ByteStorage.from_buffer(pickle.dumps(pyobj))
     return torch.ByteTensor(storage).to(device=device)
@@ -83,7 +85,7 @@ def all_reduce(py_dict, op="sum", group=None):
     tensor_numels = [py_dict[k].numel() for k in py_key]
 
     flatten_tensor = torch.cat([py_dict[k].flatten() for k in py_key])
-    dist.all_reduce(flatten_tensor, op=_get_reduce_op(op))
+    dist.all_reduce(flatten_tensor, op=_get_reduce_op(op), group=group)
     if op == "mean":
         flatten_tensor /= world_size
 
diff --git a/yolox/utils/boxes.py b/yolox/utils/boxes.py
index f71e8d90b..07b27cf8e 100644
--- a/yolox/utils/boxes.py
+++ b/yolox/utils/boxes.py
@@ -6,6 +6,8 @@
 import torch
 import torchvision
 
+from yolox.utils.device_utils import parse_dtype
+
 __all__ = [
     "filter_box",
     "postprocess",
@@ -97,7 +99,8 @@ def bboxes_iou(bboxes_a, bboxes_b, xyxy=True):
 
         area_a = torch.prod(bboxes_a[:, 2:], 1)
         area_b = torch.prod(bboxes_b[:, 2:], 1)
-    en = (tl < br).type(tl.type()).prod(dim=2)
+    # The mask is already on tl's device, so only its dtype has to follow tl.
+    en = (tl < br).to(dtype=tl.dtype).prod(dim=2)
     area_i = torch.prod(br - tl, 2) * en  # * ((tl < br).all())
     return area_i / (area_a[:, None] + area_b - area_i)
 
diff --git a/yolox/utils/device_utils.py b/yolox/utils/device_utils.py
new file mode 100644
index 000000000..ab419d3e2
--- /dev/null
+++ b/yolox/utils/device_utils.py
@@ -0,0 +1,156 @@
+import os
+import random
+import warnings
+
+from typing import Union
+import torch
+
+try:
+    import torch_xla.core.xla_model as xm
+    import torch_xla.runtime as xr
+    import torch_xla.distributed.xla_backend as xb
+    # Import-time side effect: create the cache directory and hand it to the XLA runtime.
+    compiler_cache_path = os.getenv("XLA_CACHE_DIR", "./cache")
+    os.makedirs(compiler_cache_path, exist_ok=True)
+    try:
+        xr.initialize_cache(compiler_cache_path, readonly=False)
+    except AttributeError as e:  # presumably a runtime module without initialize_cache
+        warnings.warn(f"can not set XLA cache dir: {e}")
+    # torch_xla is optional: on ImportError the three names below are set to None.
+except ImportError:
+    xm = None
+    xr = None
+    xb = None
+
+def get_xla_model():  # torch_xla.core.xla_model, or None when torch_xla could not be imported
+    return xm
+
+
+def get_xla_runtime():  # torch_xla.runtime, or None when torch_xla could not be imported
+    return xr
+
+
+def get_current_device() -> torch.device:
+    """Return this process's device, resolving and caching it on first call."""
+    global __current_device
+    try:
+        return __current_device
+    except NameError:
+        pass  # nothing cached yet: resolve below
+
+    if xm is not None:
+        __current_device = xm.xla_device()
+    elif torch.cuda.is_available():
+        # The CUDA index comes from the LOCAL_RANK environment variable (0 if unset).
+        __current_device = torch.device(f'cuda:{int(os.getenv("LOCAL_RANK", 0))}')
+        torch.cuda.set_device(__current_device)
+    else:
+        __current_device = torch.device(os.getenv("DEFAULT_DEVICE", "cpu"))
+    return __current_device
+
+
+def get_current_device_type() -> str:
+    """Return the cached device-type string ("xla", "cuda" or the env fallback)."""
+    global __current_device_type
+    try:
+        return __current_device_type
+    except NameError:
+        pass  # nothing cached yet: resolve below
+    if xm is not None:
+        __current_device_type = "xla"
+    elif torch.cuda.is_available():
+        __current_device_type = "cuda"
+    else:
+        __current_device_type = os.getenv("DEFAULT_DEVICE_TYPE", "cpu")
+    return __current_device_type
+
+
+def get_local_device_count() -> int:
+    """Return the accelerator device count, or 1 when no accelerator is found."""
+    if xr is not None:
+        # NOTE(review): this is the *global* XLA device count; confirm it
+        # matches the per-host count for multi-node runs.
+        return xr.global_device_count()
+    if torch.cuda.is_available():
+        return torch.cuda.device_count()
+    return 1
+
+
+def get_distributed_backend(backend=None) -> str:
+    """Choose the torch.distributed backend name for this process.
+
+    XLA always wins; otherwise an explicit ``backend`` is kept as given.
+    """
+    if xm is not None:
+        return "xla"
+    fallback = "nccl" if torch.cuda.is_available() else "gloo"
+    return fallback if backend is None else backend
+
+
+def get_distributed_init_method() -> str:
+    """Return the ``init_method`` URL used for process-group initialisation.
+
+    ``xla://`` when torch_xla was imported, ``env://`` otherwise.
+    """
+    use_xla = xm is not None
+    return 'xla://' if use_xla else "env://"
+
+
+def get_current_rng_state() -> Union[torch.Tensor, int]:
+    """Return the RNG state of the active backend (XLA first, as in get_current_device)."""
+    if xm is not None:
+        rng_state = xm.get_rng_state(device=get_current_device())
+    elif torch.cuda.is_available():
+        rng_state = torch.cuda.get_rng_state(device=get_current_device())
+    else:
+        rng_state = torch.get_rng_state()
+    return rng_state
+
+
+def set_manual_seed(seed: int):
+    """Seed the Python RNG, torch (CPU and all CUDA devices) and, if present, XLA."""
+    random.seed(seed)
+    # torch.manual_seed covers the CPU generator and every CUDA device;
+    # torch.cuda.manual_seed alone would leave the CPU generator unseeded.
+    torch.manual_seed(seed)
+    if xm is not None:
+        xm.set_rng_state(seed, device=get_current_device())
+
+
+def set_current_rng_state(new_state):
+    """Restore a state from get_current_rng_state(), using the same backend order."""
+    if xm is not None:
+        xm.set_rng_state(int(new_state), device=get_current_device())
+    elif torch.cuda.is_available():
+        new_state = new_state.type(torch.ByteTensor)
+        torch.cuda.set_rng_state(new_state, device=get_current_device())
+    else:
+        new_state = new_state.type(torch.ByteTensor)
+        torch.set_rng_state(new_state)
+
+if xb:  # only when torch_xla's distributed backend was imported
+    def make_send_channel_id_impl(self, dst_rank, tag):  # tag is ignored
+        return int(dst_rank)*2
+
+    def make_recv_channel_id_impl(self, src_rank, tag):  # tag is ignored
+        return int(src_rank)*3
+    # NOTE(review): monkey-patches ProcessGroupXla; the *2 / *3 channel-id scheme is not explained here - confirm intent.
+    xb.ProcessGroupXla.make_send_channel_id = make_send_channel_id_impl
+    xb.ProcessGroupXla.make_recv_channel_id = make_recv_channel_id_impl
+
+def parse_dtype(dtype: str):
+    """Split a ``Tensor.type()`` string into ``(torch.device, torch.dtype)``.
+
+    Raises ValueError for an unsupported namespace or tensor class.
+    """
+    namespace, _, tensor_cls = dtype.rpartition(".")
+    supported = {
+        "FloatTensor": torch.float32,
+        "HalfTensor": torch.float16,
+        "BFloat16Tensor": torch.bfloat16,
+    }
+    if namespace not in ("torch", "torch.cuda", "torch.xla") or tensor_cls not in supported:
+        # Explicit raise: assert-based validation disappears under ``python -O``.
+        raise ValueError(f"unsupported tensor type string: {dtype!r}")
+    device = torch.device("cpu") if namespace == "torch" else get_current_device()
+    return device, supported[tensor_cls]
diff --git a/yolox/utils/dist.py b/yolox/utils/dist.py
index 9e8fea933..4160ec5a0 100644
--- a/yolox/utils/dist.py
+++ b/yolox/utils/dist.py
@@ -20,9 +20,9 @@
 
 import torch
 from torch import distributed as dist
+from yolox.utils.device_utils import get_current_device, get_distributed_backend, get_distributed_init_method, get_local_device_count, get_xla_model, xm
 
 __all__ = [
-    "get_num_devices",
     "wait_for_the_master",
     "is_main_process",
     "synchronize",
@@ -35,18 +35,8 @@
     "all_gather",
 ]
 
-_LOCAL_PROCESS_GROUP = None
-
-
-def get_num_devices():
-    gpu_list = os.getenv('CUDA_VISIBLE_DEVICES', None)
-    if gpu_list is not None:
-        return len(gpu_list.split(','))
-    else:
-        devices_list_info = os.popen("nvidia-smi -L")
-        devices_list_info = devices_list_info.read().strip().split("\n")
-        return len(devices_list_info)
-
+__DEFAULT_GLOO_GROUP = None
+xm = get_xla_model()
 
 @contextmanager
 def wait_for_the_master(local_rank: int = None):
@@ -61,7 +51,7 @@ def wait_for_the_master(local_rank: int = None):
         local_rank = get_local_rank()
 
     if local_rank > 0:
-        dist.barrier()
+        barrier()
     yield
     if local_rank == 0:
         if not dist.is_available():
@@ -69,10 +59,10 @@ def wait_for_the_master(local_rank: int = None):
         if not dist.is_initialized():
             return
         else:
-            dist.barrier()
+            barrier()
 
 
-def synchronize():
+def synchronize(group=None):
     """
     Helper function to synchronize (barrier) among all processes when using distributed training
     """
@@ -80,10 +70,11 @@ def synchronize():
         return
     if not dist.is_initialized():
         return
-    world_size = dist.get_world_size()
+    world_size = dist.get_world_size(group=group)
     if world_size == 1:
         return
-    dist.barrier()
+    
+    barrier(group=group)
 
 
 def get_world_size() -> int:
@@ -105,16 +96,9 @@ def get_rank() -> int:
 def get_local_rank() -> int:
     """
     Returns:
-        The rank of the current process within the local (per-machine) process group.
+        The rank of the current process within the local machine
     """
-    if _LOCAL_PROCESS_GROUP is None:
-        return get_rank()
-
-    if not dist.is_available():
-        return 0
-    if not dist.is_initialized():
-        return 0
-    return dist.get_rank(group=_LOCAL_PROCESS_GROUP)
+    return int(os.getenv("LOCAL_RANK", 0))
 
 
 def get_local_size() -> int:
@@ -122,33 +106,24 @@ def get_local_size() -> int:
     Returns:
         The size of the per-machine process group, i.e. the number of processes per machine.
     """
-    if not dist.is_available():
-        return 1
-    if not dist.is_initialized():
-        return 1
-    return dist.get_world_size(group=_LOCAL_PROCESS_GROUP)
+    return get_local_device_count()
 
 
 def is_main_process() -> bool:
     return get_rank() == 0
 
-
-@functools.lru_cache()
 def _get_global_gloo_group():
     """
     Return a process group based on gloo backend, containing all the ranks
     The result is cached.
     """
-    if dist.get_backend() == "nccl":
-        return dist.new_group(backend="gloo")
-    else:
-        return dist.group.WORLD
-
-
+    global __DEFAULT_GLOO_GROUP
+    assert __DEFAULT_GLOO_GROUP is not None, "Gloo group is not initialized"
+    return __DEFAULT_GLOO_GROUP
+    
 def _serialize_to_tensor(data, group):
     backend = dist.get_backend(group)
-    assert backend in ["gloo", "nccl"]
-    device = torch.device("cpu" if backend == "gloo" else "cuda")
+    device = torch.device("cpu") if backend == "gloo" else get_current_device()
 
     buffer = pickle.dumps(data)
     if len(buffer) > 1024 ** 3:
@@ -211,8 +186,9 @@ def all_gather(data, group=None):
         return [data]
 
     tensor = _serialize_to_tensor(data, group)
-
+    synchronize(group=group)
     size_list, tensor = _pad_to_largest_tensor(tensor, group)
+    synchronize(group=group)
     max_size = max(size_list)
 
     # receiving Tensor from all ranks
@@ -246,14 +222,16 @@ def gather(data, dst=0, group=None):
     """
     if get_world_size() == 1:
         return [data]
-    if group is None:
-        group = _get_global_gloo_group()
     if dist.get_world_size(group=group) == 1:
         return [data]
+    if group is None:
+        group = _get_global_gloo_group()
     rank = dist.get_rank(group=group)
 
     tensor = _serialize_to_tensor(data, group)
+    synchronize(group=group)
     size_list, tensor = _pad_to_largest_tensor(tensor, group)
+    synchronize(group=group)
 
     # receiving Tensor from all ranks
     if rank == dst:
@@ -292,3 +270,34 @@ def time_synchronized():
     if torch.cuda.is_available():
         torch.cuda.synchronize()
     return time.time()
+
+
+def barrier(group=None):
+    """Block until every rank of ``group`` (the default group if None) arrives."""
+    dist.barrier(group=group)
+
+def init_distributed(world_size: int, rank: int):
+    """Initialise the default process group (if needed) and the gloo side group."""
+    global __DEFAULT_GLOO_GROUP
+    if not dist.is_initialized():
+        dist.init_process_group(
+            backend=get_distributed_backend(),
+            world_size=world_size,
+            rank=rank,
+            init_method=get_distributed_init_method(),
+        )
+    # Also built when a launcher initialised the default group; _get_global_gloo_group() needs it.
+    if __DEFAULT_GLOO_GROUP is None:
+        __DEFAULT_GLOO_GROUP = dist.new_group(backend="gloo")
+
+def deinit_distributed():
+    if dist.is_initialized():
+        global __DEFAULT_GLOO_GROUP
+        try:
+            if __DEFAULT_GLOO_GROUP is not None:
+                dist.destroy_process_group(group=__DEFAULT_GLOO_GROUP)
+        except Exception as e:  # best effort: the default group is still torn down below
+            logger.warning(f"Error: {e}")
+        finally:
+            __DEFAULT_GLOO_GROUP = None  # drop the stale handle so init_distributed() can rebuild it
+            dist.destroy_process_group()
\ No newline at end of file
diff --git a/yolox/utils/metric.py b/yolox/utils/metric.py
index 506b58281..d925aae68 100644
--- a/yolox/utils/metric.py
+++ b/yolox/utils/metric.py
@@ -11,6 +11,8 @@
 
 import torch
 
+from yolox.utils.device_utils import get_current_device
+
 __all__ = [
     "AverageMeter",
     "MeterBuffer",
@@ -22,6 +24,8 @@
 
 
 def get_total_and_free_memory_in_Mb(cuda_device):
+    assert torch.cuda.is_available()
+
     devices_info_str = os.popen(
         "nvidia-smi --query-gpu=memory.total,memory.used --format=csv,nounits,noheader"
     )
@@ -37,10 +41,12 @@ def occupy_mem(cuda_device, mem_ratio=0.9):
     """
     pre-allocate gpu memory for training to avoid memory Fragmentation.
     """
+    assert torch.cuda.is_available()
+
     total, used = get_total_and_free_memory_in_Mb(cuda_device)
     max_mem = int(total * mem_ratio)
     block_mem = max_mem - used
-    x = torch.cuda.FloatTensor(256, 1024, block_mem)
+    x = torch.empty((256, 1024, block_mem), dtype=torch.float32, device=get_current_device())  # 1 MiB per unit of block_mem
     del x
     time.sleep(5)
 
@@ -49,6 +55,8 @@ def gpu_mem_usage():
     """
     Compute the GPU memory usage for the current device (MB).
     """
+    assert torch.cuda.is_available()
+    
     mem_usage_bytes = torch.cuda.max_memory_allocated()
     return mem_usage_bytes / (1024 * 1024)