diff --git a/yolox/data/datasets/voc.py b/yolox/data/datasets/voc.py
index 214a33b..425983f 100644
--- a/yolox/data/datasets/voc.py
+++ b/yolox/data/datasets/voc.py
@@ -144,7 +144,7 @@ def pull_item(self, index):
 
         target = self.load_anno(index)
 
-        img_info = (width, height)
+        img_info = (height, width)
 
         return img, target, img_info, index
 
diff --git a/yolox/models/yolo_head.py b/yolox/models/yolo_head.py
index d0a08f7..8a0ccec 100644
--- a/yolox/models/yolo_head.py
+++ b/yolox/models/yolo_head.py
@@ -205,7 +205,7 @@ def get_output_and_grid(self, output, k, stride):
         n_ch = 5 + self.num_classes
         hsize, wsize = output.shape[-2:]
         if grid.shape[2:4] != output.shape[2:4]:
-            xv, yv = meshgrid(F.arange(hsize), F.arange(wsize))
+            xv, yv = meshgrid(F.arange(wsize), F.arange(hsize))
             grid = F.stack((xv, yv), 2).reshape(1, 1, hsize, wsize, 2)
             self.grids[k] = grid
 
@@ -223,7 +223,7 @@ def decode_outputs(self, outputs):
         grids = []
         strides = []
         for (hsize, wsize), stride in zip(self.hw, self.strides):
-            xv, yv = meshgrid(F.arange(hsize), F.arange(wsize))
+            xv, yv = meshgrid(F.arange(wsize), F.arange(hsize))
             grid = F.stack((xv, yv), 2).reshape(1, -1, 2)
             grids.append(grid)
             shape = grid.shape[:2]