43 changes: 27 additions & 16 deletions README.md
@@ -1,20 +1,16 @@
# Pytorch-cifar100

This repository is a fork of weiaicunzai/pytorch-cifar100 with multi-GPU (DDP) training support.

## Requirements

- python >= 3.8
- torch >= 2.0.0
- tensorboard (optional)

## Usage

### 1. enter directory
```bash
$ cd pytorch-cifar100-ddp
```

### 2. dataset
@@ -32,14 +28,26 @@ $ tensorboard --logdir='runs' --port=6006 --host='localhost'
```

### 4. train the model
Specify the network you want to train with the `--net` argument:

```bash
# use cpu only (default) to train vgg16
$ python train.py --net vgg16
# use a single gpu to train vgg16
$ python train.py --net vgg16 --gpu [gpu_id]
# for example, use GPU 0 to train vgg16
$ python train.py --net vgg16 --gpu 0

# use multiple GPUs to train vgg16
$ torchrun --master_addr [MASTER_ADDR] --master_port [MASTER_PORT] --nproc_per_node [NUM_GPUs_Per_Node] train.py --net vgg16 --gpu [gpu1_id,gpu2_id,...,gpun_id]
# for example, use GPU 0 and GPU 1 in one node to train vgg16
$ torchrun --master_addr localhost --master_port 6000 --nproc_per_node 2 train.py --net vgg16 --gpu 0,1

# optionally set other training arguments, e.g. train for 100 epochs with batch size 256:
$ python train.py --net vgg16 --gpu 0 --epoch 100 --batch 256
```
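Under `torchrun`, each spawned process reads its local rank from the `LOCAL_RANK` environment variable (a standard torchrun convention) and uses it to pick one of the GPUs passed via `--gpu`. A minimal sketch of the idea, using a hypothetical helper rather than this repo's actual code:

```python
import os

def pick_device(gpu_ids: str) -> str:
    """Map this process's local rank to one of the GPUs listed in --gpu.

    gpu_ids is the comma-separated value passed to --gpu, e.g. "0,1".
    torchrun exports LOCAL_RANK for every process it launches; outside
    torchrun we fall back to rank 0. (Hypothetical helper for illustration.)
    """
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    ids = gpu_ids.split(",")
    return f"cuda:{ids[local_rank]}"

# The process launched with LOCAL_RANK=1 via `--nproc_per_node 2 ... --gpu 0,1`:
os.environ["LOCAL_RANK"] = "1"
print(pick_device("0,1"))  # cuda:1
```

Each process then moves its model replica to its own device before wrapping it in DDP.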

Sometimes you might want to use warmup training by setting ```--warmup``` to 1 or 2, to prevent the network from diverging during the early training phase.
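The idea behind warmup is to ramp the learning rate up linearly over the first epoch or two before the main schedule takes over. A minimal sketch of a linear warmup rule (an illustrative helper, not this repo's `WarmUpLR` implementation):

```python
def warmup_lr(base_lr: float, step: int, warmup_steps: int) -> float:
    """Linearly scale the learning rate from near zero to base_lr.

    During the first warmup_steps steps the rate grows linearly; afterwards
    base_lr is returned unchanged, leaving the main schedule in control.
    (Illustrative only.)
    """
    if step < warmup_steps:
        # +1 so the very first step already uses a small non-zero rate
        return base_lr * (step + 1) / warmup_steps
    return base_lr

# With base_lr=0.1 and a warmup window of 4 steps:
print([round(warmup_lr(0.1, s, 4), 3) for s in range(6)])
# [0.025, 0.05, 0.075, 0.1, 0.1, 0.1]
```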

The supported net args are:
@@ -92,12 +100,15 @@ Normally, the weights file with the best accuracy would be written to the disk w


### 5. test the model
Test the model using test.py (DDP is not implemented here, since it is unnecessary for evaluation):
```bash
# use cpu (default)
$ python test.py --net vgg16 --weights path_to_vgg16_weights_file
# use gpu
$ python test.py --net vgg16 --weights path_to_vgg16_weights_file --gpu 0
```
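test.py reports top-1 and top-5 accuracy; the computation boils down to checking whether the true label appears among the k highest-scoring classes for each sample. A plain-Python sketch of the idea (not the script's actual torch `topk` code):

```python
def topk_correct(scores, label, k):
    """Return True if `label` is among the k classes with the highest scores."""
    # Rank class indices by score, highest first
    ranked = sorted(range(len(scores)), key=lambda c: scores[c], reverse=True)
    return label in ranked[:k]

# One sample over 5 classes; the true label is class 2 (score 0.25)
scores = [0.1, 0.3, 0.25, 0.2, 0.15]
print(topk_correct(scores, 2, 1))  # False: class 1 holds the top score
print(topk_correct(scores, 2, 5))  # True
```

Averaging these per-sample booleans over the whole test set gives the top-1 and top-5 accuracy figures.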

## Implemented Networks

- vgg [Very Deep Convolutional Networks for Large-Scale Image Recognition](https://arxiv.org/abs/1409.1556v6)
- googlenet [Going Deeper with Convolutions](https://arxiv.org/abs/1409.4842v1)
56 changes: 35 additions & 21 deletions test.py
@@ -8,37 +8,56 @@
author baiyu
"""

import os
import argparse

from collections import OrderedDict
from matplotlib import pyplot as plt

import torch
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

from conf import settings
from utils import get_network, get_dataloader

if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--net', type=str, required=True, help='net type')
    parser.add_argument('--weights', type=str, required=True, help='the weights file you want to test')
    parser.add_argument('--gpu', type=str, default='-1', help='gpu device id; set `-1` to use cpu only')
    parser.add_argument('--batch', '-b', type=int, default=16, help='batch size for dataloader')
    args = parser.parse_args()

    if args.gpu == '-1':
        device = 'cpu'
    elif torch.cuda.is_available():
        device = f'cuda:{args.gpu}'
    else:
        raise ValueError('GPU is not available. Please set `--gpu -1` to use the CPU only.')

    net = get_network(args.net).to(device)

    _, cifar100_test_loader = get_dataloader(
        settings.CIFAR100_TRAIN_MEAN,
        settings.CIFAR100_TRAIN_STD,
        #settings.CIFAR100_PATH,
        rank=0,
        num_workers=4,
        batch_size=args.batch,
        const_test_batch=False
    )

    state_dict = torch.load(args.weights, weights_only=True)
    new_state_dict = OrderedDict()

    # If the model was trained and saved with DDP, strip the `module.` prefix
    # that DistributedDataParallel adds to every parameter name.
    for k, v in state_dict.items():
        if k.startswith('module.'):
            new_state_dict[k[7:]] = v
        else:
            new_state_dict[k] = v

    net.load_state_dict(new_state_dict)
    print(net)
    net.eval()

@@ -50,12 +69,7 @@
    for n_iter, (image, label) in enumerate(cifar100_test_loader):
        print("iteration: {}\ttotal {} iterations".format(n_iter + 1, len(cifar100_test_loader)))

        image, label = image.to(device), label.to(device)

        output = net(image)
        _, pred = output.topk(5, 1, largest=True, sorted=True)
@@ -69,7 +83,7 @@
        #compute top1
        correct_1 += correct[:, :1].sum()

    if device != 'cpu':
        print('GPU INFO.....')
        print(torch.cuda.memory_summary(), end='')
