Update the OFA example's multi-card startup method (#515)

LiuChiachi · ZeyuChen · web-flow · commit 9c81a4adf52b · 2021-06-17T12:00:15.000+08:00
* add device argument and gpu launch method

* update readme

* fix readme, and remove args.n_gpu

* fix ofa depth multi gpus bug

* add choice to device arg

Co-authored-by: Zeyu Chen <chenzeyu01@baidu.com>
diff --git a/examples/model_compression/ofa/README.md b/examples/model_compression/ofa/README.md
@@ -218,15 +218,16 @@ python -u ./run_glue.py \
     --logging_steps 1 \
     --save_steps 500 \
     --output_dir ./tmp/$TASK_NAME/ \
-    --n_gpu 1 \
+    --device gpu
 ```
 参数详细含义参考[README.md](../../benchmark/glue/README.md)
 Fine-tuning 在dev上的结果如压缩结果表1-1中Result那一列所示。
 
 
 ### 压缩训练
 
-```python
+单卡训练
+```shell
 python -u ./run_glue_ofa.py --model_type bert \
           --model_name_or_path ${task_pretrained_model_dir} \
           --task_name $TASK_NAME --max_seq_length 128     \
@@ -236,10 +237,29 @@ python -u ./run_glue_ofa.py --model_type bert \
           --logging_steps 10     \
           --save_steps 100     \
           --output_dir ./tmp/$TASK_NAME \
-          --n_gpu 1 \
+          --device gpu  \
+          --width_mult_list 1.0 0.8333333333333334 0.6666666666666666 0.5
+```
+
+多卡训练
+
+```shell
+unset CUDA_VISIBLE_DEVICES
+python -m paddle.distributed.launch --gpus "0,1" run_glue_ofa.py  \
+          --model_type bert \
+          --model_name_or_path ${task_pretrained_model_dir} \
+          --task_name $TASK_NAME --max_seq_length 128     \
+          --batch_size 32       \
+          --learning_rate 2e-5     \
+          --num_train_epochs 6     \
+          --logging_steps 10     \
+          --save_steps 100     \
+          --output_dir ./tmp/$TASK_NAME \
+          --device gpu  \
           --width_mult_list 1.0 0.8333333333333334 0.6666666666666666 0.5
 ```
 
+
 其中参数释义如下：
 - `model_type` 指示了模型类型，当前仅支持BERT模型。
 - `model_name_or_path` 指示了某种特定配置的模型，对应有其预训练模型和预训练时使用的 tokenizer。若模型相关内容保存在本地，这里也可以提供相应目录地址。
@@ -251,7 +271,7 @@ python -u ./run_glue_ofa.py --model_type bert \
 - `logging_steps` 表示日志打印间隔。
 - `save_steps` 表示模型保存及评估间隔。
 - `output_dir` 表示模型保存路径。
-- `n_gpu` 表示使用的 GPU 卡数。若希望使用多卡训练，将其设置为指定数目即可；若为0，则使用CPU。
+- `device` 表示训练使用的设备, 'gpu'表示使用GPU, 'xpu'表示使用百度昆仑卡, 'cpu'表示使用CPU。
 - `width_mult_list` 表示压缩训练过程中，对每层Transformer Block的宽度选择的范围。
 
 压缩训练之后在dev上的结果如压缩结果表格中Result with PaddleSlim那一列所示，延时情况如表1-2所示。
@@ -268,7 +288,7 @@ python -u ./export_model.py --model_type bert \
                              --max_seq_length 128     \
                  --sub_model_output_dir ./tmp/$TASK_NAME/dynamic_model \
                              --static_sub_model ./tmp/$TASK_NAME/static_model \
-                 --n_gpu 1 \
+                 --device gpu \
                  --width_mult  0.6666666666666666
 ```
 
@@ -278,7 +298,7 @@ python -u ./export_model.py --model_type bert \
 - `max_seq_length` 表示最大句子长度，超过该长度将被截断。默认：128.
 - `sub_model_output_dir` 指示了导出子模型动态图参数的目录。
 - `static_sub_model` 指示了导出子模型静态图模型及参数的目录，设置为None，则表示不导出静态图模型。默认：None。
-- `n_gpu` 表示使用的 GPU 卡数。若希望使用多卡训练，将其设置为指定数目即可；若为0，则使用CPU。默认：1.
+- `device` 表示导出模型时使用的设备, 'gpu'表示使用GPU, 'xpu'表示使用百度昆仑卡, 'cpu'表示使用CPU。
 - `width_mult` 表示导出子模型的宽度。默认：1.0.
 
 
diff --git a/examples/model_compression/ofa/run_glue_ofa.py b/examples/model_compression/ofa/run_glue_ofa.py
@@ -148,10 +148,11 @@ def parse_args():
     parser.add_argument(
         "--seed", type=int, default=42, help="random seed for initialization")
     parser.add_argument(
-        "--n_gpu",
-        type=int,
-        default=1,
-        help="number of gpus to use, 0 for cpu.")
+        "--device",
+        default="gpu",
+        type=str,
+        choices=["gpu", "cpu", "xpu"],
+        help="The device to select to train the model, is must be cpu/gpu/xpu.")
     parser.add_argument(
         '--width_mult_list',
         nargs='+',
@@ -317,7 +318,7 @@ def convert_example(example,
 
 
 def do_train(args):
-    paddle.set_device("gpu" if args.n_gpu else "cpu")
+    paddle.set_device(args.device)
     if paddle.distributed.get_world_size() > 1:
         paddle.distributed.init_parallel_env()
 
@@ -497,7 +498,7 @@ def do_train(args):
             optimizer.clear_grad()
 
             if global_step % args.logging_steps == 0:
-                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
+                if paddle.distributed.get_rank() == 0:
                     logger.info(
                         "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                         % (global_step, epoch, step, loss,
@@ -544,8 +545,7 @@ def do_train(args):
                         print("eval done total : %s s" %
                               (time.time() - tic_eval))
 
-                    if (not args.n_gpu > 1
-                        ) or paddle.distributed.get_rank() == 0:
+                    if paddle.distributed.get_rank() == 0:
                         output_dir = os.path.join(args.output_dir,
                                                   "model_%d" % global_step)
                         if not os.path.exists(output_dir):
@@ -570,7 +570,4 @@ def print_arguments(args):
 if __name__ == "__main__":
     args = parse_args()
     print_arguments(args)
-    if args.n_gpu > 1:
-        paddle.distributed.spawn(do_train, args=(args, ), nprocs=args.n_gpu)
-    else:
-        do_train(args)
+    do_train(args)
diff --git a/examples/model_compression/ofa/run_glue_ofa_depth.py b/examples/model_compression/ofa/run_glue_ofa_depth.py
@@ -153,10 +153,11 @@ def parse_args():
     parser.add_argument(
         "--seed", type=int, default=42, help="random seed for initialization")
     parser.add_argument(
-        "--n_gpu",
-        type=int,
-        default=1,
-        help="number of gpus to use, 0 for cpu.")
+        "--device",
+        default="gpu",
+        type=str,
+        choices=["gpu", "cpu", "xpu"],
+        help="The device to select to train the model, is must be cpu/gpu/xpu.")
     parser.add_argument(
         '--width_mult_list',
         nargs='+',
@@ -312,7 +313,7 @@ def convert_example(example,
 
 
 def do_train(args):
-    paddle.set_device("gpu" if args.n_gpu else "cpu")
+    paddle.set_device(args.device)
     if paddle.distributed.get_world_size() > 1:
         paddle.distributed.init_parallel_env()
 
@@ -384,8 +385,6 @@ def do_train(args):
     model = model_class.from_pretrained(
         args.model_name_or_path, num_classes=num_labels)
     origin_weights = model.state_dict()
-    if paddle.distributed.get_world_size() > 1:
-        model = paddle.DataParallel(model)
 
     # Step2: Convert origin model to supernet.
     sp_config = supernet(expand_ratio=args.width_mult_list)
@@ -430,6 +429,10 @@ def do_train(args):
     if args.task_name == "mnli":
         dev_data_loader = (dev_data_loader_matched, dev_data_loader_mismatched)
 
+    if paddle.distributed.get_world_size() > 1:
+        ofa_model.model = paddle.DataParallel(
+            ofa_model.model, find_unused_parameters=True)
+
     if args.max_steps > 0:
         num_training_steps = args.max_steps
         num_train_epochs = math.ceil(num_training_steps /
@@ -487,7 +490,7 @@ def do_train(args):
             ofa_model.model.clear_gradients()
 
             if global_step % args.logging_steps == 0:
-                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
+                if paddle.distributed.get_rank() == 0:
                     logger.info(
                         "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                         % (global_step, epoch, step, loss,
@@ -537,8 +540,7 @@ def do_train(args):
                             print("eval done total : %s s" %
                                   (time.time() - tic_eval))
 
-                        if (not args.n_gpu > 1
-                            ) or paddle.distributed.get_rank() == 0:
+                        if paddle.distributed.get_rank() == 0:
                             output_dir = os.path.join(args.output_dir,
                                                       "model_%d" % global_step)
                             if not os.path.exists(output_dir):
@@ -563,7 +565,4 @@ def print_arguments(args):
 if __name__ == "__main__":
     args = parse_args()
     print_arguments(args)
-    if args.n_gpu > 1:
-        paddle.distributed.spawn(do_train, args=(args, ), nprocs=args.n_gpu)
-    else:
-        do_train(args)
+    do_train(args)