
Commit 31a316f

Merge pull request #425 from yinhaofeng/collective_train

collective_train

2 parents e56ccd4 + 845920b

File tree

8 files changed: +126 −48 lines changed

doc/collective_mode.md

Lines changed: 59 additions & 0 deletions

@@ -0,0 +1,59 @@
# Running in Collective Mode

If you want to train your model faster by using several GPUs at once, try the `single-node multi-GPU` or `multi-node multi-GPU` mode.

## Version requirements

Make sure paddlepaddle-2.0.0-rc-gpu or a later GPU build of the PaddlePaddle open-source framework is installed.

## Setting up config.yaml

First add the use_fleet option to the model's yaml configuration and set it to True. Also set use_gpu to True:

```yaml
runner:
  # common options not repeated here
  ...
  # use fleet
  use_fleet: True
```

## Single-node multi-GPU training

### Choosing which GPUs to use

If nothing is set, all GPUs on the machine are used. To run on specific GPUs only, set the CUDA_VISIBLE_DEVICES environment variable. For example, on a machine with 8 GPUs, to train on the first 4 only, run `export CUDA_VISIBLE_DEVICES=0,1,2,3` and then start the training script.
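A complete single-node session might then look like this (a minimal sketch; the relative script path follows the launch commands shown in the next section):

```bash
# use only the first four of eight GPUs, then launch the dynamic-graph trainer
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m paddle.distributed.launch ../../../tools/trainer.py -m config.yaml
```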
### Launching training

```bash
# dynamic-graph training
python -m paddle.distributed.launch ../../../tools/trainer.py -m config.yaml
# static-graph training
python -m paddle.distributed.launch ../../../tools/static_trainer.py -m config.yaml
```

Note: when training with the static graph, make sure the create_optimizer function in the model's static_model.py sets up a distributed optimizer:

```python
def create_optimizer(self, strategy=None):
    optimizer = paddle.optimizer.Adam(
        learning_rate=self.learning_rate, lazy_mode=True)
    # fetch a distributed optimizer from the Fleet API, wrapping Paddle's base optimizer
    if strategy is not None:
        import paddle.distributed.fleet as fleet
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(self._cost)
```
## Multi-node multi-GPU training

Multi-node training requires one or more additional machines that can ping each other. Every machine must have paddlepaddle-2.0.0-rc-gpu or a later GPU build of the framework installed, and the PaddleRec model and its dataset must be copied to every machine.
- Make sure the nodes are connected and reachable from one another by IP.
- Every node must hold the code and the data.
- Run the launch command on every node.

Going from single-node to multi-node training requires no code changes; simply pass the extra ips argument, a comma-separated list of the machines' IPs:

```bash
# dynamic-graph training
python -m paddle.distributed.launch --ips="xx.xx.xx.xx,yy.yy.yy.yy" --gpus 0,1,2,3,4,5,6,7 ../../../tools/trainer.py -m config.yaml
# static-graph training
python -m paddle.distributed.launch --ips="xx.xx.xx.xx,yy.yy.yy.yy" --gpus 0,1,2,3,4,5,6,7 ../../../tools/static_trainer.py -m config.yaml
```
## Adapting the reader

The readers PaddleRec models use by default all inherit from paddle.io.IterableDataset, splitting input files and processing the data line by line inside the reader's __iter__ function. When num_workers > 0 in paddle.io.DataLoader, every worker subprocess traverses the full dataset and returns every sample, so the dataset is repeated num_workers times and each GPU receives all of the data. You may need to adjust the learning rate and other hyperparameters to preserve training quality.

To keep samples from being duplicated, query each worker subprocess with [paddle.io.get_worker_info](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/fluid/dataloader/dataloader_iter/get_worker_info_cn.html#get-worker-info) and partition the data inside __iter__. See [paddle.io.IterableDataset](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/fluid/dataloader/dataset/IterableDataset_cn.html#iterabledataset) for details and a data-splitting example.
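As a minimal sketch of that partitioning (a hypothetical line-oriented reader, not PaddleRec's shipped one), each worker keeps only its own slice of the file list:

```python
from paddle.io import IterableDataset, get_worker_info

class ShardedLineDataset(IterableDataset):
    """Yields each line of a file list exactly once across DataLoader workers."""

    def __init__(self, file_list):
        super().__init__()
        self.file_list = file_list  # hypothetical list of text-file paths

    def __iter__(self):
        info = get_worker_info()
        if info is None:
            files = self.file_list  # single-process loading: read everything
        else:
            # round-robin split of the files by worker id
            files = self.file_list[info.id::info.num_workers]
        for path in files:
            with open(path) as f:
                for line in f:
                    yield line.rstrip("\n")
```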

doc/yaml.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -24,6 +24,7 @@
 | use_inference | bool | True/False || whether to save the model through the save_inference_model interface |
 | save_inference_feed_varnames | list[string] | name of a designated Variable in the network || input variable names of the inference model |
 | save_inference_fetch_varnames | list[string] | name of a designated Variable in the network || output variable names of the inference model |
+| use_fleet | bool | True/False || whether to use distributed execution for single-node or multi-node multi-GPU training |


 ## hyper_parameters variables
```

models/rank/wide_deep/config.yaml

Lines changed: 3 additions & 1 deletion

```diff
@@ -19,7 +19,7 @@ runner:
   train_reader_path: "criteo_reader" # importlib format
   use_gpu: False
   use_auc: True
-  train_batch_size: 2
+  train_batch_size: 50
   epochs: 3
   print_interval: 2
   # model_init_path: "output_model_wide_deep/2" # init model
@@ -34,6 +34,8 @@ runner:
   use_inference: False
   save_inference_feed_varnames: ["C1","C2","C3","C4","C5","C6","C7","C8","C9","C10","C11","C12","C13","C14","C15","C16","C17","C18","C19","C20","C21","C22","C23","C24","C25","C26","dense_input"]
   save_inference_fetch_varnames: ["sigmoid_0.tmp_0"]
+  # use fleet
+  use_fleet: False

 # hyper parameters of user-defined network
 hyper_parameters:
```

tools/infer.py

Lines changed: 4 additions & 13 deletions

```diff
@@ -12,19 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 import paddle
 import os
 import paddle.nn as nn
@@ -68,6 +55,10 @@ def main(args):
     for parameter in args.opt:
         parameter = parameter.strip()
         key, value = parameter.split("=")
+        if type(config.get(key)) is int:
+            value = int(value)
+        if type(config.get(key)) is bool:
+            value = (True if value.lower() == "true" else False)
         config[key] = value

     # tools.vars
```
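With this change, typed overrides survive the round trip from the command line: a value such as "true" is cast back to a real bool (and numeric strings to int) when the config already stores that type. A hypothetical invocation, assuming the tools expose these key=value pairs through an -o/--opt flag as the loop over args.opt suggests (the option names below are illustrative):

```bash
# override typed runner options without editing config.yaml
python -u ../../../tools/infer.py -m config.yaml \
    -o runner.use_gpu=True runner.infer_batch_size=50
```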

tools/static_infer.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -53,6 +53,10 @@ def main(args):
     for parameter in args.opt:
         parameter = parameter.strip()
         key, value = parameter.split("=")
+        if type(config.get(key)) is int:
+            value = int(value)
+        if type(config.get(key)) is bool:
+            value = (True if value.lower() == "true" else False)
         config[key] = value
     # load static model class
     static_model_class = load_static_model_class(config)
```

tools/static_trainer.py

Lines changed: 30 additions & 6 deletions

```diff
@@ -55,6 +55,10 @@ def main(args):
     for parameter in args.opt:
         parameter = parameter.strip()
         key, value = parameter.split("=")
+        if type(config.get(key)) is int:
+            value = int(value)
+        if type(config.get(key)) is bool:
+            value = (True if value.lower() == "true" else False)
         config[key] = value
     # load static model class
     static_model_class = load_static_model_class(config)
@@ -63,9 +67,9 @@ def main(args):
     input_data_names = [data.name for data in input_data]

     fetch_vars = static_model_class.net(input_data)
+
     #infer_target_var = model.infer_target_var
     logger.info("cpu_num: {}".format(os.getenv("CPU_NUM")))
-    static_model_class.create_optimizer()

     use_gpu = config.get("runner.use_gpu", True)
     use_auc = config.get("runner.use_auc", False)
@@ -79,6 +83,7 @@ def main(args):
     model_init_path = config.get("runner.model_init_path", None)
     batch_size = config.get("runner.train_batch_size", None)
     reader_type = config.get("runner.reader_type", "DataLoader")
+    use_fleet = config.get("runner.use_fleet", False)
     os.environ["CPU_NUM"] = str(config.get("runner.thread_num", 1))
     logger.info("**************common.configs**********")
     logger.info(
@@ -88,6 +93,16 @@ def main(args):
     logger.info("**************common.configs**********")

     place = paddle.set_device('gpu' if use_gpu else 'cpu')
+
+    if use_fleet:
+        from paddle.distributed import fleet
+        strategy = fleet.DistributedStrategy()
+        fleet.init(is_collective=True, strategy=strategy)
+    if use_fleet:
+        static_model_class.create_optimizer(strategy)
+    else:
+        static_model_class.create_optimizer()
+
     exe = paddle.static.Executor(place)
     # initialize
     exe.run(paddle.static.default_startup_program())
@@ -132,11 +147,20 @@ def main(args):
         else:
             logger.info("reader type wrong")

-        save_static_model(
-            paddle.static.default_main_program(),
-            model_save_path,
-            epoch_id,
-            prefix='rec_static')
+        if use_fleet:
+            trainer_id = paddle.distributed.get_rank()
+            if trainer_id == 0:
+                save_static_model(
+                    paddle.static.default_main_program(),
+                    model_save_path,
+                    epoch_id,
+                    prefix='rec_static')
+        else:
+            save_static_model(
+                paddle.static.default_main_program(),
+                model_save_path,
+                epoch_id,
+                prefix='rec_static')

     if use_inference:
         feed_var_names = config.get("runner.save_inference_feed_varnames",
```
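Distilled from the diff above, the static-graph collective pattern is: initialize fleet with a DistributedStrategy before building the optimizer, pass the strategy into create_optimizer, and save only on rank 0. A minimal self-contained sketch of that pattern (toy network, not PaddleRec's static_model), launched via python -m paddle.distributed.launch:

```python
import paddle
from paddle.distributed import fleet

paddle.enable_static()

# initialize collective mode before creating the optimizer
strategy = fleet.DistributedStrategy()
fleet.init(is_collective=True, strategy=strategy)

# toy network standing in for static_model_class.net()
x = paddle.static.data(name="x", shape=[None, 10], dtype="float32")
cost = paddle.mean(paddle.static.nn.fc(x, size=1))

# wrap the base optimizer with the fleet distributed optimizer
optimizer = fleet.distributed_optimizer(paddle.optimizer.Adam(), strategy)
optimizer.minimize(cost)

exe = paddle.static.Executor(paddle.set_device("gpu"))
exe.run(paddle.static.default_startup_program())
```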

tools/to_static.py

Lines changed: 0 additions & 13 deletions

```diff
@@ -12,19 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 import paddle
 import os
 import paddle.nn as nn
```

tools/trainer.py

Lines changed: 25 additions & 15 deletions

```diff
@@ -12,19 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 import paddle
 import os
 import paddle.nn as nn
@@ -68,6 +55,10 @@ def main(args):
     for parameter in args.opt:
         parameter = parameter.strip()
         key, value = parameter.split("=")
+        if type(config.get(key)) is int:
+            value = int(value)
+        if type(config.get(key)) is bool:
+            value = (True if value.lower() == "true" else False)
         config[key] = value

     # tools.vars
@@ -79,6 +70,7 @@ def main(args):
     train_batch_size = config.get("runner.train_batch_size", None)
     model_save_path = config.get("runner.model_save_path", "model_output")
     model_init_path = config.get("runner.model_init_path", None)
+    use_fleet = config.get("runner.use_fleet", False)

     logger.info("**************common.configs**********")
     logger.info(
@@ -102,6 +94,14 @@ def main(args):
     # to do : add optimizer function
     optimizer = dy_model_class.create_optimizer(dy_model, config)

+    # use fleet to run collective training
+    if use_fleet:
+        from paddle.distributed import fleet
+        strategy = fleet.DistributedStrategy()
+        fleet.init(is_collective=True, strategy=strategy)
+        optimizer = fleet.distributed_optimizer(optimizer)
+        dy_model = fleet.distributed_model(dy_model)
+
     logger.info("read data")
     train_dataloader = create_data_loader(config=config, place=place)

@@ -186,8 +186,18 @@ def main(args):
             tensor_print_str + " epoch time: {:.2f} s".format(
                 time.time() - epoch_begin))

-        save_model(
-            dy_model, optimizer, model_save_path, epoch_id, prefix='rec')
+        if use_fleet:
+            trainer_id = paddle.distributed.get_rank()
+            if trainer_id == 0:
+                save_model(
+                    dy_model,
+                    optimizer,
+                    model_save_path,
+                    epoch_id,
+                    prefix='rec')
+        else:
+            save_model(
+                dy_model, optimizer, model_save_path, epoch_id, prefix='rec')


 if __name__ == '__main__':
```
