
Commit 8c9717a

[AutoParallel] add sharding opt config (#6124)
1 parent ecb2f66 commit 8c9717a

5 files changed: +79, -3 lines changed
Lines changed: 39 additions & 0 deletions

@@ -0,0 +1,39 @@
+_base_: ./pretrain_gpt_base.yaml
+
+Global:
+  global_batch_size:
+  local_batch_size: 1
+  micro_batch_size: 1
+
+
+Model:
+  vocab_size: 50304
+  hidden_size: 5120
+  num_layers: 40
+  num_attention_heads: 40
+  ffn_hidden_size:
+  hidden_dropout_prob: 0.1
+  attention_probs_dropout_prob: 0.1
+  max_position_embeddings: 1024
+  type_vocab_size: 16
+  initializer_range: 0.02
+  fuse_attn_qkv: True
+  use_recompute: True
+  recompute_granularity:
+  no_recompute_layers:
+
+
+Distributed:
+  dp_degree:
+  mp_degree: 1
+  pp_degree: 1
+  sharding:
+    sharding_degree: 8
+    sharding_stage: 3
+    reduce_overlap: True
+    broadcast_overlap: True
+    param_comm_stream_num: 3
+    grad_comm_stream_num: 3
+    param_bucket_size_numel: 210355872
+    grad_bucket_size_numel: 210355872
+    enable_hierarchical_comm: False
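For scale, a rough back-of-the-envelope parameter count for the Model section above (a sketch under the usual GPT assumptions, not code from the commit; it ignores biases and layernorm and assumes ffn_hidden_size defaults to 4 * hidden_size when left unset):

    # approximate parameter count implied by the config above
    hidden_size = 5120
    num_layers = 40
    vocab_size = 50304
    max_position_embeddings = 1024

    embeddings = (vocab_size + max_position_embeddings) * hidden_size
    per_layer = 12 * hidden_size ** 2   # ~4*h^2 for attention + ~8*h^2 for the MLP
    total = embeddings + num_layers * per_layer

    print(f"~{total / 1e9:.1f}B parameters")   # prints ~12.8B, i.e. the "13B" scale

At that size the config uses sharding_stage: 3, which shards parameters as well as gradients and optimizer state across the 8 devices, and turns on both reduce_overlap and broadcast_overlap so the sharding communication can overlap with compute.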

model_zoo/gpt-3/ppfleetx/models/language_model/gpt/auto/auto_model.py

Lines changed: 5 additions & 2 deletions
@@ -93,7 +93,7 @@ def __init__(
         self.ipp = ipp

         self.head_dim = embed_dim // num_heads
-        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
+        assert self.head_dim * num_heads == self.embed_dim, "embed_dim[{}] must be divisible by num_heads[{}]".format(self.embed_dim, num_heads)

         if self.fuse_attn_qkv:
             assert self.kdim == embed_dim
@@ -290,7 +290,8 @@ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, use_cache=False,
         new_caches = []

         for i, mod in enumerate(self.layers):
-            mod = auto.shard_op(mod, auto_env.get_mesh()[mod.ipp])
+            ipp = mod.ipp
+            mod = auto.shard_op(mod, auto_env.get_mesh()[ipp])

             if cache is None:
                 if use_cache:
@@ -305,6 +306,8 @@ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, use_cache=False,
                 output, new_cache = mod(output, memory, tgt_mask=tgt_mask, use_cache=use_cache, cache=cache[i])
                 new_caches.append(new_cache)

+        auto.shard_tensor(output, auto_env.get_mesh()[ipp], [auto_env.get_mesh().dp_dim, None, None])
+
         if self.norm is not None:
             output = self.norm(output)
         return output if use_cache is False else (output, new_caches)
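The change keeps the last layer's pipeline-stage index (ipp) available outside the loop so the decoder output can be re-annotated on that stage's mesh, split along the data-parallel axis. A minimal, self-contained sketch of the pattern (the stand-in classes below are illustrative only; the real calls are paddle's auto.shard_op / auto.shard_tensor and ppfleetx's auto_env.get_mesh()):

    # Illustrative stand-ins; only the control flow mirrors the diff above.
    class Mesh:
        dp_dim = "dp"                          # data-parallel mesh axis
        def __getitem__(self, ipp):            # sub-mesh of pipeline stage `ipp`
            return f"stage-{ipp} mesh"

    def shard_op(layer, mesh):                 # stand-in for auto.shard_op
        print(f"run {layer.name} on {mesh}")
        return layer

    def shard_tensor(tensor, mesh, dims):      # stand-in for auto.shard_tensor
        print(f"shard {tensor} on {mesh} along {dims}")

    class Block:
        def __init__(self, name, ipp):
            self.name, self.ipp = name, ipp
        def __call__(self, x):
            return x

    mesh = Mesh()
    layers = [Block("layer0", 0), Block("layer1", 1)]
    output = "hidden_states"

    for mod in layers:
        ipp = mod.ipp                          # remember the stage index...
        mod = shard_op(mod, mesh[ipp])
        output = mod(output)

    # ...so it is still usable after the loop: the output is annotated on the
    # last stage's mesh, with the batch dim split along the data-parallel axis.
    shard_tensor(output, mesh[ipp], [mesh.dp_dim, None, None])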

model_zoo/gpt-3/ppfleetx/utils/auto_config.py

Lines changed: 8 additions & 0 deletions
@@ -46,6 +46,8 @@ def process_dist_configs(config):
     sharding_config = configs["sharding"]
     sharding_degree = sharding_config.setdefault("sharding_degree", 1)
     sharding_config.setdefault("sharding_stage", 2)
+    sharding_config.setdefault("reduce_overlap", False)
+    sharding_config.setdefault("broadcast_overlap", False)

     other_degree = mp_degree * pp_degree
@@ -184,6 +186,12 @@ def process_strategy(config):
     sharding.enable = sharding_cfg.get("sharding_degree", 1) > 1
     sharding.degree = sharding_cfg.get("sharding_degree", 1)
     sharding.stage = sharding_cfg.get("sharding_stage", 1)
+    sharding.enable_overlap = sharding_cfg.get("reduce_overlap", False) and sharding_cfg.get("broadcast_overlap", False)
+    sharding.param_comm_stream_num = sharding_cfg.get("param_comm_stream_num", 1)
+    sharding.grad_comm_stream_num = sharding_cfg.get("grad_comm_stream_num", 1)
+    sharding.param_bucket_size_numel = sharding_cfg.get("param_bucket_size_numel", 1)
+    sharding.grad_bucket_size_numel = sharding_cfg.get("grad_bucket_size_numel", 1)
+    sharding.enable_hierarchical_comm = sharding_cfg.get("enable_hierarchical_comm", False)

     pp_degree = config["Distributed"]["pp_degree"]
     accumulate_steps = config.Engine.get("accumulate_steps", 1)
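As a worked example (a sketch, not code from the commit): fed the sharding block of the new YAML config above, process_strategy populates the auto-parallel strategy roughly as below, with overlap enabled only because both reduce_overlap and broadcast_overlap are True:

    # hypothetical stand-alone rendering of the mapping; SimpleNamespace stands
    # in for the real strategy object used by process_strategy
    from types import SimpleNamespace

    sharding_cfg = {
        "sharding_degree": 8, "sharding_stage": 3,
        "reduce_overlap": True, "broadcast_overlap": True,
        "param_comm_stream_num": 3, "grad_comm_stream_num": 3,
        "param_bucket_size_numel": 210355872, "grad_bucket_size_numel": 210355872,
        "enable_hierarchical_comm": False,
    }

    sharding = SimpleNamespace()
    sharding.enable = sharding_cfg.get("sharding_degree", 1) > 1        # True
    sharding.degree = sharding_cfg.get("sharding_degree", 1)            # 8
    sharding.stage = sharding_cfg.get("sharding_stage", 1)              # 3
    sharding.enable_overlap = (sharding_cfg.get("reduce_overlap", False)
                               and sharding_cfg.get("broadcast_overlap", False))  # True
    sharding.param_bucket_size_numel = sharding_cfg.get("param_bucket_size_numel", 1)  # 210355872
    sharding.grad_bucket_size_numel = sharding_cfg.get("grad_bucket_size_numel", 1)    # 210355872
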
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+#! /bin/bash
+# Runs the "1.3B" parameter model
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+log_dir=log_auto
+rm -rf $log_dir
+
+# 10B+sharding8 run_pretrain
+python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \
+    ./tools/auto.py \
+    -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_13B_sharding8.yaml \
+    -o Engine.max_steps=1000 \
+    -o Engine.logging_freq=1 \
+    -o Engine.verbose=3

model_zoo/gpt-3/projects/gpt/auto_gpt_6.7B_sharding16.sh

Lines changed: 1 addition & 1 deletion
@@ -20,4 +20,4 @@ rm -rf $log_dir
 # 6.7B+sharding16 run_pretrain
 python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" \
     ./tools/auto.py \
-    -c ./ppfleetx/configs/nlp/gp/auto/pretrain_gpt_6.7B_sharding16.yaml
+    -c ./ppfleetx/configs/nlp/gpt/auto/pretrain_gpt_6.7B_sharding16.yaml
