
Commit 1a8c099

gpt fused attention and feedforward (#2277)
* gpt add fuse attn ffn
* add fuse args
* update doc
* use FusedFeedForward and FusedMultiHeadAttention from paddle
* pre-commit
1 parent 38efa79 commit 1a8c099

4 files changed: 225 additions (+), 58 deletions (-)

Lines changed: 89 additions & 2 deletions
@@ -1,11 +1,98 @@
 ## Deploying Very Large Models
 
-TBD
+Contents:
+- [Introduction](#introduction)
+- [Environment Setup](#environment-setup)
+- [Model Export](#model-export)
+- [Automatic Partitioning](#automatic-partitioning)
+- [Inference Deployment](#inference-deployment)
+
+Very large models, with their huge parameter counts and heavy GPU/host memory usage, make efficient inference challenging. PaddlePaddle provides an end-to-end deployment solution covering distributed inference, large-model compression, and serving. Distributed inference uses [tensor model parallelism and pipeline parallelism](https://fleet-x.readthedocs.io/en/latest/paddle_fleet_rst/distributed_introduction.html), the same techniques commonly used to train very large models. Inference differs from training in several ways: the hardware characteristics, the number of devices, and the communication environment. To use the inference hardware efficiently, PaddlePaddle's adaptive parallel training technology is also applied to inference, partitioning the model adaptively for the inference hardware topology and environment.
+
+Model compression aims to improve inference efficiency and save deployment resources. PaddleSlim, PaddlePaddle's model compression toolkit, offers a rich set of methods such as quantization and sparsification, which can greatly shrink the model (and thus the number of deployment devices) while also reducing latency and increasing throughput. Compressing very large models still poses challenges. On the algorithm side, these models are usually very deep, so quantization error accumulates, and the high sparsity ratios required by sparsification easily cause accuracy loss. On the tooling side, because very large models occupy a lot of GPU memory, the compression tools must also work with the parallel training techniques, collecting quantization scales and supporting sparse-mask training on top of tensor model parallelism, Sharding parallelism, and pipeline parallelism. On the accuracy/efficiency trade-off side, quantization may target weights only or weights and activations, use 8-bit or 4-bit precision, and be applied partially or to the whole model; sparsification may be unstructured or semi-structured. The strategy must be chosen by weighing accuracy against inference speed and GPU/host memory, and quantized or sparse inference must be supported on top of distributed inference.
+
+For cloud deployment of very large models, PaddlePaddle also provides PaddleServing, which makes it easy to deploy across multiple machines and GPUs and automatically handles request batching, fault-tolerant scheduling, and so on.
+
+This tutorial uses GPT-3 as an example of deploying a very large model, focusing on model export, automatic partitioning, and inference deployment; model compression will be covered later. The serving tutorial is presented with other pretrained models.
+
+### Environment Setup
+
+Version requirements:
+
+Paddle: >= 2.3.0
+
+PaddleNLP: develop branch
+
+The previously released PaddlePaddle Python packages do not include distributed inference, so Paddle currently has to be built from source (this step will be streamlined later). Follow the [build-from-source guide](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/compile/linux-compile.html); NCCL must be installed, and the cmake command can be set as below. Note the `-DWITH_DISTRIBUTE=ON` option:
+
+```
+cmake .. -DPY_VERSION=3.7 -DWITH_GPU=ON -DWITH_TESTING=OFF -DCMAKE_BUILD_TYPE=Release -DWITH_DISTRIBUTE=ON
+```
+
 
 ### Model Export
 
-### Automatic Partitioning
+In PaddleNLP, GPT-3 supports both static-graph and dynamic-graph training. This tutorial is based on the static-graph training code; dynamic-graph support will follow.
+
+First, download PaddleNLP:
+
+```
+git clone https://github.com/PaddlePaddle/PaddleNLP.git
+cd PaddleNLP/examples/language_model/gpt-3/static/
+```
+
+The export script below defaults to a tensor-model-parallel degree of 1; set it according to the number of GPUs you plan to deploy on:
+
+```
+run_gen.sh
+```
+
+The key parameters are described below; you can also run `python run_generation.py --help` to see the full argument list and help messages.
+
+- gpus: number of GPUs to use, i.e. the degree of parallelism
+- model_type: model type
+- mp_degree: tensor model parallel degree
+- max_seq_len: maximum input length
+- max_dec_len: maximum output (decoding) length
+
+Note: with automatic partitioning, mp_degree does not need to be set; the automatic partitioning content will be added later.
+
+Run `bash run_gen.sh` and the model is exported to the current directory: `inference_model_pp1mp1` when mp_degree is 1, and `inference_model_pp1mp2` when mp_degree is 2.
 
 ### Inference Deployment
+Before deployment, make sure the model has been exported by following the export steps above.
+```
+cd PaddleNLP/examples/language_model/gpt-3/static/
+bash run_gen.sh  # export the model
+```
+Once the model is exported, inference can be run with the high-performance inference script. Taking two-card tensor model parallelism as an example, point `model_path` at the exported model directory and
+use the following command to run high-performance prediction with Paddle Inference:
+```
+cd ../deploy/python
+python -m paddle.distributed.launch \
+    --gpus 0,1 \
+    inference.py --model_type gpt \
+    --model_path ../../static/inference_model_pp1mp2/
+```
+
+#### Serving Deployment
+TBD
+
 
 ### Benchmark
+TBD
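As a complement to the deployment doc above, the following is a minimal, hedged sketch of loading a single-card export (mp_degree=1) with Paddle Inference's Python API, which is what `deploy/python/inference.py` builds on. The file names `gpt.pdmodel`/`gpt.pdiparams` are assumptions for illustration only; check the actual contents of the exported directory, and run multi-card models through the `paddle.distributed.launch` command shown above instead.

```
from paddle.inference import Config, create_predictor

# Hypothetical file names inside the exported directory; inspect
# inference_model_pp1mp1/ after running `bash run_gen.sh` for the real ones.
config = Config("inference_model_pp1mp1/gpt.pdmodel",
                "inference_model_pp1mp1/gpt.pdiparams")
config.enable_use_gpu(1000, 0)   # 1000 MB initial GPU memory pool on GPU 0
config.switch_ir_optim(True)     # enable IR graph optimizations

predictor = create_predictor(config)
# Inspect the feed/fetch interface of the exported program; the provided
# inference.py fills these handles with token ids before calling run().
print(predictor.get_input_names())
print(predictor.get_output_names())
```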

examples/language_model/gpt-3/static/args.py

Lines changed: 6 additions & 1 deletion
@@ -293,9 +293,14 @@ def parse_args(MODEL_CLASSES):
         help="The hyper-parameter in beam search.")
     parser.add_argument(
         "--save_inference_model_then_exist",
-        type=bool,
+        type=str2bool,
         default=False,
         help="save_inference_model_then_exist")
+    parser.add_argument(
+        "--fuse",
+        type=str2bool,
+        default=False,
+        help="Whether to enable fused_attention and fused_feedforward.")
 
     args = parser.parse_args()
     args.test_iters = args.eval_iters * 10
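Both flags are parsed with `str2bool` rather than `bool`, since argparse would otherwise treat any non-empty string (including "False") as true. The helper itself is not shown in this hunk; a typical implementation, sketched here only for reference (the actual one in args.py may differ), looks like this:

```
import argparse

def str2bool(v):
    """Parse common textual booleans so `--fuse false` works as expected."""
    if isinstance(v, bool):
        return v
    if v.lower() in ("yes", "true", "t", "y", "1"):
        return True
    if v.lower() in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("Unsupported bool value: %s" % v)
```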

examples/language_model/gpt-3/static/modeling.py

Lines changed: 127 additions & 54 deletions
@@ -364,6 +364,8 @@ class TransformerDecoderLayer(nn.Layer):
     It contains multiheadattention and some linear layers.
     """
 
+    Cache = collections.namedtuple("Cache", ["kv"])
+
     def __init__(self,
                  d_model,
                  nhead,
@@ -375,7 +377,8 @@ def __init__(self,
                  normalize_before=True,
                  weight_attr=None,
                  bias_attr=None,
-                 topo=None):
+                 topo=None,
+                 **kwargs):
         self._config = locals()
         self._config.pop("self")
         self._config.pop("__class__", None)  # py3
@@ -388,45 +391,94 @@ def __init__(self,
         weight_attrs = _convert_param_attr_to_list(weight_attr, 3)
         bias_attrs = _convert_param_attr_to_list(bias_attr, 3)
 
-        self.self_attn = MultiHeadAttention(
-            d_model,
-            nhead,
-            dropout=attn_dropout,
-            weight_attr=weight_attrs[0],
-            bias_attr=bias_attrs[0],
-            topo=topo)
-        if topo is None or topo.mp_info.size == 1:
-            self.linear1 = nn.Linear(
+        self._fuse = kwargs.get('fuse', False)
+        if self._fuse:
+            nranks, ring_id = 1, -1
+            if topo is not None and topo.mp_info.size > 1:
+                nranks = topo.mp_info.size
+                ring_id = 0
+            self.self_attn = incubate.nn.FusedMultiHeadAttention(
                 d_model,
-                dim_feedforward,
-                weight_attrs[2],
-                bias_attr=bias_attrs[2])
-            self.linear2 = nn.Linear(
-                dim_feedforward,
+                nhead,
+                dropout_rate=dropout,
+                attn_dropout_rate=attn_dropout,
+                normalize_before=normalize_before,
+                qkv_weight_attr=weight_attrs[0],
+                qkv_bias_attr=bias_attrs[0],
+                linear_weight_attr=weight_attrs[0],
+                linear_bias_attr=bias_attrs[0],
+                epsilon=1e-5,
+                nranks=nranks,
+                ring_id=ring_id)
+            self.ffn = incubate.nn.FusedFeedForward(
                 d_model,
-                weight_attrs[2],
-                bias_attr=bias_attrs[2])
+                dim_feedforward,
+                dropout_rate=act_dropout,
+                epsilon=1e-5,
+                activation=activation,
+                normalize_before=normalize_before,
+                act_dropout_rate=0.0,
+                linear1_weight_attr=weight_attrs[2],
+                linear1_bias_attr=bias_attrs[2],
+                linear2_weight_attr=weight_attrs[2],
+                linear2_bias_attr=bias_attrs[2],
+                nranks=nranks,
+                ring_id=ring_id)
         else:
-            self.linear1 = paddlenlp.ops.ColumnParallelLiner(
-                (d_model, dim_feedforward),
-                topo.mp_info.size,
-                gather_out=False,
-                param_attr=weight_attrs[2],
-                bias_attr=bias_attrs[2])
-            self.linear2 = paddlenlp.ops.RowParallelLiner(
-                (dim_feedforward, d_model),
-                topo.mp_info.size,
-                input_is_parallel=True,
-                param_attr=weight_attrs[2],
-                bias_attr=bias_attrs[2])
+            self.self_attn = MultiHeadAttention(
+                d_model,
+                nhead,
+                dropout=attn_dropout,
+                weight_attr=weight_attrs[0],
+                bias_attr=bias_attrs[0],
+                topo=topo)
+            if topo is None or topo.mp_info.size == 1:
+                self.linear1 = nn.Linear(
+                    d_model,
+                    dim_feedforward,
+                    weight_attrs[2],
+                    bias_attr=bias_attrs[2])
+                self.linear2 = nn.Linear(
+                    dim_feedforward,
+                    d_model,
+                    weight_attrs[2],
+                    bias_attr=bias_attrs[2])
+            else:
+                self.linear1 = paddlenlp.ops.ColumnParallelLiner(
+                    (d_model, dim_feedforward),
+                    topo.mp_info.size,
+                    gather_out=False,
+                    param_attr=weight_attrs[2],
+                    bias_attr=bias_attrs[2])
+                self.linear2 = paddlenlp.ops.RowParallelLiner(
+                    (dim_feedforward, d_model),
+                    topo.mp_info.size,
+                    input_is_parallel=True,
+                    param_attr=weight_attrs[2],
+                    bias_attr=bias_attrs[2])
 
-        self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5)
-        self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5)
-        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
-        self.dropout2 = nn.Dropout(act_dropout, mode="upscale_in_train")
-        self.activation = getattr(F, activation)
+            self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5)
+            self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5)
+            self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
+            self.dropout2 = nn.Dropout(act_dropout, mode="upscale_in_train")
+            self.activation = getattr(F, activation)
 
     def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None):
+        if self._fuse:
+            if isinstance(cache, self.Cache):
+                attn_output, cache_kv_out = self.self_attn(
+                    tgt, attn_mask=tgt_mask, cache=cache.kv)
+
+                ## if not assign here, update caches in While loop
+                # layers.assign(cache_kv_out, cache.kv)
+                if use_cache:
+                    cache = self.Cache(cache_kv_out)
+            else:
+                attn_output = self.self_attn(tgt, attn_mask=tgt_mask)
+
+            enc_out = self.ffn(attn_output)
+            return (enc_out, cache) if use_cache else enc_out
+
         residual = tgt
 
         if self.normalize_before:
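For context on the fused layers introduced above, here is a minimal, self-contained sketch of `paddle.incubate.nn.FusedMultiHeadAttention` and `FusedFeedForward` running on random data. The sizes are made up, the configuration is not the exact one modeling.py builds, and the fused operators require a CUDA build of Paddle (>= 2.3.0, as the deployment doc notes):

```
import paddle
from paddle.incubate.nn import FusedFeedForward, FusedMultiHeadAttention

# Toy sizes for illustration only.
batch, seq_len, d_model, nhead, dim_ffn = 2, 8, 64, 4, 256

# Each fused layer bundles pre-LayerNorm, the projections, dropout and the
# residual add into fused GPU kernels, replacing the separate nn.Linear /
# nn.LayerNorm / nn.Dropout sublayers of the non-fused branch.
attn = FusedMultiHeadAttention(
    d_model, nhead, dropout_rate=0.0, attn_dropout_rate=0.0,
    normalize_before=True)
ffn = FusedFeedForward(
    d_model, dim_ffn, dropout_rate=0.0, activation="gelu",
    normalize_before=True)

x = paddle.randn([batch, seq_len, d_model])
y = ffn(attn(x))   # one decoder block, minus the attention mask
print(y.shape)     # [2, 8, 64]
```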
@@ -687,7 +739,8 @@ def __init__(self,
                  eos_token_id=7,
                  bos_token_id=0,
                  eol_token_id=3,
-                 topo=None):
+                 topo=None,
+                 **kwargs):
         super(GPTModel, self).__init__()
 
         self.pad_token_id = pad_token_id
@@ -727,7 +780,8 @@ def __init__(self,
                     initializer=nn.initializer.Normal(
                         mean=0.0, std=self.initializer_range)),
                 bias_attr=None,
-                topo=topo))
+                topo=topo,
+                fuse=kwargs.get('fuse', False)))
 
         if self.pipline_mode:
             Decoder = paddlenlp.ops.guard('gpu:{}'.format(
@@ -866,7 +920,8 @@ def __init__(self,
                  temperature=1.0,
                  top_k=0,
                  top_p=1.0,
-                 eos_id=None):
+                 eos_id=None,
+                 **kwargs):
         super(GPTForGeneration, self).__init__()
         self.gpt = gpt
         self.apply(self.init_weights)
@@ -879,32 +934,43 @@ def __init__(self,
         self.temperature = temperature
         self.topk = top_k
         self.topp = top_p
-        self._fuse = False
         self._init_gen_cache = False
-        self.generation_caches = []
+        self.generation_caches = None
         self._dtype = "float32"
+        self._fuse = kwargs.get("fuse", False)
 
     def _init_generation_caches(self, src_ids):
-        if self._init_gen_cache:
+        # not fuse, return None
+        if self._init_gen_cache or self._fuse is False:
            return self.generation_caches
 
+        self.generation_caches = []
         num_heads = self.gpt.num_attention_heads
         num_layers = self.gpt.num_hidden_layers
         mp_n_head = num_heads // self.gpt.topo.mp_info.size
         hidden_size = self.gpt.hidden_size
         head_size = hidden_size // num_heads
         for i in range(num_layers):
-            k = layers.fill_constant_batch_size_like(
-                input=src_ids,
-                shape=[-1, mp_n_head, 0, head_size],
-                dtype=self._dtype,
-                value=0)
-            v = layers.fill_constant_batch_size_like(
-                input=src_ids,
-                shape=[-1, mp_n_head, 0, head_size],
-                dtype=self._dtype,
-                value=0)
-            self.generation_caches.append(MultiHeadAttention.Cache(k, v))
+            if self._fuse:
+                kv = layers.fill_constant_batch_size_like(
+                    input=src_ids,
+                    shape=[2, -1, mp_n_head, 0, head_size],
+                    dtype=self._dtype,
+                    value=0,
+                    output_dim_idx=1)
+                self.generation_caches.append(TransformerDecoderLayer.Cache(kv))
+            else:
+                k = layers.fill_constant_batch_size_like(
+                    input=src_ids,
+                    shape=[-1, mp_n_head, 0, head_size],
+                    dtype=self._dtype,
+                    value=0)
+                v = layers.fill_constant_batch_size_like(
+                    input=src_ids,
+                    shape=[-1, mp_n_head, 0, head_size],
+                    dtype=self._dtype,
+                    value=0)
+                self.generation_caches.append(MultiHeadAttention.Cache(k, v))
         self._init_gen_cache = True
         return self.generation_caches
 
@@ -1011,10 +1077,14 @@ def forward(self, inputs, use_cache=False, cache=None):
 
         # if cached_kvs are assigned to next step in _prepare_qkv of MultiHeadAttention,
         # need to init the global caches here
-        #gen_caches = self._init_generation_caches(input_ids)
+        gen_caches = self._init_generation_caches(input_ids)
 
         logits, cached_kvs = self.model(
-            input_ids, position_ids, encode_mask, use_cache=True)
+            input_ids,
+            position_ids,
+            encode_mask,
+            use_cache=True,
+            cache=gen_caches)
 
         next_id = paddle.argmax(logits[:, -1, :], axis=-1).reshape([-1, 1])
         ####################################
@@ -1092,7 +1162,10 @@ def forward(self, inputs, use_cache=False, cache=None):
             paddle.assign(layers.cast(cond, dtype='bool'), cond)
             if attention_mask:
                 paddle.assign(decode_mask, attention_mask)
-                for i in range(len(decode_cached_kvs)):
+            for i in range(len(decode_cached_kvs)):
+                if self._fuse:
+                    paddle.assign(decode_cached_kvs[i].kv, cached_kvs[i].kv)
+                else:
                     paddle.assign(decode_cached_kvs[i].k, cached_kvs[i].k)
                     paddle.assign(decode_cached_kvs[i].v, cached_kvs[i].v)
 
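The main data-structure change above is the generation cache: the non-fused path stores separate K and V tensors per layer (`MultiHeadAttention.Cache`), while the fused path stores a single stacked KV tensor per layer (`TransformerDecoderLayer.Cache`), the layout `FusedMultiHeadAttention` expects for its `cache` argument. A small sketch with made-up sizes:

```
import paddle

# Made-up sizes for illustration.
batch, num_heads, head_size, cached_len = 2, 16, 64, 1

# Non-fused cache: MultiHeadAttention.Cache(k, v), each tensor shaped
# [batch, num_heads, cached_len, head_size].
k = paddle.zeros([batch, num_heads, cached_len, head_size])
v = paddle.zeros([batch, num_heads, cached_len, head_size])

# Fused cache: TransformerDecoderLayer.Cache(kv), one tensor shaped
# [2, batch, num_heads, cached_len, head_size] -- the same layout
# _init_generation_caches builds with fill_constant_batch_size_like.
kv = paddle.stack([k, v], axis=0)
print(kv.shape)  # [2, 2, 16, 1, 64]
```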

examples/language_model/gpt-3/static/run_generation.py

Lines changed: 3 additions & 1 deletion
@@ -203,14 +203,16 @@ def do_generation(args):
         model_config[
             "attention_probs_dropout_prob"] = args.attention_probs_dropout_prob
         model_config["topo"] = topo
+        model_config["fuse"] = args.fuse
         model = GPTForGeneration(
             GPTModel(**model_config),
             max_length=args.max_dec_len,
             decoding_strategy=args.decoding_strategy,
             temperature=args.temperature,
             top_k=args.topk,
             top_p=args.topp,
-            eos_id=eos_id)
+            eos_id=eos_id,
+            fuse=args.fuse)
     else:
         logger.error("No checkpoint load.")
     model.eval()
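Because `--fuse` relies on operators that only exist in a CUDA build of a sufficiently new Paddle, a quick sanity check along these lines (not part of this commit) can be run before enabling the flag:

```
import paddle

assert paddle.is_compiled_with_cuda(), "fused attention/FFN need a CUDA build of Paddle"

# Both layers live under paddle.incubate.nn; an ImportError here means the
# installed Paddle is too old to support --fuse.
from paddle.incubate.nn import FusedFeedForward, FusedMultiHeadAttention  # noqa: F401

print("Paddle", paddle.__version__, "- fused attention/FFN available")
```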

0 commit comments
