Add huggingface gpt converter. (#859)

ZHUI · web-flow · commit eee6be98dbab · 2021-08-06T23:30:06.000+08:00
* Add huggingface gpt converter.

* update some doc.

* refine

* refine
diff --git a/examples/language_model/gpt/README.md b/examples/language_model/gpt/README.md
@@ -9,6 +9,7 @@ GPT-[2](https://cdn.openai.com/better-language-models/language_models_are_unsupe
 .
 ├── args.py                 # 训练参数配置
 ├── create_pretraining_data.py         # 数据预处理脚本
+├── converter.py            # 权重转化脚本
 ├── dataset.py              # 数据处理
 ├── decompress.sh           # 数据集解压脚本
 ├── deploy/                 # 模型部署的inference脚本
@@ -219,12 +220,9 @@ python deploy/python/inference.py --model_type gpt \
 
 用户可以看到屏幕输出预测结果。
 
-## 飞桨4D混合并行训练
-飞桨4D混合并行，使用sharding、模型并行、流水线并行和数据并行策略，使得训练千亿参数规模的模型成为可能。在本示例中，我们提供了基于飞桨最新混合并行策略的GPT预训练模型。运行下面脚本，即可进行模型预训练：
-```shell
-sh scripts/run_static.sh
-```
-用户可以根据自己的机器资源，灵活调整并行策略，选择最合适的策略来训练模型。更多关于混合并行策略的的例子详见[飞桨4D混合并行训练使用指南](https://fleet-x.readthedocs.io/en/latest/paddle_fleet_rst/collective/collective_mp/hybrid_parallelism.html)
+## 其他
+
+本项目提供了Huggingface的权重转化示例`converter.py`，运行`python converter.py xxx-gpt.bin`即可完成转换。用户可以参考转化脚本，转换自己需要的模型权重。
 
 ## 参考文献
 - [Language Models are Unsupervised Multitask Learners](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)
diff --git a/examples/language_model/gpt/converter.py b/examples/language_model/gpt/converter.py
@@ -0,0 +1,85 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import paddle
+import torch
+import numpy as np
+
+paddle.set_device("cpu")
+
+model = torch.load(sys.argv[1], map_location='cpu')
+
+print("The origin model keys:")
+for x in sorted(list(model.keys())):
+    print(x)
+
+state = {}
+for sub_name, sub_param in model.items():
+    if sub_name.startswith("transformer"):
+        sub_name = sub_name[12:-1]
+    if sub_name.startswith("h."):
+        final_name = sub_name.replace("h.", "gpt.decoder.layers.")
+    else:
+        final_name = sub_name
+    state[final_name] = sub_param.numpy()
+
+
+def trans_name(key):
+    k = key
+    k = k.replace("mlp.c_fc", "linear1")
+    k = k.replace("mlp.c_proj", "linear2")
+    k = k.replace("attn.c_proj", "self_attn.out_proj")
+    k = k.replace("ln_1", "norm1")
+    k = k.replace("ln_2", "norm2")
+    k = k.replace("ln_f", "gpt.decoder.norm")
+    k = k.replace("wte", "gpt.embeddings.word_embeddings")
+    k = k.replace("wpe", "gpt.embeddings.position_embeddings")
+    return k
+
+
+new_state_dict = {}
+all_num = 0
+for key in sorted(list(state.keys())):
+    all_num += state[key].size
+    new_key = trans_name(key)
+    if "attn.c_attn" in key:
+        shape = state[key].shape
+        print(shape)
+        if "weight" in key:
+            q, k, v = np.split(state[key], 3, axis=1)
+        else:
+            print("BIAS SHAPE", state[key].shape, state[key].transpose().shape)
+            q, k, v = np.split(state[key], 3, axis=-1)
+            q = q.reshape((-1))
+            k = k.reshape((-1))
+            v = v.reshape((-1))
+        q_name = new_key.replace("attn.c_attn", "self_attn.q_proj")
+        k_name = new_key.replace("attn.c_attn", "self_attn.k_proj")
+        v_name = new_key.replace("attn.c_attn", "self_attn.v_proj")
+        new_state_dict[q_name] = paddle.to_tensor(q, dtype="float32")
+        new_state_dict[k_name] = paddle.to_tensor(k, dtype="float32")
+        new_state_dict[v_name] = paddle.to_tensor(v, dtype="float32")
+        continue
+    new_state_dict[new_key] = paddle.to_tensor(state[key], dtype="float32")
+print("all shape numel:{}".format(all_num))
+for key, value in new_state_dict.items():
+    print("key:{}, shape:{}, dtype:{}".format(key, value.shape, value.dtype))
+
+orgin_path = sys.argv[1]
+if ".bin" in orgin_path:
+    save_path = orgin_path.replace(".bin", ".pdparams")
+else:
+    save_path = os.path.join(orgin_path, ".pdparams")
+paddle.save(new_state_dict, save_path)
diff --git a/examples/language_model/gpt/predict.py b/examples/language_model/gpt/predict.py
@@ -59,7 +59,7 @@ def ask_question_cn(self, question):
 
     def ask_question_en(self, question):
         self.predict(
-            "Question: Where is the capital of China? Answer: Beijing. \nQuestion: %s  "
+            "Question: Where is the capital of China? Answer: Beijing. \n Question:%s Answer:"
             % question)
 
     # dictation poetry