
Commit b2a24c6

[New features] add roberta & gpt conversion (#4407)

* add roberta & gpt conversion
* update gpt model
* revert roberta related files
* update gpt loading
* update requirements
* fix input_ids

1 parent d9cd8c3 commit b2a24c6

File tree

9 files changed: +241, -27 lines changed

.github/workflows/tests.yml
Lines changed: 1 addition & 0 deletions

@@ -18,6 +18,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
+        pip install -r tests/requirements.txt
         make install
     - name: run the command
       run: make test

paddlenlp/transformers/conversion_utils.py
Lines changed: 25 additions & 11 deletions

@@ -255,7 +255,7 @@ def should_merge_last_two_dim(self) -> bool:
         """check that wether merge last two dim"""
         return self.action == "merge_last_two_dim"

-    def run(self, tensor: ndarray) -> ndarray:
+    def run(self, state_dict: dict[str, ndarray], name: str) -> ndarray:
         """run some custom operation on ndarray, eg: transpose, merge_last_two_dim

         Args:

@@ -264,12 +264,21 @@ def run(self, tensor: ndarray) -> ndarray:
         Returns:
             ndarray: the final tensor
         """
+        tensor = state_dict.pop(name)
         if self.action == "transpose":
             return transpose(tensor, [1, 0])
         if self.action == "merge_last_two_dim":
             shape = tensor.shape
             assert len(shape) == 3
             return np.reshape(tensor, [shape[0], -1])
+        if self.action == "split":
+            assert self.index is not None, "when action is `split`, index field is required."
+
+            if self.index < 2:
+                state_dict[name] = tensor
+            # qkv is stored in same tensor, so it should be split into 3 arr
+            tensors = np.split(tensor, 3, axis=-1)
+            return tensors[self.index]
         return tensor

     def matched(self, text: str) -> bool:

@@ -490,6 +499,9 @@ class LogitComparer:
     config_fields_to_be_removed: List[str] = ["transformers_version"]
     architectures: Dict[str, Type[PretrainedModel]] = {}

+    def __init__(self, input_dir: str) -> None:
+        self.input_dir = input_dir
+
     def get_paddle_pytorch_model_classes(self) -> Tuple[object, object]:
         """return the [PaddleModelClass, PytorchModelClass] to
         1. generate paddle model automatically

@@ -574,13 +586,15 @@ def compare_model_state_dicts(
         for name_mapping in name_mappings:
             model_state_saver.add(name_mapping.target_name, "pytorch_key", name_mapping.source_name)

-            paddle_numpy = paddle_state_dict.pop(name_mapping.target_name)
-            model_state_saver.add(name_mapping.target_name, "paddle", paddle_numpy)
-            model_state_saver.add(name_mapping.target_name, "paddle-shape", str(paddle_numpy.shape))
+            if name_mapping.target_name in paddle_state_dict:
+                paddle_numpy = paddle_state_dict.pop(name_mapping.target_name)
+                model_state_saver.add(name_mapping.target_name, "paddle", paddle_numpy)
+                model_state_saver.add(name_mapping.target_name, "paddle-shape", str(paddle_numpy.shape))

-            pytorch_numpy = pytorch_state_dict.pop(name_mapping.source_name)
-            model_state_saver.add(name_mapping.target_name, "pytorch", pytorch_numpy)
-            model_state_saver.add(name_mapping.target_name, "pytorch-shape", str(pytorch_numpy.shape))
+            if name_mapping.source_name in pytorch_state_dict:
+                pytorch_numpy = pytorch_state_dict.pop(name_mapping.source_name)
+                model_state_saver.add(name_mapping.target_name, "pytorch", pytorch_numpy)
+                model_state_saver.add(name_mapping.target_name, "pytorch-shape", str(pytorch_numpy.shape))

         model_state_saver.summary()

@@ -594,8 +608,7 @@ def compare_logits(self) -> bool:
         paddle_model = PaddleModel.from_pretrained(self.input_dir)

         # 0. init the name_mapping & tensor_info_saver & logit_hooker
-        num_layers = self.get_num_layer(list(paddle_model.state_dict().keys()))
-        name_mappings = self.get_name_mapping(num_layers, paddle_model.config["architectures"])
+        name_mappings = self.get_name_mapping(paddle_model.config)
         tensor_info_saver = TensorInfoSaver()

         logit_hooker = LogitHooker(name_mappings, tensor_info_saver)

@@ -707,8 +720,9 @@ def convert(cls, weight_file: str, config: PretrainedConfig, cache_dir: str) ->
                 logger.warning(f"key<{name_mapping.source_name}> not in the pytorch weight file.")
                 continue

-            state_dict[name_mapping.target_name] = name_mapping.run(state_dict.pop(name_mapping.source_name))
-            all_layer_names.remove(name_mapping.source_name)
+            state_dict[name_mapping.target_name] = name_mapping.run(state_dict, name_mapping.source_name)
+            if name_mapping.source_name in all_layer_names:
+                all_layer_names.remove(name_mapping.source_name)

         if all_layer_names:
             logger.warning(f"there are {len(all_layer_names)} tensors not initialized:")

paddlenlp/transformers/gpt/configuration.py
Lines changed: 12 additions & 1 deletion

@@ -240,7 +240,18 @@ class GPTConfig(PretrainedConfig):
     >>> configuration = model.config
     ```"""
     model_type = "gpt"
-    attribute_map: Dict[str, str] = {"num_classes": "num_labels", "dropout": "classifier_dropout"}
+    attribute_map: Dict[str, str] = {
+        "num_classes": "num_labels",
+        "dropout": "classifier_dropout",
+        "n_positions": "max_position_embeddings",
+        "n_embd": "hidden_size",
+        "n_layer": "num_hidden_layers",
+        "n_head": "num_attention_heads",
+        "n_inner": "intermediate_size",
+        "activation_function": "hidden_act",
+        "resid_pdrop": "attention_probs_dropout_prob",
+    }
+
     pretrained_init_configuration = GPT_PRETRAINED_INIT_CONFIGURATION

     def __init__(
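
The extended attribute_map lets a GPTConfig answer to the Hugging Face GPT-2 field names (n_embd, n_layer, n_head, ...) while storing the canonical PaddleNLP fields. Below is a minimal standalone sketch of that aliasing pattern, assuming it mirrors the Hugging Face PretrainedConfig mechanism where reads and writes on an alias are redirected to the canonical attribute; TinyConfig is a toy class for illustration, not the real GPTConfig.

    class TinyConfig:
        # alias name -> canonical attribute actually stored on the instance
        attribute_map = {"n_embd": "hidden_size", "n_layer": "num_hidden_layers"}

        def __init__(self, hidden_size=768, num_hidden_layers=12):
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers

        def __getattr__(self, name):
            # only reached when normal lookup fails, i.e. for alias names
            aliases = type(self).attribute_map
            if name in aliases:
                return getattr(self, aliases[name])
            raise AttributeError(name)

        def __setattr__(self, name, value):
            # writes through an alias land on the canonical attribute
            name = type(self).attribute_map.get(name, name)
            super().__setattr__(name, value)

    cfg = TinyConfig()
    cfg.n_layer = 24  # actually updates num_hidden_layers
    assert cfg.n_embd == 768 and cfg.num_hidden_layers == 24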

paddlenlp/transformers/gpt/modeling.py
Lines changed: 75 additions & 0 deletions

@@ -13,6 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import annotations

 import collections

@@ -24,6 +25,7 @@
 from paddle.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from paddle.nn.layer.transformer import _convert_param_attr_to_list

+from ...utils.converter import StateDictNameMapping
 from ...utils.log import logger
 from .. import PretrainedModel, register_base_model
 from ..model_outputs import (

@@ -460,6 +462,79 @@ class GPTPretrainedModel(PretrainedModel):
     base_model_prefix = "gpt"
     config_class = GPTConfig

+    @classmethod
+    def _get_name_mappings(cls, config: GPTConfig) -> list[StateDictNameMapping]:
+        mappings: list[StateDictNameMapping] = []
+        model_mappings = [
+            ["wte.weight", "embeddings.word_embeddings.weight"],
+            ["wpe.weight", "embeddings.position_embeddings.weight"],
+            ["ln_f.weight", "decoder.norm.weight"],
+            ["ln_f.bias", "decoder.norm.bias"],
+        ]
+        for layer_index in range(config.num_hidden_layers):
+            layer_mappings = [
+                [f"h.{layer_index}.ln_1.weight", f"decoder.layers.{layer_index}.norm1.weight"],
+                [f"h.{layer_index}.ln_1.bias", f"decoder.layers.{layer_index}.norm1.bias"],
+                [f"h.{layer_index}.ln_2.weight", f"decoder.layers.{layer_index}.norm2.weight"],
+                [f"h.{layer_index}.ln_2.bias", f"decoder.layers.{layer_index}.norm2.bias"],
+                [f"h.{layer_index}.mlp.c_fc.weight", f"decoder.layers.{layer_index}.linear1.weight"],
+                [f"h.{layer_index}.mlp.c_fc.bias", f"decoder.layers.{layer_index}.linear1.bias"],
+                [f"h.{layer_index}.mlp.c_proj.weight", f"decoder.layers.{layer_index}.linear2.weight"],
+                [f"h.{layer_index}.mlp.c_proj.bias", f"decoder.layers.{layer_index}.linear2.bias"],
+                [f"h.{layer_index}.attn.c_proj.weight", f"decoder.layers.{layer_index}.self_attn.out_proj.weight"],
+                [f"h.{layer_index}.attn.c_proj.bias", f"decoder.layers.{layer_index}.self_attn.out_proj.bias"],
+                # attention
+                [
+                    f"h.{layer_index}.attn.c_attn.weight",
+                    f"decoder.layers.{layer_index}.self_attn.q_proj.weight",
+                    "split",
+                    0,
+                ],
+                [
+                    f"h.{layer_index}.attn.c_attn.bias",
+                    f"decoder.layers.{layer_index}.self_attn.q_proj.bias",
+                    "split",
+                    0,
+                ],
+                [
+                    f"h.{layer_index}.attn.c_attn.weight",
+                    f"decoder.layers.{layer_index}.self_attn.k_proj.weight",
+                    "split",
+                    1,
+                ],
+                [
+                    f"h.{layer_index}.attn.c_attn.bias",
+                    f"decoder.layers.{layer_index}.self_attn.k_proj.bias",
+                    "split",
+                    1,
+                ],
+                [
+                    f"h.{layer_index}.attn.c_attn.weight",
+                    f"decoder.layers.{layer_index}.self_attn.v_proj.weight",
+                    "split",
+                    2,
+                ],
+                [
+                    f"h.{layer_index}.attn.c_attn.bias",
+                    f"decoder.layers.{layer_index}.self_attn.v_proj.bias",
+                    "split",
+                    2,
+                ],
+            ]
+
+            model_mappings.extend(layer_mappings)
+
+        if "GPT2Model" not in config.architectures:
+            for mapping in model_mappings:
+                mapping[0] = "transformer." + mapping[0]
+                mapping[1] = "gpt." + mapping[1]
+
+        if "GPT2LMHeadModel" in config.architectures:
+            model_mappings.append(["lm_head.weight", "lm_head.decoder_weight"])
+
+        mappings = [StateDictNameMapping(*mapping) for mapping in model_mappings]
+        return mappings
+
     def init_weights(self, layer):
         """Initialization hook"""
         if isinstance(layer, (nn.Linear, nn.Embedding)):
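
The two config.architectures checks at the end decide how the raw mapping rows are addressed: when the checkpoint is not the bare GPT2Model, every source key gains the "transformer." prefix and every target key the "gpt." prefix, and for GPT2LMHeadModel the tied LM head weight is appended. A toy walk-through of just that branch; the two mapping rows and the hard-coded architectures value below are made up for the example.

    model_mappings = [
        ["wte.weight", "embeddings.word_embeddings.weight"],
        ["h.0.attn.c_proj.weight", "decoder.layers.0.self_attn.out_proj.weight"],
    ]
    architectures = ["GPT2LMHeadModel"]  # stand-in for config.architectures

    if "GPT2Model" not in architectures:
        for mapping in model_mappings:
            mapping[0] = "transformer." + mapping[0]  # prefix used in the torch checkpoint
            mapping[1] = "gpt." + mapping[1]          # prefix used in the paddle model

    if "GPT2LMHeadModel" in architectures:
        model_mappings.append(["lm_head.weight", "lm_head.decoder_weight"])

    assert model_mappings[0] == ["transformer.wte.weight", "gpt.embeddings.word_embeddings.weight"]
    assert model_mappings[-1] == ["lm_head.weight", "lm_head.decoder_weight"]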

paddlenlp/transformers/roberta/modeling.py
Lines changed: 5 additions & 6 deletions

@@ -13,22 +13,21 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import annotations

 import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union
 from .. import PretrainedModel, register_base_model
 from ..model_outputs import (
     BaseModelOutputWithPoolingAndCrossAttentions,
+    CausalLMOutputWithCrossAttentions,
+    MaskedLMOutput,
+    MultipleChoiceModelOutput,
+    QuestionAnsweringModelOutput,
     SequenceClassifierOutput,
     TokenClassifierOutput,
-    QuestionAnsweringModelOutput,
-    MultipleChoiceModelOutput,
-    MaskedLMOutput,
-    CausalLMOutputWithCrossAttentions,
 )
 from .configuration import PRETRAINED_INIT_CONFIGURATION, RobertaConfig

paddlenlp/utils/serialization.py
Lines changed: 4 additions & 4 deletions

@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from __future__ import annotations

 import io

@@ -209,7 +208,7 @@ def persistent_load_stage1(saved_id):
     result_stage1 = unpickler_stage1.load()

     # 2. get the metadata of weight file
-    metadata = []
+    metadata = {}

     def extract_maybe_dict(result):
         if isinstance(result, dict):

@@ -219,11 +218,12 @@ def extract_maybe_dict(result):
             for res in result:
                 extract_maybe_dict(res)
         elif isinstance(result, TensorMeta):
-            if result not in metadata:
-                metadata.append(result)
+            metadata[result.key] = result

     extract_maybe_dict(result_stage1)
+    metadata = list(metadata.values())
     metadata = sorted(metadata, key=lambda x: x.key)
+
     # 3. parse the tensor of pytorch weight file
     stage1_key_to_tensor = {}
     content_size = os.stat(path).st_size
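
Switching metadata from a list to a dict keyed by TensorMeta.key deduplicates records without scanning the whole list on every hit, and list(metadata.values()) plus sorted restores the list shape the rest of the code expects. A small self-contained illustration; the Meta dataclass below is a stand-in for the real TensorMeta.

    from dataclasses import dataclass

    @dataclass
    class Meta:
        key: str
        shape: tuple = ()

    records = [Meta("b", (3,)), Meta("a", (2,)), Meta("a", (2,))]

    dedup = {}
    for meta in records:
        dedup[meta.key] = meta  # one entry per key, constant-time lookup

    metadata = sorted(dedup.values(), key=lambda x: x.key)
    assert [m.key for m in metadata] == ["a", "b"]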

requirements-dev.txt
Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-paddlepaddle==2.4.0rc0
+paddlepaddle>=2.4.1
 pre-commit
 pytest
 parameterized
