
Commit 46e5551

Implement InTokens data stream for ChatGLM (#6701)
* fix styles
* to list
* benchmarks
* ready for PR
1 parent 262b23a commit 46e5551

File tree

7 files changed, +96 -57 lines changed

examples/benchmark/peft/paddle/benchmark.py

Lines changed: 12 additions & 14 deletions
@@ -142,23 +142,21 @@ def preprocess_function_chatglm(example, max_src_length=256, max_tgt_length=384,
     model_inputs["input_ids"] = model_inputs["input_ids"][:-1]
     model_inputs["labels"] = model_inputs["labels"][1:]
 
-    context_length = model_inputs["input_ids"].index(tokenizer.bos_token_id)
-    seq_length = len(model_inputs["input_ids"])
-    position_ids = np.arange(seq_length, dtype=np.int64)
-    block_position_ids = np.concatenate(
-        [
-            np.zeros(context_length, dtype=np.int64),
-            np.arange(1, seq_length - context_length + 1, dtype=np.int64),
-        ]
-    )
-    model_inputs["position_ids"] = np.stack([position_ids, block_position_ids], axis=0)
-    # attention mask
     if intokens:
-        attention_mask = np.ones((seq_length, seq_length))
-        attention_mask = np.tril(attention_mask)
+        context_length = model_inputs["input_ids"].index(tokenizer.bos_token_id)
+        seq_length = len(model_inputs["input_ids"])
+        position_ids = np.arange(seq_length, dtype=np.int64)
+        block_position_ids = np.concatenate(
+            [
+                np.zeros(context_length, dtype=np.int64),
+                np.arange(1, seq_length - context_length + 1, dtype=np.int64),
+            ]
+        )
+        model_inputs["position_ids"] = np.stack([position_ids, block_position_ids], axis=0)
+        attention_mask = np.tri(seq_length, seq_length, dtype=bool)
         attention_mask[:, :context_length] = 1
-        attention_mask = (attention_mask < 0.5).astype("int64")
         model_inputs["attention_mask"] = attention_mask
+
     return model_inputs
 
 def preprocess_function_bloom(example, max_src_length=256, max_tgt_length=384, intokens=False):
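For orientation, here is a minimal standalone sketch (not part of the commit) of the arrays the new `if intokens:` branch builds for ChatGLM; the prompt/response lengths are invented for illustration:

```python
import numpy as np

# Toy sizes (invented): a 4-token prompt followed by a 3-token response.
context_length = 4
seq_length = 7

# ChatGLM's 2D position ids: row 0 counts positions normally, row 1 ("block"
# positions) stays 0 over the prompt and counts 1, 2, ... over the response.
position_ids = np.arange(seq_length, dtype=np.int64)
block_position_ids = np.concatenate(
    [
        np.zeros(context_length, dtype=np.int64),
        np.arange(1, seq_length - context_length + 1, dtype=np.int64),
    ]
)
print(np.stack([position_ids, block_position_ids], axis=0))
# [[0 1 2 3 4 5 6]
#  [0 0 0 0 1 2 3]]

# Prefix-LM attention mask: lower-triangular (causal) overall, but every token
# may attend to the full prompt prefix.
attention_mask = np.tri(seq_length, seq_length, dtype=bool)
attention_mask[:, :context_length] = 1
print(attention_mask.astype(int)[-1])  # last row: the final token sees every position
```

Compared with the removed code, the mask is built directly as a boolean lower-triangular matrix instead of being inverted with `(attention_mask < 0.5)`, and both the mask and the 2D position ids are only materialized when `intokens` is enabled.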

llm/causallm/data.py

Lines changed: 24 additions & 11 deletions
@@ -88,7 +88,7 @@ def convert_example_common(example, tokenizer, data_args, is_test=True, intokens
     input_ids = tokenized_source["input_ids"] + tokenized_target_input_ids
     source_length = len(tokenized_source["input_ids"])
     labels = [-100] * source_length + input_ids[source_length:]
-    # shift labels
+    # shift input_ids and labels
     input_ids, labels = input_ids[:-1], labels[1:]
     features = {
         "input_ids": input_ids,
@@ -97,7 +97,7 @@ def convert_example_common(example, tokenizer, data_args, is_test=True, intokens
     seq_length = len(input_ids)
     if intokens:
         features["position_ids"] = list(range(seq_length))
-        features["attention_mask"] = np.tril(np.ones((seq_length, seq_length), dtype="bool"))
+        features["attention_mask"] = np.tri(seq_length, seq_length, dtype=bool)
 
     return features
 
@@ -115,15 +115,28 @@ def convert_example_chatglm(example, tokenizer, data_args, is_test=True, intoken
     else:
         input_ids = tokenized_source["input_ids"] + tokenized_target_input_ids
     bos_position = len(tokenized_source["input_ids"]) - 1
-
-    attention_mask = np.tri(len(input_ids), len(input_ids))
-    attention_mask[:, :bos_position] = 1
-    attention_mask = attention_mask[None, :, :]
-
     labels = [-100] * bos_position + input_ids[bos_position:]
-
-    # shift labels
+    # shift input_ids and labels
     input_ids, labels = input_ids[:-1], labels[1:]
-    attention_mask = attention_mask[..., :-1, :-1]
+    features = {
+        "input_ids": input_ids,
+        "labels": labels,
+    }
 
-    return dict(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
+    if intokens:
+        seq_length = len(input_ids)
+        # attention_mask
+        attention_mask = np.tri(seq_length, seq_length, dtype=bool)
+        attention_mask[:, :bos_position] = 1
+        features["attention_mask"] = attention_mask
+        # 2d position_ids
+        position_ids = np.arange(seq_length, dtype=np.int64)
+        block_position_ids = np.concatenate(
+            [
+                np.zeros(bos_position, dtype=np.int64),
+                np.arange(1, seq_length - bos_position + 1, dtype=np.int64),
+            ]
+        )
+        features["position_ids"] = np.stack([position_ids, block_position_ids], axis=0)
+
+    return features
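A quick standalone check (not part of the commit) on the mask construction used above: `np.tri(n, n, dtype=bool)` yields the same lower-triangular boolean matrix as the older `np.tril(np.ones(...))` pattern, without an intermediate dense float array; note that `np.tri` takes the two dimensions as separate integers, not a shape tuple. Opening up the prompt columns then turns it into the ChatGLM prefix mask:

```python
import numpy as np

n = 5
causal_a = np.tri(n, n, dtype=bool)                # one call, boolean output
causal_b = np.tril(np.ones((n, n), dtype=bool))    # older two-step pattern
assert (causal_a == causal_b).all()

# Prefix-LM variant for a (toy) 2-token prompt: every token sees the prompt.
bos_position = 2
prefix_mask = causal_a.copy()
prefix_mask[:, :bos_position] = True
print(prefix_mask.astype(int))
```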

llm/causallm/finetune_generation.py

Lines changed: 2 additions & 2 deletions
@@ -118,8 +118,8 @@ def main():
     train_ds, dev_ds = load_dataset(data_args.dataset_name_or_path, splits=["train", "dev"])
     trans_func = partial(get_convert_example(model), tokenizer=tokenizer, data_args=data_args)
     if data_args.intokens:
-        if model.base_model_prefix not in ["llama", "bloom"]:
-            raise NotImplementedError("InTokens data stream is only implemented for LLaMA Bloom so far.")
+        if model.base_model_prefix not in ["llama", "bloom", "chatglm"]:
+            raise NotImplementedError("InTokens data stream is only implemented for LLaMA, Bloom and ChatGLM so far.")
     train_ds = train_ds.map(partial(trans_func, is_test=False, intokens=data_args.intokens))
     eval_intokens = data_args.intokens
     if data_args.intokens and data_args.eval_with_do_generation:

paddlenlp/datasets/intokens_dataset.py

Lines changed: 5 additions & 6 deletions
@@ -46,17 +46,16 @@ def _pad_batch_records(cls, batch_records):
             # If attention_mask is not given, assume it's causal mask
             attention_mask = record.get("attention_mask", np.tril(np.ones([seq_length, seq_length], dtype=bool)))
             batched_features["attention_mask"].append(attention_mask)
-            # TODO: to adapt to chatglm position_2d
             # NOTE: position_ids is optional and not required by every model
+            # We append instead of extend here to accommodate 2D position ids
            if "position_ids" in record:
-                batched_features["position_ids"].extend(record["position_ids"])
+                batched_features["position_ids"].append(record["position_ids"])
         block_attention_mask = block_diag(*batched_features["attention_mask"])
         # convert to 3-D [batch_size(1), seq_length, seq_length]
         batched_features["attention_mask"] = np.expand_dims(block_attention_mask, axis=0)
-        # batched_features["input_ids"] = np.array(batched_features["input_ids"], dtype=np.int64)
-        # batched_features["labels"] = np.array(batched_features["labels"], dtype=np.int64)
-        # if "position_ids" in record:
-        #     batched_features["position_ids"] = np.array(batched_features["position_ids"], dtype=np.int64)
+        if "position_ids" in batched_features:
+            # Accommodate both 1D and 2D position ids
+            batched_features["position_ids"] = np.concatenate(batched_features["position_ids"], axis=-1).tolist()
         return batched_features
 
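To see why `_pad_batch_records` now appends per-record position ids and concatenates them on the last axis, here is a minimal sketch with invented shapes (assuming the `block_diag` called above is `scipy.linalg.block_diag`):

```python
import numpy as np
from scipy.linalg import block_diag  # assumption: the block_diag used in the file

# Two toy records: a 3-token example and a 4-token example.
masks = [np.tril(np.ones([3, 3], dtype=bool)), np.tril(np.ones([4, 4], dtype=bool))]
packed = np.expand_dims(block_diag(*masks), axis=0)
print(packed.shape)  # (1, 7, 7): tokens of one record cannot attend to the other

# 1D position ids: append + concatenate(axis=-1) reproduces the old extend().
pos_1d = [np.arange(3), np.arange(4)]
print(np.concatenate(pos_1d, axis=-1).tolist())  # [0, 1, 2, 0, 1, 2, 3]

# 2D ChatGLM position ids: the same call keeps the leading (2, ...) rows and
# joins along the sequence axis, which a plain list extend() could not do.
pos_2d = [np.zeros((2, 3), dtype=np.int64), np.ones((2, 4), dtype=np.int64)]
print(np.concatenate(pos_2d, axis=-1).shape)  # (2, 7)
```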

paddlenlp/transformers/chatglm/modeling.py

Lines changed: 2 additions & 2 deletions
@@ -581,8 +581,8 @@ class ChatGLMPretrainedModel(PretrainedModel):
     model_config_file = CONFIG_NAME
     resource_files_names = {"model_state": "model_state.pdparams"}
     pretrained_resource_files_map = CHATGLM_PRETRAINED_RESOURCE_FILES_MAP
-    _keys_to_ignore_on_load_missing = [r"transformer.layers.*.attention.rotary_embeddings.inv_freq"]
-    _keys_to_ignore_on_load_unexpected = [r"transformer.layers.*.attention.rotary_emb.inv_freq"]
+    _keys_to_ignore_on_load_missing = [r"transformer.rotary_embeddings.inv_freq", r"lm_head.decoder_weight"]
+    _keys_to_ignore_on_load_unexpected = [r"transformer.rotary_emb.inv_freq"]
 
     def init_weights(self, layer):
         """Initialization hook"""

paddlenlp/transformers/conversion_utils.py

Lines changed: 2 additions & 2 deletions
@@ -1025,7 +1025,7 @@ def convert_tensor_parallel(
         if state_dict is None:
             with device_guard("cpu"):
                 state_dict = paddle.load(weight_file, return_numpy=False)
-        logger.info("starting convert orignal state_dict to tensor parallel state_dict.")
+        logger.info("Starting to convert original state_dict to tensor parallel state_dict.")
 
         state_keys_map = cls._resolve_prefix_keys(name_action_mappings.keys(), state_dict.keys(), ignore_error)
 
@@ -1035,7 +1035,7 @@ def convert_tensor_parallel(
         for name, action in name_action_mappings.items():
             if name not in state_dict:
                 if not ignore_error:
-                    logger.warning(f"key<{name}> not in the model state weight file.")
+                    logger.warning(f"Key <{name}> not in the model state weight file.")
                 continue
             tensor = state_dict.pop(name)
             new_tensor = action(tensor)

tests/dataset/test_intokens.py

Lines changed: 49 additions & 20 deletions
@@ -36,7 +36,7 @@ class InTokensTestCommon:
     expected_output = {
         "input_ids": [1, 29871, 30429, 1, 29871, 30429, 2, 1, 29871, 31427, 1, 29871, 31427, 2],
         "labels": [-100, -100, -100, 1, 29871, 30429, 2, -100, -100, -100, 1, 29871, 31427, 2],
-        "position_ids": [0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6],
+        "position_ids": np.array([0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6]),
         "attention_mask": np.array(
             [
                 [
@@ -57,25 +57,34 @@ class InTokensTestCommon:
                 ]
             ]
         ),
+        "position_ids_2d": [[0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6], [0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6]],
     }
 
-    def preprocess_fn(self, example, max_src_length=3, max_tgt_length=3):
+    def preprocess_fn(
+        self,
+        example,
+        max_src_length=3,
+        max_tgt_length=3,
+        return_position_ids=True,
+        position_ids_2d=False,
+        return_attention_mask=True,
+    ):
         inputs = example["sentence"][:2]
         model_inputs = self.tokenizer(inputs, max_length=max_src_length, truncation=True, return_attention_mask=False)
         labels_input_ids = model_inputs["input_ids"] + [self.tokenizer.eos_token_id]
         model_inputs["labels"] = [-100] * len(model_inputs["input_ids"]) + labels_input_ids
         model_inputs["input_ids"] = model_inputs["input_ids"] + labels_input_ids
         seq_length = len(model_inputs["input_ids"])
-        model_inputs["position_ids"] = list(range(seq_length))
-        model_inputs["attention_mask"] = np.tril(np.ones([seq_length, seq_length]))
-        return model_inputs
-
-    def preprocess_fn_input_labels_only(self, example, max_src_length=3, max_tgt_length=3):
-        inputs = example["sentence"][:2]
-        model_inputs = self.tokenizer(inputs, max_length=max_src_length, truncation=True, return_attention_mask=False)
-        labels_input_ids = model_inputs["input_ids"] + [self.tokenizer.eos_token_id]
-        model_inputs["labels"] = [-100] * len(model_inputs["input_ids"]) + labels_input_ids
-        model_inputs["input_ids"] = model_inputs["input_ids"] + labels_input_ids
+        if return_position_ids:
+            if position_ids_2d:
+                position_ids = np.arange(seq_length, dtype=np.int64)
+                # fake block_position_ids with wrong values but correct shape
+                block_position_ids = np.arange(seq_length, dtype=np.int64)
+                model_inputs["position_ids"] = np.stack([position_ids, block_position_ids], axis=0)
+            else:
+                model_inputs["position_ids"] = list(range(seq_length))
+        if return_attention_mask:
+            model_inputs["attention_mask"] = np.tril(np.ones([seq_length, seq_length]))
         return model_inputs
 
 
@@ -89,10 +98,14 @@ def setUpClass(cls):
             data_files=[os.path.join(fixture_path, "tnews", "train.json")],
             lazy=False,
         )
-        copy_train_ids = copy.deepcopy(cls.train_ds)
+        copy_dataset_1 = copy.deepcopy(cls.train_ds)
+        copy_dataset_2 = copy.deepcopy(cls.train_ds)
         cls.dataset = cls.train_ds.map(lambda example: cls.preprocess_fn(cls, example))
-        cls.dataset_input_labels_only = copy_train_ids.map(
-            lambda example: cls.preprocess_fn_input_labels_only(cls, example)
+        cls.dataset_position_2d = copy_dataset_1.map(
+            lambda example: cls.preprocess_fn(cls, example, position_ids_2d=True)
+        )
+        cls.dataset_input_labels_only = copy_dataset_2.map(
+            lambda example: cls.preprocess_fn(cls, example, return_position_ids=False, return_attention_mask=False)
         )
 
     def test_long_max_length(self):
@@ -111,8 +124,8 @@ def test_long_max_length(self):
     def test_short_max_length(self):
         inData = InTokensMapDataset(self.dataset, self.tokenizer, max_length=16)
         self.assertEqual(inData[0]["input_ids"], self.expected_output["input_ids"])
-        self.assertEqual(inData[0]["position_ids"], self.expected_output["position_ids"])
         self.assertEqual(inData[0]["labels"], self.expected_output["labels"])
+        self.assertTrue((inData[0]["position_ids"] == self.expected_output["position_ids"]).all())
         self.assertTrue((inData[0]["attention_mask"] == self.expected_output["attention_mask"]).all())
 
         inData_input_labels_only = InTokensMapDataset(self.dataset_input_labels_only, self.tokenizer, max_length=16)
@@ -122,6 +135,10 @@ def test_short_max_length(self):
             (inData_input_labels_only[0]["attention_mask"] == self.expected_output["attention_mask"]).all()
         )
 
+    def test_2d_position_id(self):
+        inData_2d = InTokensMapDataset(self.dataset_position_2d, self.tokenizer, max_length=16)
+        self.assertTrue((inData_2d[0]["position_ids"] == self.expected_output["position_ids_2d"]).all())
+
     def test_missing_data(self):
         orginal_input_ids = [item["input_ids"] for item in self.dataset]
         orginal_input_ids = [sum(orginal_input_ids, [])]
@@ -138,10 +155,14 @@ def setUpClass(cls):
         cls.train_ds = load_dataset(
             read_local_dataset, path=os.path.join(fixture_path, "tnews", "train.json"), lazy=True
        )
-        copy_train_ids = copy.deepcopy(cls.train_ds)
+        copy_dataset_1 = copy.deepcopy(cls.train_ds)
+        copy_dataset_2 = copy.deepcopy(cls.train_ds)
         cls.dataset = cls.train_ds.map(lambda example: cls.preprocess_fn(cls, example))
-        cls.dataset_input_labels_only = copy_train_ids.map(
-            lambda example: cls.preprocess_fn_input_labels_only(cls, example)
+        cls.dataset_position_2d = copy_dataset_1.map(
+            lambda example: cls.preprocess_fn(cls, example, position_ids_2d=True)
+        )
+        cls.dataset_input_labels_only = copy_dataset_2.map(
+            lambda example: cls.preprocess_fn(cls, example, return_position_ids=False, return_attention_mask=False)
        )
 
     def test_long_max_length(self):
@@ -174,8 +195,8 @@ def test_short_max_length(self):
             example.append(item)
             break
         self.assertEqual(example[0]["input_ids"], self.expected_output["input_ids"])
-        self.assertEqual(example[0]["position_ids"], self.expected_output["position_ids"])
         self.assertEqual(example[0]["labels"], self.expected_output["labels"])
+        self.assertTrue((example[0]["position_ids"] == self.expected_output["position_ids"]).all())
         self.assertTrue((example[0]["attention_mask"] == self.expected_output["attention_mask"]).all())
 
         inData_input_labels_only = InTokensIterableDataset(
@@ -189,6 +210,14 @@ def test_short_max_length(self):
         self.assertEqual(example[0]["labels"], self.expected_output["labels"])
         self.assertTrue((example[0]["attention_mask"] == self.expected_output["attention_mask"]).all())
 
+    def test_2d_position_id(self):
+        inData_2d = InTokensIterableDataset(self.dataset_position_2d, self.tokenizer, max_length=16)
+        example = []
+        for item in inData_2d:
+            example.append(item)
+            break
+        self.assertTrue((example[0]["position_ids"] == self.expected_output["position_ids_2d"]).all())
+
     def test_missing_data(self):
         orginal_input_ids = [item["input_ids"] for item in self.dataset]
         orginal_input_ids = [sum(orginal_input_ids, [])]
