145 changes: 145 additions & 0 deletions examples/few_shot/prefix-tuning/run_train.py
@@ -0,0 +1,145 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass, field
from functools import partial
from typing import Optional

import paddle
from utils import PromptTrainerForGeneration, compute_metrics

from paddlenlp.datasets import load_dataset
from paddlenlp.prompt import (
PrefixTemplate,
PromptModelForGeneration,
PromptTuningArguments,
)
from paddlenlp.trainer import PdArgumentParser
from paddlenlp.transformers import AutoTokenizer, GPTLMHeadModel
from paddlenlp.utils.log import logger


@dataclass
class DataArguments:
prompt: str = field(
default="{'prefix':'None'}{'text':'text'}{'sep'}{'text':'labels', 'token_type': 1}",
metadata={"help": "Add prompt.'prefix'、'text' variable and 'text':'labels' immutable."},
)
task_name: str = field(default="dureader_qg", metadata={"help": "The name of task."})


@dataclass
class ModelArguments:
model_name_or_path: str = field(
default="gpt-cpm-small-cn-distill",
metadata={"help": "Build-in pretrained model name or the path to local model."},
)
export_type: str = field(default="paddle", metadata={"help": "The type to export. Support `paddle` and `onnx`."})
dropout: float = field(default=0.1, metadata={"help": "The dropout used for pretrained model."})
predict_with_generate: Optional[bool] = field(
default=True,
metadata={"help": ("Whether to generate in predcit.")},
)
num_beams: Optional[int] = field(
default=2,
metadata={"help": ("The number of beams to use in beam search.")},
)
max_target_length: Optional[int] = field(
default=16,
metadata={
"help": (
"The maximum total sequence length for target text after "
"tokenization. Sequences longer than this will be truncated, sequences shorter will be padded."
"during ``evaluate`` and ``predict``."
)
},
)


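# A minimal example launch, assuming the standard trainer flags exposed by
# PromptTuningArguments (the paths and hyperparameter values below are
# illustrative, not prescriptive):
#
#   python run_train.py \
#       --output_dir ./checkpoints \
#       --do_train --do_eval \
#       --model_name_or_path gpt-cpm-small-cn-distill \
#       --max_seq_length 512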
def main():
# Parse the arguments.
parser = PdArgumentParser((ModelArguments, DataArguments, PromptTuningArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses()

training_args.generation_max_length = model_args.max_target_length
training_args.predict_with_generate = model_args.predict_with_generate
training_args.generation_num_beams = model_args.num_beams

training_args.print_config(model_args, "Model")
training_args.print_config(data_args, "Data")
paddle.set_device(training_args.device)

# Load the pretrained language model.
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
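    # Register the pad/sep tokens and the "[Space]" placeholder that the
    # dataset preprocessing and generation below rely on.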
tokenizer.pad_token = "<pad>"
tokenizer.sep_token = "<sep>"
tokenizer.add_tokens("[Space]", special_tokens=True)
model = GPTLMHeadModel.from_pretrained(
model_args.model_name_or_path,
hidden_dropout_prob=model_args.dropout,
attention_probs_dropout_prob=model_args.dropout,
)

    # Define the prefix template used to preprocess examples.
template = PrefixTemplate(data_args.prompt, tokenizer, training_args.max_seq_length, model)
logger.info("Using template: {}".format(template.prompt))

# Load datasets.
train_ds, dev_ds = load_dataset(data_args.task_name, splits=["train", "dev"])

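    # The raw examples carry "title"/"source"/"target" fields; rename them to
    # the "text"/"labels" keys the template expects, schematically:
    # {"title": t, "source": s, "target": y} -> {"text": t + "<sep>" + s, "labels": y}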
def convert_label_keyword(input_dict):
if "text" not in input_dict:
input_dict["text"] = input_dict.pop("title") + tokenizer.sep_token + input_dict.pop("source")
if "labels" not in input_dict:
input_dict["labels"] = input_dict.pop("target")
return input_dict

train_ds.map(convert_label_keyword)
dev_ds.map(convert_label_keyword)

    # Wrap the backbone and template into a prompt model. With `freeze_plm`
    # enabled, the pretrained weights stay fixed and only the prefix
    # parameters are updated during training.
prompt_model = PromptModelForGeneration(
model,
template,
freeze_plm=training_args.freeze_plm,
freeze_dropout=training_args.freeze_dropout,
)

dev_compute_metrics = partial(compute_metrics, tokenizer=tokenizer)
trainer = PromptTrainerForGeneration(
model=prompt_model,
tokenizer=tokenizer,
args=training_args,
train_dataset=train_ds,
eval_dataset=dev_ds,
callbacks=None,
compute_metrics=dev_compute_metrics,
)

    # Training.
if training_args.do_train:
train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
metrics = train_result.metrics
trainer.save_model()
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

if training_args.do_eval:
eval_metrics = trainer.evaluate()
trainer.log_metrics("eval", eval_metrics)


if __name__ == "__main__":
main()
241 changes: 241 additions & 0 deletions examples/few_shot/prefix-tuning/utils.py
@@ -0,0 +1,241 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import paddle
from rouge import Rouge

from paddlenlp.metrics import BLEU
from paddlenlp.prompt import PromptTrainer


# Define the metric function.
def compute_metrics(eval_preds, tokenizer):

all_preds = []
all_labels = []
labels = eval_preds.label_ids
preds = eval_preds.predictions
all_preds.extend(tokenizer.convert_ids_to_string(pred.tolist()) for pred in preds)
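    # Label positions ignored by the loss are stored as -100; map them back to
    # the pad token id so they can be decoded into text.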
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
all_labels.extend(tokenizer.convert_ids_to_string(label.tolist()) for label in labels)

    assert len(all_preds) == len(all_labels), (
        "The number of predictions should equal the number of labels. "
        "But received {} and {}.".format(len(all_preds), len(all_labels))
    )
rouge = Rouge()
bleu4 = BLEU(n_size=4)
scores = []
for pred, target in zip(all_preds, all_labels):
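        # Space-join the characters so the word-level `rouge` package scores
        # Chinese text at character granularity.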
try:
score = rouge.get_scores(" ".join(pred), " ".join(target))
scores.append([score[0]["rouge-1"]["f"], score[0]["rouge-2"]["f"], score[0]["rouge-l"]["f"]])
except ValueError:
scores.append([0, 0, 0])
bleu4.add_inst(pred, [target])
rouge1 = np.mean([i[0] for i in scores])
rouge2 = np.mean([i[1] for i in scores])
rougel = np.mean([i[2] for i in scores])
print("\n" + "*" * 15)
print("The auto evaluation result is:")
print("rouge-1:", round(rouge1, 4))
print("rouge-2:", round(rouge2, 4))
print("rouge-L:", round(rougel, 4))
print("BLEU-4:", round(bleu4.score(), 4))
return {"rougel": rougel}


class PromptTrainerForGeneration(PromptTrainer):
def __init__(
self,
model,
tokenizer,
criterion=None,
args=None,
data_collator=None,
train_dataset=None,
eval_dataset=None,
compute_metrics=None,
callbacks=None,
optimizers=(None, None),
):
super(PromptTrainerForGeneration, self).__init__(
model=model,
criterion=criterion,
args=args,
data_collator=data_collator,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
tokenizer=tokenizer,
compute_metrics=compute_metrics,
callbacks=callbacks,
optimizers=optimizers,
)
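        # Generation tasks have no label-word mapping, so no verbalizer is used.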
self.verbalizer = None

def compute_loss(self, model, inputs, return_outputs=False):
"""
How the loss is computed by Trainer. By default, all models return the loss in the first element.

Subclass and override for custom behavior.
"""
outputs = model(**inputs)

# Save past state if it exists
if self.args.past_index >= 0:
self._past = outputs[self.args.past_index]

        # We don't use .loss here since the model may return tuples instead of ModelOutput.
        loss = outputs[0]

return (loss, outputs) if return_outputs else loss

def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval", **gen_kwargs):
"""
Run evaluation and returns metrics.

The calling script will be responsible for providing a method to compute metrics, as they are task-dependent
(pass it to the init `compute_metrics` argument).

You can also subclass and override this method to inject custom behavior.

Args:
eval_dataset (`Dataset`, *optional*):
Pass a dataset if you wish to override `self.eval_dataset`. If it is an [`~datasets.Dataset`], columns
not accepted by the `model.forward()` method are automatically removed. It must implement the `__len__`
method.
ignore_keys (`List[str]`, *optional*):
A list of keys in the output of your model (if it is a dictionary) that should be ignored when
gathering predictions.
metric_key_prefix (`str`, *optional*, defaults to `"eval"`):
An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named
"eval_bleu" if the prefix is `"eval"` (default)
max_length (`int`, *optional*):
The maximum target length to use when predicting with the generate method.
num_beams (`int`, *optional*):
Number of beams for beam search that will be used when predicting with the generate method. 1 means no
beam search.
gen_kwargs:
Additional `generate` specific kwargs.

Returns:
A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The
dictionary also contains the epoch number which comes from the training state.
"""

gen_kwargs = gen_kwargs.copy()
if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None:
gen_kwargs["max_length"] = self.args.generation_max_length
gen_kwargs["num_beams"] = (
gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.args.generation_num_beams
)
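        # Stash the resolved kwargs so `prediction_step` can forward them to
        # `generate` during this evaluation run.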
self._gen_kwargs = gen_kwargs

return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix)

def prediction_step(
self,
model,
inputs,
prediction_loss_only,
ignore_keys=None,
):
"""
Perform an evaluation step on `model` using `inputs`.

Subclass and override to inject custom behavior.

Args:
model (`nn.Layer`):
The model to evaluate.
inputs (`Dict[str, Union[paddle.Tensor, Any]]`):
The inputs and targets of the model.

The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
argument `labels`. Check your model's documentation for all accepted arguments.
prediction_loss_only (`bool`):
Whether or not to return the loss only.

Return:
Tuple[Optional[float], Optional[paddle.Tensor], Optional[paddle.Tensor]]: A tuple with the loss, logits and
labels (each being optional).
"""
if not self.args.predict_with_generate or prediction_loss_only:
return super().prediction_step(
model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
)

        has_labels = "labels" in inputs
        labels = inputs["labels"] if has_labels else None

gen_kwargs = self._gen_kwargs.copy()

if "attention_mask" in inputs:
gen_kwargs["attention_mask"] = inputs.get("attention_mask", None)
if "global_attention_mask" in inputs:
gen_kwargs["global_attention_mask"] = inputs.get("global_attention_mask", None)

generated_tokens = self.model.generate(
**inputs,
**gen_kwargs,
use_cache=True,
use_fp16_decoding=True,
repetition_penalty=2.0,
)

        # Unlike the Hugging Face API, Paddle's `generate` returns a tuple of
        # two elements: the generated ids and their scores.
if isinstance(generated_tokens, tuple):
generated_tokens = generated_tokens[0]
# in case the batch is shorter than max length, the output should be padded
if gen_kwargs.get("max_length") is not None and generated_tokens.shape[-1] < gen_kwargs["max_length"]:
generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"])
elif gen_kwargs.get("max_new_tokens") is not None and generated_tokens.shape[-1] < (
gen_kwargs["max_new_tokens"] + 1
):
generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_new_tokens"] + 1)

with paddle.no_grad():
if has_labels:
with self.autocast_smart_context_manager():
loss, outputs = self.compute_loss(model, inputs, return_outputs=True)
loss = loss.mean().detach()
else:
loss = None

if self.args.prediction_loss_only:
return (loss, None, None)

return (loss, generated_tokens, labels)

    def _pad_tensors_to_max_len(self, tensor, max_length):
        if self.tokenizer is not None and hasattr(self.tokenizer, "pad_token_id"):
            # If the PAD token is not defined, fall back to the EOS token.
            pad_token_id = (
                self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id
            )
        else:
            raise ValueError("A tokenizer with a `pad_token_id` (or `eos_token_id`) is required to pad tensors.")
        # paddle.ones has no device argument; the tensor is created on the current device.
        padded_tensor = pad_token_id * paddle.ones((tensor.shape[0], max_length), dtype=tensor.dtype)
padded_tensor[:, : tensor.shape[-1]] = tensor
return padded_tensor