From f55c569a441ff9f75e5ace07a2b87c7d16f1f31f Mon Sep 17 00:00:00 2001 From: tsinghua-zhang Date: Thu, 6 Apr 2023 10:13:51 +0800 Subject: [PATCH 1/5] prefix-tuning --- .../prefix-tuning/prompt/lcsts_new.json | 5 + examples/few_shot/prefix-tuning/run_train.py | 176 ++++++++++++ examples/few_shot/prefix-tuning/utils.py | 265 ++++++++++++++++++ paddlenlp/prompt/prompt_model.py | 130 ++++++++- paddlenlp/prompt/prompt_tokenizer.py | 2 +- paddlenlp/prompt/prompt_utils.py | 10 + paddlenlp/prompt/template.py | 2 + paddlenlp/transformers/gpt/tokenizer.py | 8 +- 8 files changed, 593 insertions(+), 5 deletions(-) create mode 100644 examples/few_shot/prefix-tuning/prompt/lcsts_new.json create mode 100644 examples/few_shot/prefix-tuning/run_train.py create mode 100644 examples/few_shot/prefix-tuning/utils.py diff --git a/examples/few_shot/prefix-tuning/prompt/lcsts_new.json b/examples/few_shot/prefix-tuning/prompt/lcsts_new.json new file mode 100644 index 000000000000..af3fdb637ca7 --- /dev/null +++ b/examples/few_shot/prefix-tuning/prompt/lcsts_new.json @@ -0,0 +1,5 @@ +{ + "template": [ + {"text": "{'prefix':'文本摘要'}{'text':'source'}{'sep'}{'text':'target', 'token_type': 1}"} + ] +} diff --git a/examples/few_shot/prefix-tuning/run_train.py b/examples/few_shot/prefix-tuning/run_train.py new file mode 100644 index 000000000000..88ad394adff9 --- /dev/null +++ b/examples/few_shot/prefix-tuning/run_train.py @@ -0,0 +1,176 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from dataclasses import dataclass, field +from functools import partial +from typing import Optional + +import paddle +from paddle.static import InputSpec +from utils import compute_metrics, load_prompt_arguments, new_PromptTrainer + +from paddlenlp.datasets import load_dataset +from paddlenlp.prompt import ( + PrefixTemplate, + PromptModelForGeneration, + PromptTuningArguments, +) +from paddlenlp.trainer import PdArgumentParser +from paddlenlp.transformers import AutoTokenizer, GPTLMHeadModel +from paddlenlp.utils.log import logger + + +@dataclass +class DataArguments: + prompt_path: str = field(default="prompt/eprstmt.json", metadata={"help": "Path to the defined prompts."}) + prompt_index: int = field(default=0, metadata={"help": "The index of defined prompt for training."}) + augment_type: str = field( + default=None, + metadata={ + "help": "The strategy used for data augmentation, including `swap`, `delete`, `insert`, `subsitute`." + }, + ) + num_augment: str = field( + default=5, metadata={"help": "Number of augmented data per example, which works when `augment_type` is set."} + ) + word_augment_percent: str = field( + default=0.1, + metadata={ + "help": "Percentage of augmented words in sequences, used for `swap`, `delete`, `insert`, `subsitute`." 
+ }, + ) + augment_method: str = field(default="mlm", metadata={"help": "Strategy used for `insert` and `subsitute`."}) + do_label: bool = field( + default=False, metadata={"help": "Whether to label unsupervised data in unlabeled datasets"} + ) + do_test: bool = field(default=False, metadata={"help": "Whether to evaluate model on public test datasets."}) + + +@dataclass +class ModelArguments: + model_name_or_path: str = field( + default="gpt-cpm-small-cn-distill", + metadata={"help": "Build-in pretrained model name or the path to local model."}, + ) + export_type: str = field(default="paddle", metadata={"help": "The type to export. Support `paddle` and `onnx`."}) + dropout: float = field(default=0.1, metadata={"help": "The dropout used for pretrained model."}) + predict_with_generate: Optional[bool] = field( + default=True, + metadata={"help": ("Whether to generate in predcit.")}, + ) + num_beams: Optional[int] = field( + default=1, + metadata={"help": ("The number of beams to use in beam search.")}, + ) + max_target_length: Optional[int] = field( + default=64, + metadata={ + "help": ( + "The maximum total sequence length for target text after " + "tokenization. Sequences longer than this will be truncated, sequences shorter will be padded." + "during ``evaluate`` and ``predict``." + ) + }, + ) + + +def main(): + # Parse the arguments. + parser = PdArgumentParser((ModelArguments, DataArguments, PromptTuningArguments)) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + data_args = load_prompt_arguments(data_args) + + training_args.generation_max_length = model_args.max_target_length + training_args.predict_with_generate = model_args.predict_with_generate + training_args.generation_num_beams = model_args.num_beams + + training_args.print_config(model_args, "Model") + training_args.print_config(data_args, "Data") + paddle.set_device(training_args.device) + + # Load the pretrained language model. + tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) + tokenizer.pad_token = "" + tokenizer.sep_token = "" + tokenizer.add_tokens("[Space]", special_tokens=True) + model = GPTLMHeadModel.from_pretrained( + model_args.model_name_or_path, + hidden_dropout_prob=model_args.dropout, + attention_probs_dropout_prob=model_args.dropout, + ) + + # Define template for preprocess and verbalizer for postprocess. + template = PrefixTemplate(data_args.prompt, tokenizer, training_args.max_seq_length, model) + logger.info("Using template: {}".format(template.prompt)) + + # Load datasets. + train_ds, dev_ds = load_dataset("lcsts_new") + dev_ds_label = dev_ds.map(lambda x: {x["target"]}, dev_ds) + train_ds, dev_ds = load_dataset("lcsts_new") + + # Initialize the prompt model with the above variables. + prompt_model = PromptModelForGeneration( + model, + template, + verbalizer=None, + freeze_plm=training_args.freeze_plm, + freeze_dropout=training_args.freeze_dropout, + max_predict_len=training_args.generation_max_length, + ) + + dev_compute_metrics = partial(compute_metrics, tokenizer=tokenizer, labels=dev_ds_label) + trainer = new_PromptTrainer( + model=prompt_model, + tokenizer=tokenizer, + args=training_args, + train_dataset=train_ds, + eval_dataset=dev_ds, + callbacks=None, + compute_metrics=dev_compute_metrics, + ) + + # Traininig. 
+ if training_args.do_train: + train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) + metrics = train_result.metrics + trainer.save_model() + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + if training_args.do_eval: + eval_metrics = trainer.evaluate() + trainer.log_metrics("eval", eval_metrics) + + # Export static model. + if training_args.do_export: + template = prompt_model.template + template_keywords = template.extract_template_keywords(template.prompt) + input_spec = [ + InputSpec(shape=[None, None], dtype="int64"), # input_ids, + InputSpec(shape=[None, None], dtype="int64"), # token_type_ids + InputSpec(shape=[None, None], dtype="int64"), # position_ids + InputSpec(shape=[None, None, None, None], dtype="float32"), # attention_mask + InputSpec(shape=[None], dtype="int64"), # masked_positions + InputSpec(shape=[None, None], dtype="int64"), # soft_token_ids + ] + if "encoder" in template_keywords: + input_spec.append(InputSpec(shape=[None, None], dtype="int64")) # encoder_ids + export_path = os.path.join(training_args.output_dir, "export") + trainer.export_model(export_path, input_spec=input_spec, export_type=model_args.export_type) + + +if __name__ == "__main__": + main() diff --git a/examples/few_shot/prefix-tuning/utils.py b/examples/few_shot/prefix-tuning/utils.py new file mode 100644 index 000000000000..ae69d5adb20b --- /dev/null +++ b/examples/few_shot/prefix-tuning/utils.py @@ -0,0 +1,265 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import json + +import numpy as np +import paddle +from rouge import Rouge + +from paddlenlp.metrics import BLEU +from paddlenlp.prompt import PromptTrainer + + +def load_prompt_arguments(args): + """ + Load prompt and label words according to prompt index. + """ + with open(args.prompt_path, "r", encoding="utf-8") as fp: + configs = json.load(fp) + args.prompt = configs["template"][args.prompt_index]["text"] + return args + + +# Define the metric function. +def compute_metrics(eval_preds, tokenizer, labels): + + all_preds = [] + all_labels = [] + labels = eval_preds.label_ids + preds = eval_preds.predictions + all_preds.extend(tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=False)) + labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + all_labels.extend(tokenizer.batch_decode(labels, skip_special_tokens=True, clean_up_tokenization_spaces=False)) + + assert len(all_preds) == len(all_labels), ( + "The length of pred_responses should be equal to the length of " + "target_responses. 
But received {} and {}.".format(len(all_preds), len(all_labels)) + ) + rouge = Rouge() + bleu4 = BLEU(n_size=4) + scores = [] + for pred, target in zip(all_preds, all_labels): + try: + score = rouge.get_scores(" ".join(pred), " ".join(target)) + scores.append([score[0]["rouge-1"]["f"], score[0]["rouge-2"]["f"], score[0]["rouge-l"]["f"]]) + except ValueError: + scores.append([0, 0, 0]) + bleu4.add_inst(pred, [target]) + rouge1 = np.mean([i[0] for i in scores]) + rouge2 = np.mean([i[1] for i in scores]) + rougel = np.mean([i[2] for i in scores]) + print("\n" + "*" * 15) + print("The auto evaluation result is:") + print("rouge-1:", round(rouge1, 4)) + print("rouge-2:", round(rouge2, 4)) + print("rouge-L:", round(rougel, 4)) + print("BLEU-4:", round(bleu4.score(), 4)) + return {"rougel": rougel} + + +class new_PromptTrainer(PromptTrainer): + def __init__( + self, + model, + tokenizer, + criterion=None, + args=None, + data_collator=None, + train_dataset=None, + eval_dataset=None, + compute_metrics=None, + callbacks=None, + optimizers=(None, None), + ): + super(new_PromptTrainer, self).__init__( + model=model, + criterion=criterion, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + tokenizer=tokenizer, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers, + ) + + def compute_loss(self, model, inputs, return_outputs=False): + """ + How the loss is computed by Trainer. By default, all models return the loss in the first element. + + Subclass and override for custom behavior. + """ + outputs = model(**inputs) + + # Save past state if it exists + if self.args.past_index >= 0: + self._past = outputs[self.args.past_index] + + # print(outputs[0]) + # We don't use .loss here since the model may return tuples instead of ModelOutput. + # print(outputs[0], outputs.loss) + # URGENT + # print('compute_loss', outputs[0]) + loss = outputs[0] + + return (loss, outputs) if return_outputs else loss + + def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval", **gen_kwargs): + """ + Run evaluation and returns metrics. + + The calling script will be responsible for providing a method to compute metrics, as they are task-dependent + (pass it to the init `compute_metrics` argument). + + You can also subclass and override this method to inject custom behavior. + + Args: + eval_dataset (`Dataset`, *optional*): + Pass a dataset if you wish to override `self.eval_dataset`. If it is an [`~datasets.Dataset`], columns + not accepted by the `model.forward()` method are automatically removed. It must implement the `__len__` + method. + ignore_keys (`List[str]`, *optional*): + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions. + metric_key_prefix (`str`, *optional*, defaults to `"eval"`): + An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named + "eval_bleu" if the prefix is `"eval"` (default) + max_length (`int`, *optional*): + The maximum target length to use when predicting with the generate method. + num_beams (`int`, *optional*): + Number of beams for beam search that will be used when predicting with the generate method. 1 means no + beam search. + gen_kwargs: + Additional `generate` specific kwargs. + + Returns: + A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The + dictionary also contains the epoch number which comes from the training state. 
+ """ + + gen_kwargs = gen_kwargs.copy() + if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: + gen_kwargs["max_length"] = self.args.generation_max_length + gen_kwargs["num_beams"] = ( + gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.args.generation_num_beams + ) + self._gen_kwargs = gen_kwargs + + return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) + + def prediction_step( + self, + model, + inputs, + prediction_loss_only, + ignore_keys=None, + ): + """ + Perform an evaluation step on `model` using `inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (`nn.Layer`): + The model to evaluate. + inputs (`Dict[str, Union[paddle.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument `labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (`bool`): + Whether or not to return the loss only. + + Return: + Tuple[Optional[float], Optional[paddle.Tensor], Optional[paddle.Tensor]]: A tuple with the loss, logits and + labels (each being optional). + """ + if not self.args.predict_with_generate or prediction_loss_only: + return super().prediction_step( + model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys + ) + + has_labels = "labels" in inputs + inputs = self._prepare_inputs(inputs) + + gen_kwargs = self._gen_kwargs.copy() + if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: + gen_kwargs["max_length"] = self.model.config.max_length + gen_kwargs["num_beams"] = ( + gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.model.config.num_beams + ) + + if "attention_mask" in inputs: + gen_kwargs["attention_mask"] = inputs.get("attention_mask", None) + if "global_attention_mask" in inputs: + gen_kwargs["global_attention_mask"] = inputs.get("global_attention_mask", None) + + generated_tokens = self.model.generate( + **inputs, + **gen_kwargs, + use_cache=True, + use_fp16_decoding=True, + ) + + # different from hf returns: tuple[Tensor]: It is a tuple contains two elements: ids and scores. 
+ if isinstance(generated_tokens, tuple): + generated_tokens = generated_tokens[0] + # in case the batch is shorter than max length, the output should be padded + if gen_kwargs.get("max_length") is not None and generated_tokens.shape[-1] < gen_kwargs["max_length"]: + generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"]) + elif gen_kwargs.get("max_new_tokens") is not None and generated_tokens.shape[-1] < ( + gen_kwargs["max_new_tokens"] + 1 + ): + generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_new_tokens"] + 1) + + with paddle.no_grad(): + if has_labels: + with self.autocast_smart_context_manager(): + loss, outputs = self.compute_loss(model, inputs, return_outputs=True) + loss = loss.mean().detach() + else: + loss = None + + if self.args.prediction_loss_only: + return (loss, None, None) + + if has_labels: + labels = inputs["labels"] + if gen_kwargs.get("max_length") is not None and labels.shape[-1] < gen_kwargs["max_length"]: + labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"]) + elif gen_kwargs.get("max_new_tokens") is not None and labels.shape[-1] < ( + gen_kwargs["max_new_tokens"] + 1 + ): + labels = self._pad_tensors_to_max_len(labels, (gen_kwargs["max_new_tokens"] + 1)) + else: + labels = None + + return (loss, generated_tokens, labels) + + def _pad_tensors_to_max_len(self, tensor, max_length): + if self.tokenizer is not None and hasattr(self.tokenizer, "pad_token_id"): + # If PAD token is not defined at least EOS token has to be defined + pad_token_id = ( + self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id + ) + else: + if self.model.config.pad_token_id is not None: + pad_token_id = self.model.config.pad_token_id + else: + raise ValueError("Pad_token_id must be set in the configuration of the model, in order to pad tensors") + # paddle.ones need to support device args. + padded_tensor = pad_token_id * paddle.ones((tensor.shape[0], max_length), dtype=tensor.dtype) + padded_tensor[:, : tensor.shape[-1]] = tensor + return padded_tensor diff --git a/paddlenlp/prompt/prompt_model.py b/paddlenlp/prompt/prompt_model.py index 496662d5c7f6..a52e6eb0b485 100644 --- a/paddlenlp/prompt/prompt_model.py +++ b/paddlenlp/prompt/prompt_model.py @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. - from typing import Any, Dict, Optional import paddle from paddle.static import InputSpec +from ..transformers.generation_utils import GenerationMixin from ..transformers.model_outputs import ( MaskedLMOutput, MultipleChoiceModelOutput, @@ -160,3 +160,131 @@ def get_input_spec(self): if "encoder" in template_keywords: input_spec.append(InputSpec(shape=[None, None], dtype="int64", name="encoder_ids")) return input_spec + + +class PromptModelForGeneration(paddle.nn.Layer, GenerationMixin): + """ + PromptModel for classification tasks. 
+ """ + + def __init__( + self, + model: paddle.nn.Layer, + template: Template, + verbalizer: Optional[Verbalizer] = None, + freeze_plm: bool = False, + freeze_dropout: bool = False, + max_predict_len: int = 32, + ): + super(PromptModelForGeneration, self).__init__() + self.plm = model + self.template = template + self.verbalizer = verbalizer + self.freeze_plm = freeze_plm + self.freeze_dropout = freeze_dropout + if self.freeze_plm: + for param in self.plm.parameters(): + param.stop_gradient = True + if self.freeze_dropout: + self.plm.eval() + self.forward_keys = signature(self.plm.forward) + self._mask_token_id = self.template.tokenizer.mask_token_id + self._pad_token_id = self.template.tokenizer.pad_token_id + if isinstance(self.template, PrefixTemplate): + self.plm = self.template.process_model(self.plm) + self.forward_keys.append("past_key_values") + self.max_predict_len = paddle.to_tensor(max_predict_len, dtype="int32") + + def forward( + self, + input_ids: paddle.Tensor, + token_type_ids: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + masked_positions: Optional[paddle.Tensor] = None, + soft_token_ids: Optional[paddle.Tensor] = None, + encoder_ids: Optional[paddle.Tensor] = None, + labels: Optional[paddle.Tensor] = None, + return_dict: Optional[bool] = None, + **kwargs: Dict[str, Any] + ): + + return_dict = return_dict if return_dict is not None else False + if soft_token_ids is None: + outputs = self.plm(input_ids) + return outputs + + return_hidden_states = kwargs.get("return_hidden_states", False) + input_dict = { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "position_ids": position_ids, + "masked_positions": masked_positions, + "soft_token_ids": soft_token_ids, + "attention_mask": attention_mask, + "encoder_ids": encoder_ids, + "labels": labels, + **kwargs, + } + input_dict = self.template.process_batch(input_dict) + input_dict = {**input_dict, **kwargs} + model_inputs = {k: input_dict[k] for k in input_dict if k in self.forward_keys} + if "masked_positions" in model_inputs: + model_inputs.pop("masked_positions") + if "cache" in self.forward_keys: + model_inputs["cache"] = [] + for i in range(len(model_inputs["past_key_values"])): + from paddlenlp.transformers.gpt.modeling import MultiHeadAttention + + model_inputs["cache"].append( + MultiHeadAttention.Cache( + k=model_inputs["past_key_values"][i][0], v=model_inputs["past_key_values"][i][1] + ) + ) + model_inputs.pop("past_key_values") + model_outputs = self.plm(**model_inputs, return_dict=True, use_cache=True) + + logits = model_outputs.logits + shift_logits = logits[..., :-1, :] + mask = input_dict["token_type_ids"] == 1 + labels = labels * mask + shift_labels = labels[..., 1:] + loss_fct = paddle.nn.CrossEntropyLoss(ignore_index=0, reduction="sum") + loss = loss_fct(shift_logits.reshape((-1, shift_logits.shape[-1])), shift_labels.reshape((-1,))) + num = int(paddle.count_nonzero(shift_labels.reshape((-1,)))) + loss = loss / num + + if not return_dict: + output = (logits,) + if return_hidden_states: + output = output + (model_outputs.logits,) + if loss is not None: + return (loss,) + output + if isinstance(output, (list, tuple)) and len(output) == 1: + output = output[0] + return output + + return MaskedLMOutput( + loss=loss, + logits=logits, + hidden_states=model_outputs.logits, + ) + + def get_input_spec(self): + template_keywords = self.template.extract_template_keywords(self.template.prompt) + input_spec = [ + 
InputSpec(shape=[None, None], dtype="int64", name="input_ids"), + InputSpec(shape=[None, None], dtype="int64", name="token_type_ids"), + InputSpec(shape=[None, None], dtype="int64", name="position_ids"), + InputSpec(shape=[None, None, None, None], dtype="float32", name="attention_mask"), + ] + if "mask" in template_keywords: + input_spec.append(InputSpec(shape=[None], dtype="int64", name="masked_positions")) + if "soft" in template_keywords: + # Add placeholder for argument `masked_positions` if not exists. + if "mask" not in template_keywords: + input_spec.append(None) + input_spec.append(InputSpec(shape=[None, None], dtype="int64", name="soft_token_ids")) + if "encoder" in template_keywords: + input_spec.append(InputSpec(shape=[None, None], dtype="int64", name="encoder_ids")) + return input_spec diff --git a/paddlenlp/prompt/prompt_tokenizer.py b/paddlenlp/prompt/prompt_tokenizer.py index 8e41162c5ab6..1027c30b99d3 100644 --- a/paddlenlp/prompt/prompt_tokenizer.py +++ b/paddlenlp/prompt/prompt_tokenizer.py @@ -127,7 +127,7 @@ def _create_max_lengths_from_do_truncate(self, part_text: List[str], part_do_tru Create the max sequence length of each part, where the longest part is truncated first. """ text_length = sum([len(x) for x in part_text]) - num_special_token = self.tokenizer.num_special_tokens_to_add() + num_special_token = self.tokenizer.num_special_tokens_to_add(pair=False) max_length = self.max_length - num_special_token if text_length <= max_length: return [None] * len(part_text) diff --git a/paddlenlp/prompt/prompt_utils.py b/paddlenlp/prompt/prompt_utils.py index d230fbf1ab41..0f6e51c3d5e2 100644 --- a/paddlenlp/prompt/prompt_utils.py +++ b/paddlenlp/prompt/prompt_utils.py @@ -86,6 +86,16 @@ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]: for key in features[0]: if key not in self.default_model_input_names: values = [b[key] for b in features if key in b] + if key == "target": + batch_size, _ = batch["soft_token_ids"].shape + soft_token_ids = paddle.masked_select(batch["soft_token_ids"], batch["soft_token_ids"] > 0) + soft_token_ids = soft_token_ids.reshape([batch_size, -1]) + _, soft_len = soft_token_ids.shape + input_ids = paddle.concat( + [batch["input_ids"][:, 0].unsqueeze(1), batch["input_ids"][:, soft_len + 1 :]], axis=1 + ) + batch["labels"] = input_ids + continue if len(values) < len(features): continue if key == "masked_positions": diff --git a/paddlenlp/prompt/template.py b/paddlenlp/prompt/template.py index 2c6c5ef52a5c..57b6e6f6134b 100644 --- a/paddlenlp/prompt/template.py +++ b/paddlenlp/prompt/template.py @@ -253,6 +253,8 @@ def encode(self, example: Dict[str, Any]): inputs.append(dict(zip(input_names, value))) input_dict = self.prompt_tokenizer(inputs) + if "target" in example: + input_dict.update({"target": example["target"]}) unused_example = {k: v for k, v in example.items() if k not in self.example_keys} return {**input_dict, **unused_example} diff --git a/paddlenlp/transformers/gpt/tokenizer.py b/paddlenlp/transformers/gpt/tokenizer.py index 63ac00d1281f..99f9ade54ef3 100644 --- a/paddlenlp/transformers/gpt/tokenizer.py +++ b/paddlenlp/transformers/gpt/tokenizer.py @@ -13,16 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json import os +import shutil from functools import lru_cache -import json import jieba -import shutil import sentencepiece as spm from paddle.utils import try_import -from .. import PretrainedTokenizer, AddedToken +from .. 
import AddedToken, PretrainedTokenizer __all__ = [ "GPTTokenizer", @@ -200,6 +200,7 @@ def convert_tokens_to_ids(self, tokens): return [self._convert_token_to_id(token) for token in tokens] ''' + ''' def convert_ids_to_tokens(self, ids): """ Converts a single index or a sequence of indices to a token or a @@ -227,6 +228,7 @@ def convert_ids_to_tokens(self, ids): return self._convert_id_to_token(ids) tokens = [self._convert_id_to_token(_id) for _id in ids] return tokens + ''' @property def vocab_size(self): From bafeec1ad2bcb3ee1737391cdff6819fe4c57842 Mon Sep 17 00:00:00 2001 From: tsinghua-zhang Date: Fri, 7 Apr 2023 16:09:51 +0800 Subject: [PATCH 2/5] modefied --- .../prefix-tuning/prompt/lcsts_new.json | 5 -- examples/few_shot/prefix-tuning/run_train.py | 46 +++++++---------- examples/few_shot/prefix-tuning/utils.py | 18 ++----- paddlenlp/prompt/prompt_model.py | 34 ++----------- paddlenlp/prompt/prompt_utils.py | 20 ++++---- paddlenlp/prompt/template.py | 4 +- tests/prompt/test_prompt_model.py | 51 +++++++++++++++++++ 7 files changed, 87 insertions(+), 91 deletions(-) delete mode 100644 examples/few_shot/prefix-tuning/prompt/lcsts_new.json diff --git a/examples/few_shot/prefix-tuning/prompt/lcsts_new.json b/examples/few_shot/prefix-tuning/prompt/lcsts_new.json deleted file mode 100644 index af3fdb637ca7..000000000000 --- a/examples/few_shot/prefix-tuning/prompt/lcsts_new.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "template": [ - {"text": "{'prefix':'文本摘要'}{'text':'source'}{'sep'}{'text':'target', 'token_type': 1}"} - ] -} diff --git a/examples/few_shot/prefix-tuning/run_train.py b/examples/few_shot/prefix-tuning/run_train.py index 88ad394adff9..898803f211f6 100644 --- a/examples/few_shot/prefix-tuning/run_train.py +++ b/examples/few_shot/prefix-tuning/run_train.py @@ -19,7 +19,7 @@ import paddle from paddle.static import InputSpec -from utils import compute_metrics, load_prompt_arguments, new_PromptTrainer +from utils import PromptTrainerForGeneration, compute_metrics from paddlenlp.datasets import load_dataset from paddlenlp.prompt import ( @@ -34,28 +34,10 @@ @dataclass class DataArguments: - prompt_path: str = field(default="prompt/eprstmt.json", metadata={"help": "Path to the defined prompts."}) - prompt_index: int = field(default=0, metadata={"help": "The index of defined prompt for training."}) - augment_type: str = field( - default=None, - metadata={ - "help": "The strategy used for data augmentation, including `swap`, `delete`, `insert`, `subsitute`." - }, - ) - num_augment: str = field( - default=5, metadata={"help": "Number of augmented data per example, which works when `augment_type` is set."} - ) - word_augment_percent: str = field( - default=0.1, - metadata={ - "help": "Percentage of augmented words in sequences, used for `swap`, `delete`, `insert`, `subsitute`." - }, - ) - augment_method: str = field(default="mlm", metadata={"help": "Strategy used for `insert` and `subsitute`."}) - do_label: bool = field( - default=False, metadata={"help": "Whether to label unsupervised data in unlabeled datasets"} + prompt: str = field( + default="{'prefix':'文本摘要'}{'text':'text'}{'sep'}{'text':'labels', 'token_type': 1}", + metadata={"help": "Add prompt.'文本摘要'、'text' variable and 'labels' immutable."}, ) - do_test: bool = field(default=False, metadata={"help": "Whether to evaluate model on public test datasets."}) @dataclass @@ -90,7 +72,6 @@ def main(): # Parse the arguments. 
parser = PdArgumentParser((ModelArguments, DataArguments, PromptTuningArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() - data_args = load_prompt_arguments(data_args) training_args.generation_max_length = model_args.max_target_length training_args.predict_with_generate = model_args.predict_with_generate @@ -111,27 +92,34 @@ def main(): attention_probs_dropout_prob=model_args.dropout, ) - # Define template for preprocess and verbalizer for postprocess. + # Define template for preprocess. template = PrefixTemplate(data_args.prompt, tokenizer, training_args.max_seq_length, model) logger.info("Using template: {}".format(template.prompt)) # Load datasets. train_ds, dev_ds = load_dataset("lcsts_new") - dev_ds_label = dev_ds.map(lambda x: {x["target"]}, dev_ds) - train_ds, dev_ds = load_dataset("lcsts_new") + + def convert_label_keyword(input_dict): + if "text" not in input_dict: + input_dict["text"] = input_dict.pop("source") + if "labels" not in input_dict: + input_dict["labels"] = input_dict.pop("target") + return input_dict + + train_ds.map(convert_label_keyword) + dev_ds.map(convert_label_keyword) # Initialize the prompt model with the above variables. prompt_model = PromptModelForGeneration( model, template, - verbalizer=None, freeze_plm=training_args.freeze_plm, freeze_dropout=training_args.freeze_dropout, max_predict_len=training_args.generation_max_length, ) - dev_compute_metrics = partial(compute_metrics, tokenizer=tokenizer, labels=dev_ds_label) - trainer = new_PromptTrainer( + dev_compute_metrics = partial(compute_metrics, tokenizer=tokenizer) + trainer = PromptTrainerForGeneration( model=prompt_model, tokenizer=tokenizer, args=training_args, diff --git a/examples/few_shot/prefix-tuning/utils.py b/examples/few_shot/prefix-tuning/utils.py index ae69d5adb20b..d2420fa14049 100644 --- a/examples/few_shot/prefix-tuning/utils.py +++ b/examples/few_shot/prefix-tuning/utils.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import json import numpy as np import paddle @@ -21,18 +20,8 @@ from paddlenlp.prompt import PromptTrainer -def load_prompt_arguments(args): - """ - Load prompt and label words according to prompt index. - """ - with open(args.prompt_path, "r", encoding="utf-8") as fp: - configs = json.load(fp) - args.prompt = configs["template"][args.prompt_index]["text"] - return args - - # Define the metric function. 
-def compute_metrics(eval_preds, tokenizer, labels): +def compute_metrics(eval_preds, tokenizer): all_preds = [] all_labels = [] @@ -68,7 +57,7 @@ def compute_metrics(eval_preds, tokenizer, labels): return {"rougel": rougel} -class new_PromptTrainer(PromptTrainer): +class PromptTrainerForGeneration(PromptTrainer): def __init__( self, model, @@ -82,7 +71,7 @@ def __init__( callbacks=None, optimizers=(None, None), ): - super(new_PromptTrainer, self).__init__( + super(PromptTrainerForGeneration, self).__init__( model=model, criterion=criterion, args=args, @@ -94,6 +83,7 @@ def __init__( callbacks=callbacks, optimizers=optimizers, ) + self.verbalizer = None def compute_loss(self, model, inputs, return_outputs=False): """ diff --git a/paddlenlp/prompt/prompt_model.py b/paddlenlp/prompt/prompt_model.py index a52e6eb0b485..461e013d11e7 100644 --- a/paddlenlp/prompt/prompt_model.py +++ b/paddlenlp/prompt/prompt_model.py @@ -19,6 +19,7 @@ from ..transformers.generation_utils import GenerationMixin from ..transformers.model_outputs import ( + CausalLMOutputWithCrossAttentions, MaskedLMOutput, MultipleChoiceModelOutput, SequenceClassifierOutput, @@ -171,7 +172,6 @@ def __init__( self, model: paddle.nn.Layer, template: Template, - verbalizer: Optional[Verbalizer] = None, freeze_plm: bool = False, freeze_dropout: bool = False, max_predict_len: int = 32, @@ -179,7 +179,6 @@ def __init__( super(PromptModelForGeneration, self).__init__() self.plm = model self.template = template - self.verbalizer = verbalizer self.freeze_plm = freeze_plm self.freeze_dropout = freeze_dropout if self.freeze_plm: @@ -200,15 +199,12 @@ def forward( input_ids: paddle.Tensor, token_type_ids: Optional[paddle.Tensor] = None, position_ids: Optional[paddle.Tensor] = None, - attention_mask: Optional[paddle.Tensor] = None, - masked_positions: Optional[paddle.Tensor] = None, soft_token_ids: Optional[paddle.Tensor] = None, encoder_ids: Optional[paddle.Tensor] = None, labels: Optional[paddle.Tensor] = None, return_dict: Optional[bool] = None, **kwargs: Dict[str, Any] ): - return_dict = return_dict if return_dict is not None else False if soft_token_ids is None: outputs = self.plm(input_ids) @@ -219,9 +215,7 @@ def forward( "input_ids": input_ids, "token_type_ids": token_type_ids, "position_ids": position_ids, - "masked_positions": masked_positions, "soft_token_ids": soft_token_ids, - "attention_mask": attention_mask, "encoder_ids": encoder_ids, "labels": labels, **kwargs, @@ -229,8 +223,6 @@ def forward( input_dict = self.template.process_batch(input_dict) input_dict = {**input_dict, **kwargs} model_inputs = {k: input_dict[k] for k in input_dict if k in self.forward_keys} - if "masked_positions" in model_inputs: - model_inputs.pop("masked_positions") if "cache" in self.forward_keys: model_inputs["cache"] = [] for i in range(len(model_inputs["past_key_values"])): @@ -251,8 +243,6 @@ def forward( shift_labels = labels[..., 1:] loss_fct = paddle.nn.CrossEntropyLoss(ignore_index=0, reduction="sum") loss = loss_fct(shift_logits.reshape((-1, shift_logits.shape[-1])), shift_labels.reshape((-1,))) - num = int(paddle.count_nonzero(shift_labels.reshape((-1,)))) - loss = loss / num if not return_dict: output = (logits,) @@ -264,27 +254,9 @@ def forward( output = output[0] return output - return MaskedLMOutput( + return CausalLMOutputWithCrossAttentions( loss=loss, logits=logits, + past_key_values=model_outputs.past_key_values, hidden_states=model_outputs.logits, ) - - def get_input_spec(self): - template_keywords = 
self.template.extract_template_keywords(self.template.prompt) - input_spec = [ - InputSpec(shape=[None, None], dtype="int64", name="input_ids"), - InputSpec(shape=[None, None], dtype="int64", name="token_type_ids"), - InputSpec(shape=[None, None], dtype="int64", name="position_ids"), - InputSpec(shape=[None, None, None, None], dtype="float32", name="attention_mask"), - ] - if "mask" in template_keywords: - input_spec.append(InputSpec(shape=[None], dtype="int64", name="masked_positions")) - if "soft" in template_keywords: - # Add placeholder for argument `masked_positions` if not exists. - if "mask" not in template_keywords: - input_spec.append(None) - input_spec.append(InputSpec(shape=[None, None], dtype="int64", name="soft_token_ids")) - if "encoder" in template_keywords: - input_spec.append(InputSpec(shape=[None, None], dtype="int64", name="encoder_ids")) - return input_spec diff --git a/paddlenlp/prompt/prompt_utils.py b/paddlenlp/prompt/prompt_utils.py index 0f6e51c3d5e2..1b4091feaef2 100644 --- a/paddlenlp/prompt/prompt_utils.py +++ b/paddlenlp/prompt/prompt_utils.py @@ -86,16 +86,6 @@ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]: for key in features[0]: if key not in self.default_model_input_names: values = [b[key] for b in features if key in b] - if key == "target": - batch_size, _ = batch["soft_token_ids"].shape - soft_token_ids = paddle.masked_select(batch["soft_token_ids"], batch["soft_token_ids"] > 0) - soft_token_ids = soft_token_ids.reshape([batch_size, -1]) - _, soft_len = soft_token_ids.shape - input_ids = paddle.concat( - [batch["input_ids"][:, 0].unsqueeze(1), batch["input_ids"][:, soft_len + 1 :]], axis=1 - ) - batch["labels"] = input_ids - continue if len(values) < len(features): continue if key == "masked_positions": @@ -122,6 +112,16 @@ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]: max_num_label = max([len(x) for x in values]) for index, value in enumerate(values): values[index] = value + [-100] * (max_num_label - len(value)) + if isinstance(values[0], str): + batch_size, _ = batch["soft_token_ids"].shape + soft_token_ids = paddle.masked_select(batch["soft_token_ids"], batch["soft_token_ids"] > 0) + soft_token_ids = soft_token_ids.reshape([batch_size, -1]) + _, soft_len = soft_token_ids.shape + input_ids = paddle.concat( + [batch["input_ids"][:, 0].unsqueeze(1), batch["input_ids"][:, soft_len + 1 :]], axis=1 + ) + batch["labels"] = input_ids + continue elif key != "cls_positions": continue batch[key] = self._convert_to_tensors(values) diff --git a/paddlenlp/prompt/template.py b/paddlenlp/prompt/template.py index 57b6e6f6134b..4ab6df1db55f 100644 --- a/paddlenlp/prompt/template.py +++ b/paddlenlp/prompt/template.py @@ -253,8 +253,8 @@ def encode(self, example: Dict[str, Any]): inputs.append(dict(zip(input_names, value))) input_dict = self.prompt_tokenizer(inputs) - if "target" in example: - input_dict.update({"target": example["target"]}) + if "labels" in example: + input_dict.update({"labels": example["labels"]}) unused_example = {k: v for k, v in example.items() if k not in self.example_keys} return {**input_dict, **unused_example} diff --git a/tests/prompt/test_prompt_model.py b/tests/prompt/test_prompt_model.py index a7d853956dda..f8df4d7c10e8 100644 --- a/tests/prompt/test_prompt_model.py +++ b/tests/prompt/test_prompt_model.py @@ -17,6 +17,7 @@ from paddlenlp.prompt import ( AutoTemplate, PromptDataCollatorWithPadding, + PromptModelForGeneration, PromptModelForSequenceClassification, SoftVerbalizer, ) @@ -24,6 +25,7 
@@ AutoModelForMaskedLM, AutoModelForSequenceClassification, AutoTokenizer, + GPTLMHeadModel, ) @@ -116,5 +118,54 @@ def test_efl_with_labels(self): self.assertEqual(model_outputs.hidden_states.shape[0], len(examples)) +class PromptModelTestForGeneration(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.tokenizer = AutoTokenizer.from_pretrained("__internal_testing__/tiny-random-gpt") + cls.tokenizer.pad_token = "" + cls.tokenizer.sep_token = "" + cls.tokenizer.add_tokens("[Space]", special_tokens=True) + cls.model = GPTLMHeadModel.from_pretrained("__internal_testing__/tiny-random-gpt") + cls.num_labels = 2 + + cls.template = AutoTemplate.create_from( + prompt="{'prefix':'文本摘要'}{'text':'text'}{'sep'}{'text':'labels', 'token_type': 1}", + tokenizer=cls.tokenizer, + max_length=512, + model=cls.model, + ) + + cls.data_collator = PromptDataCollatorWithPadding(cls.tokenizer, padding=True, return_tensors="pd") + cls.prompt_model = PromptModelForGeneration(cls.model, cls.template) + + def test_sequence_classification_with_labels(self): + examples = [ + { + "text": "日前,方舟子发文直指林志颖旗下爱碧丽推销假保健品,引起哗然。调查发现,爱碧丽没有自己的生产加工厂。其胶原蛋白饮品无核心研发,全部代工生产。号称有“逆生长”功效的爱碧丽“梦幻奇迹限量组”售价高达1080元,实际成本仅为每瓶4元!", + "labels": "林志颖公司疑涉虚假营销无厂房无研发", + "id": 0, + }, + { + "text": "韩方应对路径可以概括为:企业道歉担责;政府公正不护短;民间祈福关怀。他们深知形象的重要,竭力呵护企业品牌和国家形象。正如有评论,韩国“政府+企业+民众”三位一体式呵护韩国国家形象的“苦心经营”,的确有值得我们借鉴之处。", + "labels": "从韩亚航空事故看其应对路径", + "id": 1, + }, + ] + encoded_examples = [self.template(i) for i in examples] + loss, logits, hidden_states = self.prompt_model( + **self.data_collator(encoded_examples), return_hidden_states=True + ) + self.assertIsNotNone(loss) + self.assertEqual(logits.shape[0], len(examples)) + self.assertEqual(hidden_states.shape[0], len(examples)) + + model_outputs = self.prompt_model( + **self.data_collator(encoded_examples), return_dict=True, return_hidden_states=True + ) + self.assertIsNotNone(model_outputs.loss) + self.assertEqual(model_outputs.logits.shape[0], len(examples)) + self.assertEqual(model_outputs.hidden_states.shape[0], len(examples)) + + if __name__ == "__main__": unittest.main() From 7fe8b4084365de739aceae04e35a752d96b7514b Mon Sep 17 00:00:00 2001 From: tsinghua-zhang Date: Fri, 14 Apr 2023 16:22:42 +0800 Subject: [PATCH 3/5] modefied_generate --- examples/few_shot/prefix-tuning/run_train.py | 33 ++---- examples/few_shot/prefix-tuning/utils.py | 28 ++--- paddlenlp/prompt/prompt_model.py | 102 ++++++++++++++++--- paddlenlp/prompt/prompt_tokenizer.py | 2 + paddlenlp/prompt/prompt_utils.py | 10 -- paddlenlp/prompt/template.py | 2 - tests/prompt/test_prompt_model.py | 30 +++--- 7 files changed, 118 insertions(+), 89 deletions(-) diff --git a/examples/few_shot/prefix-tuning/run_train.py b/examples/few_shot/prefix-tuning/run_train.py index 898803f211f6..2fefccbac99f 100644 --- a/examples/few_shot/prefix-tuning/run_train.py +++ b/examples/few_shot/prefix-tuning/run_train.py @@ -12,13 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os from dataclasses import dataclass, field from functools import partial from typing import Optional import paddle -from paddle.static import InputSpec from utils import PromptTrainerForGeneration, compute_metrics from paddlenlp.datasets import load_dataset @@ -35,9 +33,10 @@ @dataclass class DataArguments: prompt: str = field( - default="{'prefix':'文本摘要'}{'text':'text'}{'sep'}{'text':'labels', 'token_type': 1}", - metadata={"help": "Add prompt.'文本摘要'、'text' variable and 'labels' immutable."}, + default="{'prefix':'None'}{'text':'text'}{'sep'}{'text':'labels', 'token_type': 1}", + metadata={"help": "Add prompt.'prefix'、'text' variable and 'text':'labels' immutable."}, ) + task_name: str = field(default="dureader_qg", metadata={"help": "The name of task."}) @dataclass @@ -53,11 +52,11 @@ class ModelArguments: metadata={"help": ("Whether to generate in predcit.")}, ) num_beams: Optional[int] = field( - default=1, + default=2, metadata={"help": ("The number of beams to use in beam search.")}, ) max_target_length: Optional[int] = field( - default=64, + default=16, metadata={ "help": ( "The maximum total sequence length for target text after " @@ -97,11 +96,11 @@ def main(): logger.info("Using template: {}".format(template.prompt)) # Load datasets. - train_ds, dev_ds = load_dataset("lcsts_new") + train_ds, dev_ds = load_dataset(data_args.task_name, splits=["train", "dev"]) def convert_label_keyword(input_dict): if "text" not in input_dict: - input_dict["text"] = input_dict.pop("source") + input_dict["text"] = input_dict.pop("title") + tokenizer.sep_token + input_dict.pop("source") if "labels" not in input_dict: input_dict["labels"] = input_dict.pop("target") return input_dict @@ -115,7 +114,6 @@ def convert_label_keyword(input_dict): template, freeze_plm=training_args.freeze_plm, freeze_dropout=training_args.freeze_dropout, - max_predict_len=training_args.generation_max_length, ) dev_compute_metrics = partial(compute_metrics, tokenizer=tokenizer) @@ -142,23 +140,6 @@ def convert_label_keyword(input_dict): eval_metrics = trainer.evaluate() trainer.log_metrics("eval", eval_metrics) - # Export static model. 
- if training_args.do_export: - template = prompt_model.template - template_keywords = template.extract_template_keywords(template.prompt) - input_spec = [ - InputSpec(shape=[None, None], dtype="int64"), # input_ids, - InputSpec(shape=[None, None], dtype="int64"), # token_type_ids - InputSpec(shape=[None, None], dtype="int64"), # position_ids - InputSpec(shape=[None, None, None, None], dtype="float32"), # attention_mask - InputSpec(shape=[None], dtype="int64"), # masked_positions - InputSpec(shape=[None, None], dtype="int64"), # soft_token_ids - ] - if "encoder" in template_keywords: - input_spec.append(InputSpec(shape=[None, None], dtype="int64")) # encoder_ids - export_path = os.path.join(training_args.output_dir, "export") - trainer.export_model(export_path, input_spec=input_spec, export_type=model_args.export_type) - if __name__ == "__main__": main() diff --git a/examples/few_shot/prefix-tuning/utils.py b/examples/few_shot/prefix-tuning/utils.py index d2420fa14049..6c258193f278 100644 --- a/examples/few_shot/prefix-tuning/utils.py +++ b/examples/few_shot/prefix-tuning/utils.py @@ -27,9 +27,9 @@ def compute_metrics(eval_preds, tokenizer): all_labels = [] labels = eval_preds.label_ids preds = eval_preds.predictions - all_preds.extend(tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=False)) + all_preds.extend(tokenizer.convert_ids_to_string(pred.tolist()) for pred in preds) labels = np.where(labels != -100, labels, tokenizer.pad_token_id) - all_labels.extend(tokenizer.batch_decode(labels, skip_special_tokens=True, clean_up_tokenization_spaces=False)) + all_labels.extend(tokenizer.convert_ids_to_string(label.tolist()) for label in labels) assert len(all_preds) == len(all_labels), ( "The length of pred_responses should be equal to the length of " @@ -182,14 +182,10 @@ def prediction_step( ) has_labels = "labels" in inputs - inputs = self._prepare_inputs(inputs) + labels = inputs["labels"] + # inputs = self._prepare_inputs(inputs) gen_kwargs = self._gen_kwargs.copy() - if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: - gen_kwargs["max_length"] = self.model.config.max_length - gen_kwargs["num_beams"] = ( - gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.model.config.num_beams - ) if "attention_mask" in inputs: gen_kwargs["attention_mask"] = inputs.get("attention_mask", None) @@ -201,6 +197,7 @@ def prediction_step( **gen_kwargs, use_cache=True, use_fp16_decoding=True, + repetition_penalty=2.0, ) # different from hf returns: tuple[Tensor]: It is a tuple contains two elements: ids and scores. 
@@ -225,17 +222,6 @@ def prediction_step( if self.args.prediction_loss_only: return (loss, None, None) - if has_labels: - labels = inputs["labels"] - if gen_kwargs.get("max_length") is not None and labels.shape[-1] < gen_kwargs["max_length"]: - labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"]) - elif gen_kwargs.get("max_new_tokens") is not None and labels.shape[-1] < ( - gen_kwargs["max_new_tokens"] + 1 - ): - labels = self._pad_tensors_to_max_len(labels, (gen_kwargs["max_new_tokens"] + 1)) - else: - labels = None - return (loss, generated_tokens, labels) def _pad_tensors_to_max_len(self, tensor, max_length): @@ -245,8 +231,8 @@ def _pad_tensors_to_max_len(self, tensor, max_length): self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id ) else: - if self.model.config.pad_token_id is not None: - pad_token_id = self.model.config.pad_token_id + if self.tokenizer.pad_token_id is not None: + pad_token_id = self.tokenizer.pad_token_id else: raise ValueError("Pad_token_id must be set in the configuration of the model, in order to pad tensors") # paddle.ones need to support device args. diff --git a/paddlenlp/prompt/prompt_model.py b/paddlenlp/prompt/prompt_model.py index 461e013d11e7..3181216dcb32 100644 --- a/paddlenlp/prompt/prompt_model.py +++ b/paddlenlp/prompt/prompt_model.py @@ -12,15 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Dict, Optional +import inspect +from typing import Any, Dict, List, Optional import paddle +import paddle.nn.functional as F from paddle.static import InputSpec from ..transformers.generation_utils import GenerationMixin from ..transformers.model_outputs import ( CausalLMOutputWithCrossAttentions, MaskedLMOutput, + ModelOutput, MultipleChoiceModelOutput, SequenceClassifierOutput, ) @@ -174,7 +177,6 @@ def __init__( template: Template, freeze_plm: bool = False, freeze_dropout: bool = False, - max_predict_len: int = 32, ): super(PromptModelForGeneration, self).__init__() self.plm = model @@ -189,10 +191,10 @@ def __init__( self.forward_keys = signature(self.plm.forward) self._mask_token_id = self.template.tokenizer.mask_token_id self._pad_token_id = self.template.tokenizer.pad_token_id - if isinstance(self.template, PrefixTemplate): - self.plm = self.template.process_model(self.plm) - self.forward_keys.append("past_key_values") - self.max_predict_len = paddle.to_tensor(max_predict_len, dtype="int32") + if not isinstance(self.template, PrefixTemplate): + raise TypeError(f"{self.__class__.__name__} is not compatible with {self.template.__class__.__name__} ") + self.plm = self.template.process_model(self.plm) + self.forward_keys.append("past_key_values") def forward( self, @@ -234,15 +236,18 @@ def forward( ) ) model_inputs.pop("past_key_values") + model_inputs.pop("labels") model_outputs = self.plm(**model_inputs, return_dict=True, use_cache=True) - logits = model_outputs.logits - shift_logits = logits[..., :-1, :] - mask = input_dict["token_type_ids"] == 1 - labels = labels * mask - shift_labels = labels[..., 1:] - loss_fct = paddle.nn.CrossEntropyLoss(ignore_index=0, reduction="sum") - loss = loss_fct(shift_logits.reshape((-1, shift_logits.shape[-1])), shift_labels.reshape((-1,))) + + loss = None + if labels is not None: + shift_labels = labels[..., 1:] + shift_logits = logits[..., : shift_labels.shape[1], :] + loss_fct = paddle.nn.CrossEntropyLoss(ignore_index=-100, reduction="mean") + loss = 
loss_fct(shift_logits.reshape((-1, shift_logits.shape[-1])), shift_labels.reshape((-1,))).reshape( + [-1] + ) if not return_dict: output = (logits,) @@ -260,3 +265,74 @@ def forward( past_key_values=model_outputs.past_key_values, hidden_states=model_outputs.logits, ) + + def greedy_search(self, input_ids, logits_processors, max_length, pad_token_id, eos_token_id, **model_kwargs): + logits_processors = logits_processors if logits_processors is not None else LogitsProcessorList() + + batch_size, cur_len = input_ids.shape + origin_len = cur_len + unfinished_flag = paddle.full([batch_size, 1], True, dtype="bool") + scores = paddle.full([batch_size, 1], 0.0, dtype=paddle.get_default_dtype()) + while cur_len < max_length: + # prepare model inputs & get model output + if "use_cache" in model_kwargs: + del model_kwargs["use_cache"] + if "attention_mask" in model_kwargs: + del model_kwargs["attention_mask"] + if "labels" in model_kwargs: + del model_kwargs["labels"] + outputs = self(input_ids, **model_kwargs) + outputs = outputs[1] if isinstance(outputs, tuple) else outputs + + # To hundle the logits is a ModelOutput + logits = outputs.logits if isinstance(outputs, ModelOutput) else outputs + + # [batch_size, vocab_size] + next_token_logits = logits[:, -1, :] + + # pre-process distribution + next_token_logits = self.adjust_logits_during_generation(next_token_logits) + next_tokens_scores = logits_processors(input_ids, next_token_logits) + # greedy + probs = F.softmax(next_tokens_scores) + probs = paddle.log(probs) + next_tokens = paddle.argmax(probs, axis=-1).unsqueeze(-1) + next_scores = paddle.index_sample(probs.astype("float32"), next_tokens) + + if eos_token_id is not None: + next_tokens = paddle.where(unfinished_flag, next_tokens, paddle.full_like(next_tokens, pad_token_id)) + + scores = self.update_scores_for_generation(scores, next_scores, cur_len - origin_len, unfinished_flag) + + cur_len += 1 + input_ids = paddle.concat([input_ids, next_tokens], axis=1) + + if eos_token_id is not None: + unfinished_flag = paddle.logical_and(unfinished_flag, next_tokens != eos_token_id) + + # Stop when there is a in all sentences + if not paddle.any(unfinished_flag): + break + + model_kwargs = self.update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.is_encoder_decoder + ) + model_kwargs["soft_token_ids"] = paddle.concat( + [model_kwargs["soft_token_ids"], paddle.to_tensor([[0]])], axis=1 + ) + + return input_ids[:, origin_len:], scores + + +class LogitsProcessorList(List): + def __call__(self, input_ids, logits, **kwargs): + for processor in self: + processor_args = inspect.signature(processor.__call__).parameters + if len(processor_args) > 2: + assert all( + arg in kwargs for arg in list(processor_args.keys())[2:] + ), f"The parameters don't match for {processor.__class__}" + logits = processor(input_ids, logits, **kwargs) + else: + logits = processor(input_ids, logits) + return logits diff --git a/paddlenlp/prompt/prompt_tokenizer.py b/paddlenlp/prompt/prompt_tokenizer.py index 1027c30b99d3..ee7d45fef5c0 100644 --- a/paddlenlp/prompt/prompt_tokenizer.py +++ b/paddlenlp/prompt/prompt_tokenizer.py @@ -61,6 +61,8 @@ def __call__(self, inputs: List[Dict[str, Any]]): else: input_ids = orig_input_ids[index][: max_lengths[index]] encoded_inputs["soft_token_ids"].append([0] * len(input_ids)) + if part["token_types"] == 1: + encoded_inputs["labels"].append(input_ids) else: input_ids = soft_token_ids encoded_inputs["soft_token_ids"].append(soft_token_ids) diff --git 
a/paddlenlp/prompt/prompt_utils.py b/paddlenlp/prompt/prompt_utils.py index 1b4091feaef2..d230fbf1ab41 100644 --- a/paddlenlp/prompt/prompt_utils.py +++ b/paddlenlp/prompt/prompt_utils.py @@ -112,16 +112,6 @@ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]: max_num_label = max([len(x) for x in values]) for index, value in enumerate(values): values[index] = value + [-100] * (max_num_label - len(value)) - if isinstance(values[0], str): - batch_size, _ = batch["soft_token_ids"].shape - soft_token_ids = paddle.masked_select(batch["soft_token_ids"], batch["soft_token_ids"] > 0) - soft_token_ids = soft_token_ids.reshape([batch_size, -1]) - _, soft_len = soft_token_ids.shape - input_ids = paddle.concat( - [batch["input_ids"][:, 0].unsqueeze(1), batch["input_ids"][:, soft_len + 1 :]], axis=1 - ) - batch["labels"] = input_ids - continue elif key != "cls_positions": continue batch[key] = self._convert_to_tensors(values) diff --git a/paddlenlp/prompt/template.py b/paddlenlp/prompt/template.py index 4ab6df1db55f..2c6c5ef52a5c 100644 --- a/paddlenlp/prompt/template.py +++ b/paddlenlp/prompt/template.py @@ -253,8 +253,6 @@ def encode(self, example: Dict[str, Any]): inputs.append(dict(zip(input_names, value))) input_dict = self.prompt_tokenizer(inputs) - if "labels" in example: - input_dict.update({"labels": example["labels"]}) unused_example = {k: v for k, v in example.items() if k not in self.example_keys} return {**input_dict, **unused_example} diff --git a/tests/prompt/test_prompt_model.py b/tests/prompt/test_prompt_model.py index f8df4d7c10e8..2d8cd6b4d863 100644 --- a/tests/prompt/test_prompt_model.py +++ b/tests/prompt/test_prompt_model.py @@ -119,26 +119,22 @@ def test_efl_with_labels(self): class PromptModelTestForGeneration(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.tokenizer = AutoTokenizer.from_pretrained("__internal_testing__/tiny-random-gpt") - cls.tokenizer.pad_token = "" - cls.tokenizer.sep_token = "" - cls.tokenizer.add_tokens("[Space]", special_tokens=True) - cls.model = GPTLMHeadModel.from_pretrained("__internal_testing__/tiny-random-gpt") - cls.num_labels = 2 - - cls.template = AutoTemplate.create_from( - prompt="{'prefix':'文本摘要'}{'text':'text'}{'sep'}{'text':'labels', 'token_type': 1}", - tokenizer=cls.tokenizer, + def test_generation_with_labels(self): + self.tokenizer = AutoTokenizer.from_pretrained("__internal_testing__/tiny-random-gpt") + self.tokenizer.pad_token = "" + self.tokenizer.sep_token = "" + self.tokenizer.add_tokens("[Space]", special_tokens=True) + self.model = GPTLMHeadModel.from_pretrained("__internal_testing__/tiny-random-gpt") + + self.template = AutoTemplate.create_from( + prompt="{'prefix':'文本摘要', 'encoder': 'mlp'}{'text':'text'}{'sep'}{'text':'labels', 'token_type': 1}", + tokenizer=self.tokenizer, max_length=512, - model=cls.model, + model=self.model, ) - cls.data_collator = PromptDataCollatorWithPadding(cls.tokenizer, padding=True, return_tensors="pd") - cls.prompt_model = PromptModelForGeneration(cls.model, cls.template) - - def test_sequence_classification_with_labels(self): + self.data_collator = PromptDataCollatorWithPadding(self.tokenizer, padding=True, return_tensors="pd") + self.prompt_model = PromptModelForGeneration(self.model, self.template) examples = [ { "text": "日前,方舟子发文直指林志颖旗下爱碧丽推销假保健品,引起哗然。调查发现,爱碧丽没有自己的生产加工厂。其胶原蛋白饮品无核心研发,全部代工生产。号称有“逆生长”功效的爱碧丽“梦幻奇迹限量组”售价高达1080元,实际成本仅为每瓶4元!", From aaae88f2cb8f4f8009da35310bf98d00779092b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Ctsinghua-zhang=E2=80=9D?= Date: 
diff --git a/tests/prompt/test_prompt_model.py b/tests/prompt/test_prompt_model.py
index f8df4d7c10e8..2d8cd6b4d863 100644
--- a/tests/prompt/test_prompt_model.py
+++ b/tests/prompt/test_prompt_model.py
@@ -119,26 +119,22 @@ def test_efl_with_labels(self):
 
 
 class PromptModelTestForGeneration(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.tokenizer = AutoTokenizer.from_pretrained("__internal_testing__/tiny-random-gpt")
-        cls.tokenizer.pad_token = ""
-        cls.tokenizer.sep_token = ""
-        cls.tokenizer.add_tokens("[Space]", special_tokens=True)
-        cls.model = GPTLMHeadModel.from_pretrained("__internal_testing__/tiny-random-gpt")
-        cls.num_labels = 2
-
-        cls.template = AutoTemplate.create_from(
-            prompt="{'prefix':'文本摘要'}{'text':'text'}{'sep'}{'text':'labels', 'token_type': 1}",
-            tokenizer=cls.tokenizer,
+    def test_generation_with_labels(self):
+        self.tokenizer = AutoTokenizer.from_pretrained("__internal_testing__/tiny-random-gpt")
+        self.tokenizer.pad_token = ""
+        self.tokenizer.sep_token = ""
+        self.tokenizer.add_tokens("[Space]", special_tokens=True)
+        self.model = GPTLMHeadModel.from_pretrained("__internal_testing__/tiny-random-gpt")
+
+        self.template = AutoTemplate.create_from(
+            prompt="{'prefix':'文本摘要', 'encoder': 'mlp'}{'text':'text'}{'sep'}{'text':'labels', 'token_type': 1}",
+            tokenizer=self.tokenizer,
             max_length=512,
-            model=cls.model,
+            model=self.model,
         )
-        cls.data_collator = PromptDataCollatorWithPadding(cls.tokenizer, padding=True, return_tensors="pd")
-        cls.prompt_model = PromptModelForGeneration(cls.model, cls.template)
-
-    def test_sequence_classification_with_labels(self):
+        self.data_collator = PromptDataCollatorWithPadding(self.tokenizer, padding=True, return_tensors="pd")
+        self.prompt_model = PromptModelForGeneration(self.model, self.template)
         examples = [
             {
                 "text": "日前,方舟子发文直指林志颖旗下爱碧丽推销假保健品,引起哗然。调查发现,爱碧丽没有自己的生产加工厂。其胶原蛋白饮品无核心研发,全部代工生产。号称有“逆生长”功效的爱碧丽“梦幻奇迹限量组”售价高达1080元,实际成本仅为每瓶4元!",

From aaae88f2cb8f4f8009da35310bf98d00779092b1 Mon Sep 17 00:00:00 2001
From: “tsinghua-zhang”
Date: Wed, 19 Apr 2023 10:28:13 +0800
Subject: [PATCH 4/5] modified
---
 examples/few_shot/prefix-tuning/run_train.py |  6 +-
 examples/few_shot/prefix-tuning/utils.py     | 86 +----------------
 paddlenlp/prompt/prompt_model.py             | 98 ++++++--------------
 3 files changed, 36 insertions(+), 154 deletions(-)

diff --git a/examples/few_shot/prefix-tuning/run_train.py b/examples/few_shot/prefix-tuning/run_train.py
index 2fefccbac99f..b22ba81af976 100644
--- a/examples/few_shot/prefix-tuning/run_train.py
+++ b/examples/few_shot/prefix-tuning/run_train.py
@@ -33,7 +33,7 @@
 @dataclass
 class DataArguments:
     prompt: str = field(
-        default="{'prefix':'None'}{'text':'text'}{'sep'}{'text':'labels', 'token_type': 1}",
+        default="{'prefix':'根据回答和原文得到问题', 'length':50}{'text':'text'}{'sep'}{'text':'labels', 'token_type': 1, 'truncate': False}",
        metadata={"help": "Add prompt.'prefix'、'text' variable and 'text':'labels' immutable."},
     )
     task_name: str = field(default="dureader_qg", metadata={"help": "The name of task."})
@@ -100,9 +100,9 @@ def main():
 
     def convert_label_keyword(input_dict):
         if "text" not in input_dict:
-            input_dict["text"] = input_dict.pop("title") + tokenizer.sep_token + input_dict.pop("source")
+            input_dict["text"] = ("答案:" + input_dict.pop("title") + "," + "上下文:" + input_dict.pop("source"))[:400]
         if "labels" not in input_dict:
-            input_dict["labels"] = input_dict.pop("target")
+            input_dict["labels"] = "在已知答案的前提下,问题:" + input_dict.pop("target")[:20]
         return input_dict
 
     train_ds.map(convert_label_keyword)
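To make the effect of the reworked `convert_label_keyword` concrete, here is a hedged, self-contained illustration. The dureader_qg-style record below is invented for the example; the function body mirrors the patch:

record = {"source": "北京是中华人民共和国的首都。", "title": "北京", "target": "中国的首都是哪里?"}

def convert_label_keyword(input_dict):
    # Build the model input as "答案:<answer>,上下文:<context>", capped at 400 characters.
    if "text" not in input_dict:
        input_dict["text"] = ("答案:" + input_dict.pop("title") + "," + "上下文:" + input_dict.pop("source"))[:400]
    # Build the target as "在已知答案的前提下,问题:<question>", with the question capped at 20 characters.
    if "labels" not in input_dict:
        input_dict["labels"] = "在已知答案的前提下,问题:" + input_dict.pop("target")[:20]
    return input_dict

print(convert_label_keyword(record))
# {'text': '答案:北京,上下文:北京是中华人民共和国的首都。',
#  'labels': '在已知答案的前提下,问题:中国的首都是哪里?'}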
diff --git a/examples/few_shot/prefix-tuning/utils.py b/examples/few_shot/prefix-tuning/utils.py
index 6c258193f278..545ec4144bb7 100644
--- a/examples/few_shot/prefix-tuning/utils.py
+++ b/examples/few_shot/prefix-tuning/utils.py
@@ -85,70 +85,6 @@ def __init__(
         )
         self.verbalizer = None
 
-    def compute_loss(self, model, inputs, return_outputs=False):
-        """
-        How the loss is computed by Trainer. By default, all models return the loss in the first element.
-
-        Subclass and override for custom behavior.
-        """
-        outputs = model(**inputs)
-
-        # Save past state if it exists
-        if self.args.past_index >= 0:
-            self._past = outputs[self.args.past_index]
-
-        # print(outputs[0])
-        # We don't use .loss here since the model may return tuples instead of ModelOutput.
-        # print(outputs[0], outputs.loss)
-        # URGENT
-        # print('compute_loss', outputs[0])
-        loss = outputs[0]
-
-        return (loss, outputs) if return_outputs else loss
-
-    def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval", **gen_kwargs):
-        """
-        Run evaluation and returns metrics.
-
-        The calling script will be responsible for providing a method to compute metrics, as they are task-dependent
-        (pass it to the init `compute_metrics` argument).
-
-        You can also subclass and override this method to inject custom behavior.
-
-        Args:
-            eval_dataset (`Dataset`, *optional*):
-                Pass a dataset if you wish to override `self.eval_dataset`. If it is an [`~datasets.Dataset`], columns
-                not accepted by the `model.forward()` method are automatically removed. It must implement the `__len__`
-                method.
-            ignore_keys (`List[str]`, *optional*):
-                A list of keys in the output of your model (if it is a dictionary) that should be ignored when
-                gathering predictions.
-            metric_key_prefix (`str`, *optional*, defaults to `"eval"`):
-                An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named
-                "eval_bleu" if the prefix is `"eval"` (default)
-            max_length (`int`, *optional*):
-                The maximum target length to use when predicting with the generate method.
-            num_beams (`int`, *optional*):
-                Number of beams for beam search that will be used when predicting with the generate method. 1 means no
-                beam search.
-            gen_kwargs:
-                Additional `generate` specific kwargs.
-
-        Returns:
-            A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The
-            dictionary also contains the epoch number which comes from the training state.
-        """
-
-        gen_kwargs = gen_kwargs.copy()
-        if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None:
-            gen_kwargs["max_length"] = self.args.generation_max_length
-        gen_kwargs["num_beams"] = (
-            gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.args.generation_num_beams
-        )
-        self._gen_kwargs = gen_kwargs
-
-        return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix)
-
     def prediction_step(
         self,
         model,
@@ -185,31 +121,17 @@ def prediction_step(
         labels = inputs["labels"]
 
         # inputs = self._prepare_inputs(inputs)
-        gen_kwargs = self._gen_kwargs.copy()
-
-        if "attention_mask" in inputs:
-            gen_kwargs["attention_mask"] = inputs.get("attention_mask", None)
-        if "global_attention_mask" in inputs:
-            gen_kwargs["global_attention_mask"] = inputs.get("global_attention_mask", None)
-
+        max_length = 32
         generated_tokens = self.model.generate(
-            **inputs,
-            **gen_kwargs,
-            use_cache=True,
-            use_fp16_decoding=True,
-            repetition_penalty=2.0,
+            model_kwargs=inputs,
         )
         # different from hf returns: tuple[Tensor]: It is a tuple contains two elements: ids and scores.
         if isinstance(generated_tokens, tuple):
             generated_tokens = generated_tokens[0]
         # in case the batch is shorter than max length, the output should be padded
-        if gen_kwargs.get("max_length") is not None and generated_tokens.shape[-1] < gen_kwargs["max_length"]:
-            generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"])
-        elif gen_kwargs.get("max_new_tokens") is not None and generated_tokens.shape[-1] < (
-            gen_kwargs["max_new_tokens"] + 1
-        ):
-            generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_new_tokens"] + 1)
+        if max_length is not None and generated_tokens.shape[-1] < max_length:
+            generated_tokens = self._pad_tensors_to_max_len(generated_tokens, max_length)
 
         with paddle.no_grad():
             if has_labels:
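The trimmed `prediction_step` above always decodes with a fixed `max_length = 32` and right-pads shorter outputs before metrics are computed. Below is a hedged sketch of that padding step; the helper name `pad_to_max_len` and `pad_token_id=0` are assumptions for illustration and are not the Trainer's actual `_pad_tensors_to_max_len`:

import paddle

def pad_to_max_len(tensor, max_length, pad_token_id=0):
    # Right-pad [batch, seq] token ids up to max_length so batches can be stacked for metrics.
    pad_len = max_length - tensor.shape[-1]
    if pad_len <= 0:
        return tensor
    padding = paddle.full([tensor.shape[0], pad_len], pad_token_id, dtype=tensor.dtype)
    return paddle.concat([tensor, padding], axis=-1)

generated = paddle.to_tensor([[11, 12, 13]])
print(pad_to_max_len(generated, max_length=5).shape)  # [1, 5]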
""" @@ -195,6 +191,7 @@ def __init__( raise TypeError(f"{self.__class__.__name__} is not compatible with {self.template.__class__.__name__} ") self.plm = self.template.process_model(self.plm) self.forward_keys.append("past_key_values") + self.base_model_prepare_inputs_for_generation = self.plm.prepare_inputs_for_generation def forward( self, @@ -266,73 +263,36 @@ def forward( hidden_states=model_outputs.logits, ) - def greedy_search(self, input_ids, logits_processors, max_length, pad_token_id, eos_token_id, **model_kwargs): - logits_processors = logits_processors if logits_processors is not None else LogitsProcessorList() + def generate(self, model_kwargs): + self.plm.prepare_inputs_for_generation = self.prepare_inputs_for_generation + generated_tokens = self.plm.generate(**model_kwargs) + return generated_tokens - batch_size, cur_len = input_ids.shape - origin_len = cur_len - unfinished_flag = paddle.full([batch_size, 1], True, dtype="bool") - scores = paddle.full([batch_size, 1], 0.0, dtype=paddle.get_default_dtype()) - while cur_len < max_length: - # prepare model inputs & get model output - if "use_cache" in model_kwargs: - del model_kwargs["use_cache"] - if "attention_mask" in model_kwargs: - del model_kwargs["attention_mask"] - if "labels" in model_kwargs: - del model_kwargs["labels"] - outputs = self(input_ids, **model_kwargs) - outputs = outputs[1] if isinstance(outputs, tuple) else outputs + def prepare_inputs_for_generation(self, input_ids, use_cache=False, cache=None, **kwargs): - # To hundle the logits is a ModelOutput - logits = outputs.logits if isinstance(outputs, ModelOutput) else outputs - - # [batch_size, vocab_size] - next_token_logits = logits[:, -1, :] - - # pre-process distribution - next_token_logits = self.adjust_logits_during_generation(next_token_logits) - next_tokens_scores = logits_processors(input_ids, next_token_logits) - # greedy - probs = F.softmax(next_tokens_scores) - probs = paddle.log(probs) - next_tokens = paddle.argmax(probs, axis=-1).unsqueeze(-1) - next_scores = paddle.index_sample(probs.astype("float32"), next_tokens) - - if eos_token_id is not None: - next_tokens = paddle.where(unfinished_flag, next_tokens, paddle.full_like(next_tokens, pad_token_id)) - - scores = self.update_scores_for_generation(scores, next_scores, cur_len - origin_len, unfinished_flag) - - cur_len += 1 - input_ids = paddle.concat([input_ids, next_tokens], axis=1) - - if eos_token_id is not None: - unfinished_flag = paddle.logical_and(unfinished_flag, next_tokens != eos_token_id) - - # Stop when there is a in all sentences - if not paddle.any(unfinished_flag): - break - - model_kwargs = self.update_model_kwargs_for_generation( - outputs, model_kwargs, is_encoder_decoder=self.is_encoder_decoder - ) + model_kwargs = self.base_model_prepare_inputs_for_generation(input_ids, cache=None, **kwargs) + model_kwargs["soft_token_ids"] = kwargs.get("soft_token_ids", None) + model_kwargs["token_type_ids"] = kwargs.get("token_type_ids", None) + model_kwargs["encoder_ids"] = kwargs.get("encoder_ids", None) + len_dif = len(model_kwargs["token_type_ids"][0]) - len(model_kwargs["soft_token_ids"][0]) + for _ in range(len_dif): model_kwargs["soft_token_ids"] = paddle.concat( [model_kwargs["soft_token_ids"], paddle.to_tensor([[0]])], axis=1 ) + input_dict = self.template.process_batch(model_kwargs) + model_inputs = {k: input_dict[k] for k in input_dict if k in self.forward_keys} + if "cache" in self.forward_keys: + model_inputs["cache"] = [] + for i in range(len(model_inputs["past_key_values"])): + 
+                from paddlenlp.transformers.gpt.modeling import MultiHeadAttention
-        return input_ids[:, origin_len:], scores
-
+                model_inputs["cache"].append(
+                    MultiHeadAttention.Cache(
+                        k=model_inputs["past_key_values"][i][0], v=model_inputs["past_key_values"][i][1]
+                    )
+                )
+            model_inputs.pop("past_key_values")
+        model_inputs["use_cache"] = True
+        model_inputs["return_dict"] = True
 
-
-class LogitsProcessorList(List):
-    def __call__(self, input_ids, logits, **kwargs):
-        for processor in self:
-            processor_args = inspect.signature(processor.__call__).parameters
-            if len(processor_args) > 2:
-                assert all(
-                    arg in kwargs for arg in list(processor_args.keys())[2:]
-                ), f"The parameters don't match for {processor.__class__}"
-                logits = processor(input_ids, logits, **kwargs)
-            else:
-                logits = processor(input_ids, logits)
-        return logits
+        return model_inputs
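The change above replaces the hand-rolled decoding loop with a delegation trick: `PromptModelForGeneration.generate` rebinds the underlying GPT model's `prepare_inputs_for_generation` to its own prompt-aware version, then hands decoding off to `plm.generate`, which re-injects the soft prefix inputs (and the converted key/value cache) at every step. The toy classes below sketch only that rebinding pattern; `ToyLM` and `ToyPromptWrapper` are illustrative stand-ins, not PaddleNLP APIs:

class ToyLM:
    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        return {"input_ids": input_ids}

    def generate(self, input_ids, **kwargs):
        # The base model always calls *its current* prepare_inputs_for_generation attribute.
        inputs = self.prepare_inputs_for_generation(input_ids, **kwargs)
        return inputs["input_ids"] + ["<gen>"]  # pretend one decoding step happened


class ToyPromptWrapper:
    def __init__(self, plm):
        self.plm = plm
        self.base_prepare = plm.prepare_inputs_for_generation  # keep the original, as the patch does

    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        inputs = self.base_prepare(input_ids, **kwargs)
        inputs["soft_token_ids"] = kwargs.get("soft_token_ids")  # re-attach prompt-specific inputs
        return inputs

    def generate(self, model_kwargs, **kwargs):
        # Rebind, then delegate - the same pattern as PromptModelForGeneration.generate above.
        self.plm.prepare_inputs_for_generation = self.prepare_inputs_for_generation
        return self.plm.generate(**model_kwargs, **kwargs)


wrapper = ToyPromptWrapper(ToyLM())
print(wrapper.generate({"input_ids": ["a", "b"]}, soft_token_ids=[0, 0]))  # ['a', 'b', '<gen>']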
From a8a7011db893a5beed6982dd5faa8d60ded0e1fc Mon Sep 17 00:00:00 2001
From: “tsinghua-zhang”
Date: Wed, 19 Apr 2023 19:00:48 +0800
Subject: [PATCH 5/5] modified_for_test
---
 paddlenlp/prompt/prompt_model.py     |  5 ++---
 paddlenlp/prompt/prompt_tokenizer.py | 11 +++++++++--
 paddlenlp/prompt/template.py         |  3 ++-
 3 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/paddlenlp/prompt/prompt_model.py b/paddlenlp/prompt/prompt_model.py
index 1a8ef68fbf64..2707578a0309 100644
--- a/paddlenlp/prompt/prompt_model.py
+++ b/paddlenlp/prompt/prompt_model.py
@@ -263,13 +263,12 @@ def forward(
             hidden_states=model_outputs.logits,
         )
 
-    def generate(self, model_kwargs):
+    def generate(self, model_kwargs, **kwargs):
         self.plm.prepare_inputs_for_generation = self.prepare_inputs_for_generation
-        generated_tokens = self.plm.generate(**model_kwargs)
+        generated_tokens = self.plm.generate(**model_kwargs, **kwargs)
         return generated_tokens
 
     def prepare_inputs_for_generation(self, input_ids, use_cache=False, cache=None, **kwargs):
-
         model_kwargs = self.base_model_prepare_inputs_for_generation(input_ids, cache=None, **kwargs)
         model_kwargs["soft_token_ids"] = kwargs.get("soft_token_ids", None)
         model_kwargs["token_type_ids"] = kwargs.get("token_type_ids", None)
diff --git a/paddlenlp/prompt/prompt_tokenizer.py b/paddlenlp/prompt/prompt_tokenizer.py
index ee7d45fef5c0..2986b80409c5 100644
--- a/paddlenlp/prompt/prompt_tokenizer.py
+++ b/paddlenlp/prompt/prompt_tokenizer.py
@@ -43,6 +43,15 @@ def __call__(self, inputs: List[Dict[str, Any]]):
             # Create input_ids.
             soft_token_ids = part.get("soft_tokens", None)
             if soft_token_ids is None or len(soft_token_ids) == 1 and soft_token_ids[0] == 0:
+                if "generator_labels" in part:
+                    # import pdb; pdb.set_trace()
+                    encoded_inputs["labels"].append(
+                        self.tokenizer.encode(
+                            part["generator_labels"], add_special_tokens=False, return_token_type_ids=False
+                        )["input_ids"]
+                    )
+                    inputs.remove(part)
+                    continue
                 orig_input_ids.append(
                     self.tokenizer.encode(part["text"], add_special_tokens=False, return_token_type_ids=False)[
                         "input_ids"
@@ -61,8 +70,6 @@ def __call__(self, inputs: List[Dict[str, Any]]):
             else:
                 input_ids = orig_input_ids[index][: max_lengths[index]]
                 encoded_inputs["soft_token_ids"].append([0] * len(input_ids))
-                if part["token_types"] == 1:
-                    encoded_inputs["labels"].append(input_ids)
             else:
                 input_ids = soft_token_ids
                 encoded_inputs["soft_token_ids"].append(soft_token_ids)
diff --git a/paddlenlp/prompt/template.py b/paddlenlp/prompt/template.py
index 2c6c5ef52a5c..266ea7132cf5 100644
--- a/paddlenlp/prompt/template.py
+++ b/paddlenlp/prompt/template.py
@@ -251,7 +251,8 @@ def encode(self, example: Dict[str, Any]):
         inputs = []
         for value in list(zip(*input_values)):
             inputs.append(dict(zip(input_names, value)))
-
+        if "labels" in example and isinstance(example["labels"], str):
+            inputs.append({"generator_labels": example["labels"], "do_truncate": False})
         input_dict = self.prompt_tokenizer(inputs)
 
         unused_example = {k: v for k, v in example.items() if k not in self.example_keys}
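Taken together, the PATCH 5 changes route a raw string `labels` field through `encode()` as a `generator_labels` part that the prompt tokenizer consumes separately. The snippet below is a self-contained sketch of that routing with a fake tokenizer and invented strings; it is not the PaddleNLP implementation:

class ToyTokenizer:
    def encode(self, text, **kwargs):
        return {"input_ids": [ord(ch) % 100 for ch in text]}  # fake ids, for illustration only

tokenizer = ToyTokenizer()
example = {"text": "日前,方舟子发文……", "labels": "爱碧丽被指虚假宣传"}

# Template.encode(): text parts first, then the generator_labels part appended by PATCH 5.
parts = [{"text": example["text"]}]
if isinstance(example["labels"], str):
    parts.append({"generator_labels": example["labels"], "do_truncate": False})

# Prompt tokenizer: label parts are tokenized into "labels" and removed from the normal inputs.
encoded_inputs = {"labels": [], "input_ids": []}
for part in list(parts):
    if "generator_labels" in part:
        encoded_inputs["labels"].append(tokenizer.encode(part["generator_labels"])["input_ids"])
        parts.remove(part)
    else:
        encoded_inputs["input_ids"].append(tokenizer.encode(part["text"])["input_ids"])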