
Commit bb76b09

[autonlp] add text classification taskflow (#4815)

* from train to infer
* add taskflow arguments
* update
* fix
* fix
* fix
* fix
1 parent 2180cae commit bb76b09

4 files changed: +156 -100 lines changed


paddlenlp/experimental/autonlp/auto_trainer_base.py

Lines changed: 5 additions & 7 deletions
@@ -14,7 +14,6 @@
 import copy
 import datetime
 import os
-import shutil
 from abc import ABCMeta, abstractmethod
 from typing import Any, Callable, Dict, List, Optional, Union

@@ -49,8 +48,9 @@ class AutoTrainerBase(metaclass=ABCMeta):
         use verbosity > 0 to set stop the workers from logging to the driver.
     """

-    training_path = "training"
-    export_path = "exported_model"
+    training_path = "training_checkpoints"  # filepath for Trainer's training checkpoints
+    save_path = "trained_model"  # filepath for the trained dygraph model
+    export_path = "exported_model"  # filepath for the exported static model
     results_filename = "experiment_results.csv"

     def __init__(
@@ -143,10 +143,8 @@ def export(self, export_path, trial_id=None):
             export_path (str, required): the filepath to export to
             trial_id (int, required): use the `trial_id` to select the model to export. Defaults to the best model selected by `metric_for_best_model`
         """
-        model_result = self._get_model_result(trial_id=trial_id)
-        exported_model_path = os.path.join(model_result.log_dir, self.export_path)
-        shutil.copytree(exported_model_path, export_path)
-        logger.info(f"Exported to {export_path}")
+
+        raise NotImplementedError

     @abstractmethod
     def to_taskflow(self, trial_id=None):
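The renamed constants make the on-disk layout of a trial explicit: transient training checkpoints, the trained dygraph model, and the exported static model now live under separate directories. A minimal sketch of how those paths compose under a trial's log directory, assuming the `log_dir` attribute that `model_result` exposes elsewhere in this diff (the helper function itself is hypothetical, not part of the commit):

    import os

    # Constants mirroring AutoTrainerBase above; trial_artifact_paths is hypothetical.
    TRAINING_PATH = "training_checkpoints"  # transient Trainer checkpoints, deleted after training
    SAVE_PATH = "trained_model"             # trained dygraph model weights
    EXPORT_PATH = "exported_model"          # exported static model for inference

    def trial_artifact_paths(log_dir: str) -> dict:
        """Resolve the per-trial artifact directories implied by the constants above."""
        return {
            "checkpoints": os.path.join(log_dir, TRAINING_PATH),
            "dygraph_model": os.path.join(log_dir, SAVE_PATH),
            "static_model": os.path.join(log_dir, EXPORT_PATH),
        }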

paddlenlp/experimental/autonlp/text_classification.py

Lines changed: 101 additions & 39 deletions
@@ -13,6 +13,7 @@
 # limitations under the License.
 import copy
 import functools
+import json
 import os
 import shutil
 from typing import Any, Callable, Dict, List

@@ -40,7 +41,9 @@
     AutoModelForSequenceClassification,
     AutoTokenizer,
     PretrainedTokenizer,
+    export_model,
 )
+from paddlenlp.utils.log import logger

 from .auto_trainer_base import AutoTrainerBase

@@ -61,9 +64,8 @@ class AutoTrainerForTextClassification(AutoTrainerBase):
         language (string, required): language of the text.
         output_dir (str, optional): Output directory for the experiments, defaults to "autpnlp_results".
         id2label(dict(int,string)): The dictionary to map the predictions from class ids to class names.
-        verbosity: (int, optional): controls the verbosity of the run. Defaults to 1, which let the workers log to the driver.To reduce the amount of logs,
-            use verbosity > 0 to set stop the workers from logging to the driver.
-
+        multilabel_threshold (float): The probability threshold used for the multi_label setup. Only effective if model = "multi_label". Defaults to 0.5.
+        verbosity: (int, optional): controls the verbosity of the run. Defaults to 1, which let the workers log to the driver.To reduce the amount of logs, use verbosity > 0 to set stop the workers from logging to the driver.
     """

     def __init__(
@@ -88,6 +90,7 @@ def __init__(
         self.text_column = text_column
         self.label_column = label_column
         self.id2label = self.kwargs.get("id2label", None)
+        self.multilabel_threshold = self.kwargs.get("multilabel_threshold", 0.5)
         if problem_type in ["multi_label", "multi_class"]:
             self.problem_type = problem_type
         else:
@@ -244,8 +247,12 @@ def _data_checks_and_inference(self):
                 raise ValueError(
                     f"Label {label} is not found in the user-provided id2label argument: {self.id2label}"
                 )
+        id2label_path = os.path.join(self.output_dir, "id2label.json")
+        with open(id2label_path, "w", encoding="utf-8") as f:
+            json.dump(self.id2label, f, ensure_ascii=False)
+        logger.info(f"Exported id2label to {id2label_path}")

-    def _construct_trainer(self, config) -> Trainer:
+    def _construct_trainer(self, config, eval_dataset=None) -> Trainer:
         if "EarlyStoppingCallback.early_stopping_patience" in config:
             callbacks = [
                 EarlyStoppingCallback(early_stopping_patience=config["EarlyStoppingCallback.early_stopping_patience"])
@@ -262,7 +269,10 @@ def _construct_trainer(self, config) -> Trainer:
                 max_length=model.config.max_position_embeddings,  # truncate to the max length allowed by the model
             )
             processed_train_dataset = copy.deepcopy(self.train_dataset).map(trans_func, lazy=False)
-            processed_eval_dataset = copy.deepcopy(self.eval_dataset).map(trans_func, lazy=False)
+            if eval_dataset is None:
+                processed_eval_dataset = copy.deepcopy(self.eval_dataset).map(trans_func, lazy=False)
+            else:
+                processed_eval_dataset = copy.deepcopy(eval_dataset).map(trans_func, lazy=False)
             training_args = self._override_hp(config, self._default_training_argument)
             trainer = Trainer(
                 model=model,
@@ -279,7 +289,11 @@ def _construct_trainer(self, config) -> Trainer:
             max_length = config.get("PreprocessArguments.max_length", 128)
             tokenizer = AutoTokenizer.from_pretrained(model_path)
             processed_train_dataset = copy.deepcopy(self.train_dataset).map(self._preprocess_labels, lazy=False)
-            processed_eval_dataset = copy.deepcopy(self.eval_dataset).map(self._preprocess_labels, lazy=False)
+            if eval_dataset is None:
+                processed_eval_dataset = copy.deepcopy(self.eval_dataset).map(self._preprocess_labels, lazy=False)
+            else:
+                processed_eval_dataset = copy.deepcopy(eval_dataset).map(self._preprocess_labels, lazy=False)
+
             model = AutoModelForMaskedLM.from_pretrained(model_path)
             template = AutoTemplate.create_from(
                 prompt=config["template.prompt"],
@@ -323,12 +337,8 @@ def trainable(config):
             trainer = self._construct_trainer(config)
             trainer.train()
             eval_metrics = trainer.evaluate()
-            if config["trainer_type"] == "PromptTrainer":
-                # It's difficult to load back the prompt model as a dynamic model due to lack of AutoModel support now
-                # We directly export a static model instead of a dynamic model
-                trainer.export_model(self.export_path)
-            else:
-                trainer.save_model(self.export_path)
+            trainer.save_model(self.save_path)
+
             if os.path.exists(self.training_path):
                 logger.info("Removing training checkpoints to conserve disk space")
                 shutil.rmtree(self.training_path)
@@ -347,22 +357,13 @@ def evaluate(self, trial_id=None, eval_dataset=None):
         """
         model_result = self._get_model_result(trial_id=trial_id)
         model_config = model_result.metrics["config"]["candidates"]
-        if model_config["trainer_type"] == "PromptTrainer":
-            raise NotImplementedError(
-                "'PromptTrainer' models do not support 'evaluate' yet because dygraph save model has not been implemented."
-            )
-        model_config["TrainingArguments.model_name_or_path"] = os.path.join(model_result.log_dir, self.export_path)
-        trainer = self._construct_trainer(model_config)
-        if eval_dataset is not None:
-            trans_func = functools.partial(
-                self._preprocess_fn,
-                tokenizer=trainer.tokenizer,
-                max_length=trainer.model.config.max_position_embeddings,  # truncate to the max length allowed by the model
-            )
-            processed_eval_dataset = copy.deepcopy(eval_dataset).map(trans_func, lazy=False)
-            eval_metrics = trainer.evaluate(processed_eval_dataset)
-        else:
-            eval_metrics = trainer.evaluate()
+
+        trainer = self._construct_trainer(model_config, eval_dataset)
+        trainer.load_state_dict_from_checkpoint(
+            resume_from_checkpoint=os.path.join(model_result.log_dir, self.save_path)
+        )
+
+        eval_metrics = trainer.evaluate()
         return eval_metrics

     def _compute_metrics(self, eval_preds: EvalPrediction) -> Dict[str, float]:
@@ -386,7 +387,7 @@ def _compute_multi_class_metrics(self, eval_preds: EvalPrediction) -> Dict[str,

     def _compute_multi_label_metrics(self, eval_preds: EvalPrediction) -> Dict[str, float]:
         pred_probs = sigmoid(eval_preds.predictions)
-        pred_ids = pred_probs > 0.5
+        pred_ids = pred_probs > self.multilabel_threshold
         metrics = {}
         # In multilabel classification, this function computes subset accuracy:
         # the set of labels predicted for a sample must exactly match the corresponding set of labels in y_true.
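With `multilabel_threshold` now read from the constructor kwargs, the 0.5 cutoff above is just the default. A small NumPy sketch (illustrative logits, not from this commit) of how the threshold turns per-label probabilities into predictions:

    import numpy as np

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    logits = np.array([[2.0, -1.0, 0.3]])  # raw multi-label outputs for one sample
    pred_probs = sigmoid(logits)           # approx. [[0.88, 0.27, 0.57]]

    # The default threshold of 0.5 fires labels 0 and 2; raising it to 0.6
    # keeps only label 0, trading recall for precision.
    print(pred_probs > 0.5)  # [[ True False  True]]
    print(pred_probs > 0.6)  # [[ True False False]]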
@@ -425,23 +426,84 @@ def _preprocess_fn(
             result["labels"] = example_with_labels["labels"]
         return result

-    def to_taskflow(self, trial_id=None):
+    def to_taskflow(self, trial_id=None, export_path=None, batch_size=1, max_length=512, precision="fp32"):
         """
         Convert the model from a certain `trial_id` to a Taskflow for model inference

         Args:
-            trial_id (int, required): use the `trial_id` to select the model to export. Defaults to the best model selected by `metric_for_best_model`
+            trial_id (int): use the `trial_id` to select the model to export. Defaults to the best model selected by `metric_for_best_model`
+            export_path (str): the filepath to export to
+            max_length (int): Maximum number of tokens for the model. Defaults to 512.
+            batch_size(int): The sample number of a mini-batch. Defaults to 1.
+            precision (str): Select among ["fp32", "fp16"]. Default to "fp32".
         """
         model_result = self._get_model_result(trial_id=trial_id)
         model_config = model_result.metrics["config"]["candidates"]
+        trial_id = model_result.metrics["trial_id"]
+
+        if export_path is None:
+            export_path = os.path.join(self.export_path, trial_id)
+
+        self.export(export_path=export_path, trial_id=trial_id)
+
         if model_config["trainer_type"] == "PromptTrainer":
-            raise NotImplementedError("'Taskflow' inference does not support models trained with PromptTrainer yet.")
+            mode = "prompt"
         else:
-            exported_model_path = os.path.join(model_result.log_dir, self.export_path)
-            return Taskflow(
-                "text_classification",
-                mode="finetune",
-                problem_type=self.problem_type,
-                task_path=exported_model_path,
-                id2label=self.id2label,
+            mode = "finetune"
+
+        return Taskflow(
+            "text_classification",
+            mode=mode,
+            is_static_model=True,
+            problem_type=self.problem_type,
+            task_path=export_path,
+            multilabel_threshold=self.multilabel_threshold,
+            batch_size=batch_size,
+            max_length=max_length,
+            precision=precision,
+        )
+
+    def export(self, export_path, trial_id=None):
+        """
+        Export the model from a certain `trial_id` to the given file path.
+
+        Args:
+            export_path (str, required): the filepath to export to
+            trial_id (int, required): use the `trial_id` to select the model to export. Defaults to the best model selected by `metric_for_best_model`
+        """
+
+        model_result = self._get_model_result(trial_id=trial_id)
+        model_config = model_result.metrics["config"]["candidates"]
+        trial_id = model_result.metrics["trial_id"]
+
+        if os.path.exists(export_path):
+            logger.info(
+                f"Export path for {trial_id} already exists: ({export_path}). The model parameter files will be overwritten."
             )
+
+        # construct trainer
+        trainer = self._construct_trainer(model_config)
+        trainer.load_state_dict_from_checkpoint(
+            resume_from_checkpoint=os.path.join(model_result.log_dir, self.save_path)
+        )
+
+        # save static model
+        if model_config["trainer_type"] == "PromptTrainer":
+            trainer.export_model(export_path)
+            trainer.model.plm.save_pretrained(os.path.join(export_path, "plm"))
+        else:
+            if trainer.model.init_config["init_class"] in ["ErnieMForSequenceClassification"]:
+                input_spec = [paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids")]
+            else:
+                input_spec = [
+                    paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"),
+                    paddle.static.InputSpec(shape=[None, None], dtype="int64", name="token_type_ids"),
+                ]
+            export_model(model=trainer.model, input_spec=input_spec, path=export_path)
+        # save tokenizer
+        trainer.tokenizer.save_pretrained(export_path)
+
+        # save id2label
+        shutil.copyfile(os.path.join(self.output_dir, "id2label.json"), os.path.join(export_path, "id2label.json"))
+
+        logger.info(f"Exported {trial_id} to {export_path}")

paddlenlp/taskflow/text_classification.py

Lines changed: 5 additions & 1 deletion
@@ -266,7 +266,11 @@ def _preprocess(self, inputs: Union[str, List[str]]) -> Dict[str, Any]:
             collator = PromptDataCollatorWithPadding(
                 self._tokenizer, padding=True, return_tensors="np", return_attention_mask=True
             )
-            template_inputs = [self._template({"text_a": x}) for x in inputs]
+            part_text = "text"
+            for part in self._template.prompt:
+                if "text" in part:
+                    part_text = part["text"]
+            template_inputs = [self._template({part_text: x}) for x in inputs]
             batches = [template_inputs[idx : idx + batch_size] for idx in range(0, len(template_inputs), batch_size)]
         else:
             raise NotImplementedError(
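The loop above generalizes the previously hard-coded `"text_a"` key: a template's prompt is a list of part dicts, and the part that consumes the raw input names its field under `"text"`. A standalone sketch of that lookup with illustrative prompt parts (the example data is hypothetical):

    # Illustrative prompt parts; the part carrying the raw input names its
    # field under "text", e.g. {"text": "text_a"}.
    prompt_parts = [
        {"text": "text_a"},
        {"hard": "This comment is"},
        {"mask": None},
    ]

    def find_text_field(prompt_parts, default="text"):
        """Mirror the lookup in _preprocess: the last part with a "text" key wins."""
        part_text = default
        for part in prompt_parts:
            if "text" in part:
                part_text = part["text"]
        return part_text

    assert find_text_field(prompt_parts) == "text_a"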
