
Commit cf523c7

fix ie_utils import (#4066)
* fix import
* fix import
* fix import
1 parent b1bfbf2 commit cf523c7

File tree: 4 files changed (+34, -53 lines)


applications/information_extraction/document/finetune.py

Lines changed: 1 addition & 26 deletions
@@ -21,14 +21,14 @@
 from utils import convert_example, reader
 
 from paddlenlp.datasets import load_dataset
-from paddlenlp.metrics import SpanEvaluator
 from paddlenlp.trainer import (
     CompressionArguments,
     PdArgumentParser,
     Trainer,
     get_last_checkpoint,
 )
 from paddlenlp.transformers import UIEX, AutoTokenizer, export_model
+from paddlenlp.utils.ie_utils import compute_metrics, uie_loss_func
 from paddlenlp.utils.log import logger
 
 
@@ -113,31 +113,6 @@ def main():
     train_ds = train_ds.map(trans_fn)
     dev_ds = dev_ds.map(trans_fn)
 
-    criterion = paddle.nn.BCELoss()
-
-    def uie_loss_func(outputs, labels):
-        start_ids, end_ids = labels
-        start_prob, end_prob = outputs
-        start_ids = paddle.cast(start_ids, "float32")
-        end_ids = paddle.cast(end_ids, "float32")
-        loss_start = criterion(start_prob, start_ids)
-        loss_end = criterion(end_prob, end_ids)
-        loss = (loss_start + loss_end) / 2.0
-        return loss
-
-    def compute_metrics(p):
-        metric = SpanEvaluator()
-        start_prob, end_prob = p.predictions
-        start_ids, end_ids = p.label_ids
-        metric.reset()
-
-        num_correct, num_infer, num_label = metric.compute(start_prob, end_prob, start_ids, end_ids)
-        metric.update(num_correct, num_infer, num_label)
-        precision, recall, f1 = metric.accumulate()
-        metric.reset()
-
-        return {"precision": precision, "recall": recall, "f1": f1}
-
     trainer = Trainer(
         model=model,
         criterion=uie_loss_func,

applications/information_extraction/text/finetune.py

Lines changed: 1 addition & 25 deletions
@@ -31,6 +31,7 @@
     get_last_checkpoint,
 )
 from paddlenlp.transformers import UIE, UIEM, AutoTokenizer, export_model
+from paddlenlp.utils.ie_utils import compute_metrics, uie_loss_func
 from paddlenlp.utils.log import logger
 
 
@@ -141,31 +142,6 @@ def main():
 
     data_collator = DataCollatorWithPadding(tokenizer)
 
-    criterion = paddle.nn.BCELoss()
-
-    def uie_loss_func(outputs, labels):
-        start_ids, end_ids = labels
-        start_prob, end_prob = outputs
-        start_ids = paddle.cast(start_ids, "float32")
-        end_ids = paddle.cast(end_ids, "float32")
-        loss_start = criterion(start_prob, start_ids)
-        loss_end = criterion(end_prob, end_ids)
-        loss = (loss_start + loss_end) / 2.0
-        return loss
-
-    def compute_metrics(p):
-        metric = SpanEvaluator()
-        start_prob, end_prob = p.predictions
-        start_ids, end_ids = p.label_ids
-        metric.reset()
-
-        num_correct, num_infer, num_label = metric.compute(start_prob, end_prob, start_ids, end_ids)
-        metric.update(num_correct, num_infer, num_label)
-        precision, recall, f1 = metric.accumulate()
-        metric.reset()
-
-        return {"precision": precision, "recall": recall, "f1": f1}
-
     trainer = Trainer(
         model=model,
         criterion=uie_loss_func,
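
Both finetune scripts now import the shared loss and metric instead of defining local copies inside main(). The fragment below sketches the resulting wiring; it is not a runnable script on its own: names such as model, training_args, train_ds, dev_ds and data_collator stand for the objects the scripts build earlier, and passing compute_metrics to the Trainer is assumed to mirror the pre-existing setup rather than something shown in this diff.

from paddlenlp.trainer import Trainer
from paddlenlp.utils.ie_utils import compute_metrics, uie_loss_func

# The BCE span loss and the SpanEvaluator-based metric now live in
# paddlenlp.utils.ie_utils, so both scripts share one implementation.
trainer = Trainer(
    model=model,                      # UIE / UIEM / UIEX model built earlier
    criterion=uie_loss_func,          # averaged BCE over start/end probabilities
    args=training_args,               # assumed: parsed training arguments
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # precision / recall / F1 from SpanEvaluator
)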

paddlenlp/taskflow/task.py

Lines changed: 4 additions & 2 deletions
@@ -246,7 +246,7 @@ def _get_inference_model(self):
         self._param_updated = True
         if os.path.exists(cache_info_path) and open(cache_info_path).read()[:-8] == md5:
             self._param_updated = False
-        elif self.task == "information_extraction":
+        elif self.task == "information_extraction" and self.model != "uie-data-distill-gp":
             # UIE related models are moved to paddlenlp.transformers after v2.4.5
             # So we convert the parameter key names for compatibility
             # This check will be discard in future
@@ -257,7 +257,9 @@ def _get_inference_model(self):
             prefix_map = {"UIE": "ernie", "UIEM": "ernie_m", "UIEX": "ernie_layout"}
             new_state_dict = {}
             for name, param in model_state.items():
-                if "encoder.encoder" in name:
+                if "ernie" in name:
+                    new_state_dict[name] = param
+                elif "encoder.encoder" in name:
                     trans_name = name.replace("encoder.encoder", prefix_map[self._init_class] + ".encoder")
                     new_state_dict[trans_name] = param
                 elif "encoder" in name:
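
For reference, the conversion performed by the new branches can be seen on a toy state dict. This is a self-contained illustration only: the prefix_map, the "ernie" check and the "encoder.encoder" rewrite come from the diff above, while init_class and the example key names are made up.

prefix_map = {"UIE": "ernie", "UIEM": "ernie_m", "UIEX": "ernie_layout"}
init_class = "UIE"  # hypothetical; Taskflow derives this from the loaded config

# Toy checkpoint keys: one already in the new format, one in the legacy format.
model_state = {
    "ernie.embeddings.word_embeddings.weight": "w0",
    "encoder.encoder.layers.0.linear1.weight": "w1",
}

new_state_dict = {}
for name, param in model_state.items():
    if "ernie" in name:
        # Already-converted keys are copied through unchanged (the new branch).
        new_state_dict[name] = param
    elif "encoder.encoder" in name:
        # Legacy keys are re-prefixed, e.g. encoder.encoder.* -> ernie.encoder.*
        trans_name = name.replace("encoder.encoder", prefix_map[init_class] + ".encoder")
        new_state_dict[trans_name] = param

print(sorted(new_state_dict))
# ['ernie.embeddings.word_embeddings.weight', 'ernie.encoder.layers.0.linear1.weight']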

paddlenlp/utils/ie_utils.py

Lines changed: 28 additions & 0 deletions
@@ -16,8 +16,10 @@
 from io import BytesIO
 
 import numpy as np
+import paddle
 from PIL import Image
 
+from ..metrics import SpanEvaluator
 from .image_utils import NormalizeImage, Permute, ResizeImage
 
 resize_func = ResizeImage(target_size=224, interp=1)
@@ -112,3 +114,29 @@ def compare(a, b, schema_lang="ch"):
             relation_type = prefix
         relation_type_dict.setdefault(relation_type, []).append(relation_data[i][1])
     return relation_type_dict
+
+
+def uie_loss_func(outputs, labels):
+    criterion = paddle.nn.BCELoss()
+    start_ids, end_ids = labels
+    start_prob, end_prob = outputs
+    start_ids = paddle.cast(start_ids, "float32")
+    end_ids = paddle.cast(end_ids, "float32")
+    loss_start = criterion(start_prob, start_ids)
+    loss_end = criterion(end_prob, end_ids)
+    loss = (loss_start + loss_end) / 2.0
+    return loss
+
+
+def compute_metrics(p):
+    metric = SpanEvaluator()
+    start_prob, end_prob = p.predictions
+    start_ids, end_ids = p.label_ids
+    metric.reset()
+
+    num_correct, num_infer, num_label = metric.compute(start_prob, end_prob, start_ids, end_ids)
+    metric.update(num_correct, num_infer, num_label)
+    precision, recall, f1 = metric.accumulate()
+    metric.reset()
+
+    return {"precision": precision, "recall": recall, "f1": f1}
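
A quick way to sanity-check the relocated helpers is to call them with random tensors. This is a minimal sketch assuming a working paddle install, that start/end probabilities and 0/1 labels share the shape [batch_size, seq_len], and that a SimpleNamespace is an acceptable stand-in for the EvalPrediction-style object the Trainer normally passes to compute_metrics:

from types import SimpleNamespace

import paddle

from paddlenlp.utils.ie_utils import compute_metrics, uie_loss_func

batch_size, seq_len = 4, 16

# Dummy start/end probabilities in [0, 1) and binary position labels.
start_prob = paddle.rand([batch_size, seq_len])
end_prob = paddle.rand([batch_size, seq_len])
start_ids = paddle.randint(0, 2, [batch_size, seq_len])
end_ids = paddle.randint(0, 2, [batch_size, seq_len])

# Averaged BCE loss over the start and end distributions.
loss = uie_loss_func((start_prob, end_prob), (start_ids, end_ids))
print(float(loss))

# compute_metrics reads .predictions and .label_ids from its argument.
p = SimpleNamespace(
    predictions=(start_prob.numpy(), end_prob.numpy()),
    label_ids=(start_ids.numpy(), end_ids.numpy()),
)
print(compute_metrics(p))  # e.g. {'precision': ..., 'recall': ..., 'f1': ...}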
