Commit 6f5c287

[speech-cmd] use uie in transformers (#4278)
1 parent 91ab453 commit 6f5c287

3 files changed: 16 additions, 86 deletions

applications/speech_cmd_analysis/finetune.py

Lines changed: 6 additions & 18 deletions
@@ -13,18 +13,16 @@
 # limitations under the License.
 
 import argparse
-import time
 import os
+import time
 from functools import partial
 
 import paddle
-from paddle.utils.download import get_path_from_url
+from utils import convert_example, create_dataloader, evaluate, reader, set_seed
+
 from paddlenlp.datasets import load_dataset
-from paddlenlp.transformers import AutoTokenizer
 from paddlenlp.metrics import SpanEvaluator
-
-from model import UIE
-from utils import set_seed, convert_example, reader, MODEL_MAP, evaluate, create_dataloader
+from paddlenlp.transformers import UIE, AutoTokenizer
 
 
 def do_train():
@@ -35,15 +33,7 @@ def do_train():
 
     set_seed(args.seed)
 
-    encoding_model = MODEL_MAP[args.model]["encoding_model"]
-    resource_file_urls = MODEL_MAP[args.model]["resource_file_urls"]
-
-    for key, val in resource_file_urls.items():
-        file_path = os.path.join(args.model, key)
-        if not os.path.exists(file_path):
-            get_path_from_url(val, args.model)
-
-    tokenizer = AutoTokenizer.from_pretrained(encoding_model)
+    tokenizer = AutoTokenizer.from_pretrained(args.model)
     model = UIE.from_pretrained(args.model)
 
     if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
@@ -71,7 +61,6 @@ def do_train():
 
     loss_list = []
     global_step = 0
-    best_step = 0
     best_f1 = 0
     tic_train = time.time()
     for epoch in range(1, args.num_epochs + 1):
@@ -123,8 +112,7 @@ def do_train():
     parser.add_argument("--train_path", default=None, type=str, help="The path of train set.")
     parser.add_argument("--dev_path", default=None, type=str, help="The path of dev set.")
    parser.add_argument("--save_dir", default='./checkpoint', type=str, help="The output directory where the model checkpoints will be written.")
-    parser.add_argument("--max_seq_len", default=512, type=int, help="The maximum input sequence length. "
-        "Sequences longer than this will be truncated, sequences shorter will be padded.")
+    parser.add_argument("--max_seq_len", default=512, type=int, help="The maximum input sequence length. ")
     parser.add_argument("--num_epochs", default=100, type=int, help="Total number of training epochs to perform.")
     parser.add_argument("--seed", default=1000, type=int, help="Random seed for initialization")
     parser.add_argument("--logging_steps", default=10, type=int, help="The interval steps to logging.")

applications/speech_cmd_analysis/model.py

Lines changed: 0 additions & 39 deletions
This file was deleted.

applications/speech_cmd_analysis/utils.py

Lines changed: 10 additions & 29 deletions
@@ -14,35 +14,16 @@
 # limitations under the License.
 
 import json
-import time
 import math
 import random
-import numpy as np
-from tqdm import tqdm
-
-from urllib.request import urlopen
-from urllib.request import Request
+import time
 from urllib.error import URLError
 from urllib.parse import urlencode
+from urllib.request import Request, urlopen
 
+import numpy as np
 import paddle
-
-MODEL_MAP = {
-    "uie-base": {
-        "encoding_model": "ernie-3.0-base-zh",
-        "resource_file_urls": {
-            "model_state.pdparams": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/model_state.pdparams",
-            "model_config.json": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/model_config.json",
-        },
-    },
-    "uie-tiny": {
-        "encoding_model": "ernie-3.0-medium-zh",
-        "resource_file_urls": {
-            "model_state.pdparams": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_tiny/model_state.pdparams",
-            "model_config.json": "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_tiny/model_config.json",
-        },
-    },
-}
+from tqdm import tqdm
 
 
 def set_seed(seed):
@@ -83,12 +64,12 @@ def mandarin_asr_api(api_key, secret_key, audio_file, audio_format="wav"):
         result_str = urlopen(request).read()
     except URLError as error:
         print("token http response http code : " + str(error.code))
-        result_str = err.read()
+        result_str = error.read()
     result_str = result_str.decode()
 
     result = json.loads(result_str)
     if "access_token" in result.keys() and "scope" in result.keys():
-        if SCOPE and (not SCOPE in result["scope"].split(" ")):
+        if SCOPE and (SCOPE not in result["scope"].split(" ")):
             raise ASRError("scope is not correct!")
         token = result["access_token"]
     else:
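
Beyond the cosmetic print cleanups in the later hunks, this hunk fixes a real bug in mandarin_asr_api: the except block read the response body from an undefined name err instead of the bound exception error, and the scope check now uses the idiomatic not in. A self-contained sketch of the corrected pattern, with the token URL and parameters as placeholders rather than the values the application actually uses:

    from urllib.error import URLError
    from urllib.parse import urlencode
    from urllib.request import Request, urlopen

    def fetch_token(token_url, params):
        # Placeholder request construction; the original's request setup is outside this hunk.
        request = Request(token_url, urlencode(params).encode("utf-8"))
        try:
            result_str = urlopen(request).read()
        except URLError as error:
            # Mirror the fixed handler: use the bound name `error`, not the undefined `err`.
            # As in the original, this assumes the failure is an HTTPError that carries
            # a status code and a readable body.
            print("token http response http code : " + str(error.code))
            result_str = error.read()
        return result_str.decode()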
@@ -319,7 +300,7 @@ def convert_ext_examples(raw_examples, negative_ratio):
     entity_name_set = []
     predicate_set = []
 
-    print(f"Converting doccano data...")
+    print("Converting doccano data...")
     with tqdm(total=len(raw_examples)) as pbar:
         for line in raw_examples:
             items = json.loads(line)
@@ -402,13 +383,13 @@ def convert_ext_examples(raw_examples, negative_ratio):
             relation_prompts.append(relation_prompt)
             pbar.update(1)
 
-    print(f"Adding negative samples for first stage prompt...")
+    print("Adding negative samples for first stage prompt...")
     entity_examples = add_negative_example(entity_examples, texts, entity_prompts, entity_label_set, negative_ratio)
     if len(predicate_set) != 0:
-        print(f"Constructing relation prompts...")
+        print("Constructing relation prompts...")
         relation_prompt_set = construct_relation_prompt_set(entity_name_set, predicate_set)
 
-        print(f"Adding negative samples for second stage prompt...")
+        print("Adding negative samples for second stage prompt...")
         relation_examples = add_negative_example(
             relation_examples, texts, relation_prompts, relation_prompt_set, negative_ratio
         )
