
Commit 1816c85

qhpeklh5959wj-Mcat and 骑马小猫 authored
add ERNIE-CTM configuration class and move ERNIE-CTM modeling and related tasks onto configuration (#3945)

* add ERNIE-CTM configuration class and move ERNIE-CTM modeling and related tasks onto configuration.
* fix some code-style
* fix some arguments
* fix some bugs
* fix term-linking priority bugs.

Co-authored-by: 骑马小猫 <[email protected]>
1 parent cc16ff2 commit 1816c85

File tree

3 files changed: +419 −197 lines

paddlenlp/taskflow/knowledge_mining.py

Lines changed: 18 additions & 9 deletions
@@ -40,6 +40,7 @@
 from ..datasets import MapDataset, load_dataset
 from ..data import Stack, Pad, Tuple
 from ..transformers import ErnieCtmWordtagModel, ErnieCtmNptagModel, ErnieCtmTokenizer
+from ..transformers.ernie_ctm.configuration import ErnieCtmConfig
 from .utils import download_file, add_docstrings, static_mode_guard, dygraph_mode_guard
 from .utils import TermTree, BurkhardKellerTree
 from .utils import Customization, WordTagRelationExtractor
@@ -395,6 +396,7 @@ def _term_linking(self, wordtag_res):
                 continue
             else:
                 target_type_can = LABEL_TO_SCHEMA[item["wordtag_label"]]
+            high_priority = False
             for target_type_raw in target_type_can:
                 target_type_ = target_type_raw.split("|")
                 target_src = None
@@ -411,14 +413,17 @@ def _term_linking(self, wordtag_res):
                     term_id = list(filter(lambda d: self._termtree[d].base.startswith(target_src.lower()), term_id))
                     if len(term_id) == 0:
                         continue
-                term_id.sort(
-                    key=lambda d: (
-                        self._termtree[d].termtype == target_type or target_type in self._termtree[d].subtype,
-                        self._termtree[d].term == item["item"],
-                    ),
-                    reverse=True,
-                )
-                item["termid"] = term_id[0]
+
+                term_id.sort(key=lambda d: (self._termtree[
+                    d].termtype == target_type or target_type in self._termtree[
+                        d].subtype, self._termtree[d].term == item["item"]),
+                             reverse=True)
+                if self._termtree[term_id[0]].term == item["item"]:
+                    high_priority = True
+                item["termid"] = term_id[0]
+                if high_priority:
+                    break
+

     def _construct_input_spec(self):
         """
@@ -434,7 +439,11 @@ def _construct_model(self, model):
         """
         Construct the inference model for the predictor.
         """
-        model_instance = ErnieCtmWordtagModel.from_pretrained(self._task_path, num_tag=len(self._tags_to_index))
+
+        model_config = ErnieCtmConfig.from_pretrained(self._task_path, num_labels=len(self._tags_to_index))
+        model_instance = ErnieCtmWordtagModel.from_pretrained(
+            self._task_path, config=model_config)
+
         if self._params_path is not None:
             state_dict = paddle.load(self._params_path)
             model_instance.set_dict(state_dict)
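With this change the taskflow no longer passes num_tag directly to the model; it first builds an ErnieCtmConfig from the task directory and hands the config to from_pretrained. A hedged usage sketch of the same loading path outside the taskflow follows; the "wordtag" name comes from the configuration maps added in the new file below, while the tag count of 265 is an illustrative value, not taken from the diff.

from paddlenlp.transformers import ErnieCtmWordtagModel
from paddlenlp.transformers.ernie_ctm.configuration import ErnieCtmConfig

# Build the config first, overriding the classification head size, then pass the
# config to from_pretrained, mirroring the change in _construct_model above.
model_config = ErnieCtmConfig.from_pretrained("wordtag", num_labels=265)  # 265 is illustrative
model = ErnieCtmWordtagModel.from_pretrained("wordtag", config=model_config)
model.eval()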
paddlenlp/transformers/ernie_ctm/configuration.py (new file)

Lines changed: 144 additions & 0 deletions

@@ -0,0 +1,144 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Ernie-CTM model configuration """
from __future__ import annotations

from paddlenlp.transformers.configuration_utils import PretrainedConfig

ERNIE_CTM_CONFIG = {
    "vocab_size": 23000,
    "embedding_size": 128,
    "num_hidden_layers": 12,
    "num_attention_heads": 12,
    "intermediate_size": 3072,
    "hidden_dropout_prob": 0.1,
    "layer_norm_eps": 1e-12,
    "max_position_embeddings": 512,
    "type_vocab_size": 2,
    "initializer_range": 0.02,
    "pad_token_id": 0,
    "use_content_summary": True,
    "content_summary_index": 1,
    "cls_num": 2,
    "num_prompt_placeholders": 5,
    "prompt_vocab_ids": None
}

ERNIE_CTM_PRETRAINED_INIT_CONFIGURATION = {
    "ernie-ctm": ERNIE_CTM_CONFIG,
    "wordtag": ERNIE_CTM_CONFIG,
    "nptag": ERNIE_CTM_CONFIG
}

ERNIE_CTM_PRETRAINED_RESOURCE_FILES_MAP = {
    "model_state": {
        "ernie-ctm":
        "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_ctm/ernie_ctm_base_pos.pdparams",
        "wordtag":
        "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_ctm/wordtag_pos.pdparams",
        "nptag":
        "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_ctm/nptag.pdparams",
    }
}
class ErnieCtmConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`ErnieCtmModel`]. It is used to instantiate
    an Ernie-CTM model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the Ernie-CTM-base architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation of [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 23000):
            Vocabulary size of the Ernie-CTM model. Defines the number of different tokens that can be represented by
            the `input_ids` passed when calling [`ErnieCtmModel`].
        embedding_size (`int`, *optional*, defaults to 128):
            Dimensionality of the vocabulary embeddings.
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder and pooler.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout probability for the attention probabilities.
        max_position_embeddings (`int`, *optional*, defaults to 512):
            The maximum sequence length that this model might ever be used with. Typically set this to something large.
        type_vocab_size (`int`, *optional*, defaults to 2):
            The vocabulary size of the `token_type_ids` passed when calling [`ErnieCtmModel`].
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        use_content_summary (`bool`, *optional*, defaults to `True`):
            Whether to use the content summary token and content representation for inputs passed into [`ErnieCtmModel`].
        content_summary_index (`int`, *optional*, defaults to 1):
            If `use_content_summary` is set, the position of the content summary token is defined by this argument.
        cls_num (`int`, *optional*, defaults to 2):
            Number of [CLS] tokens in the model.
        num_prompt_placeholders (`int`, *optional*, defaults to 5):
            Maximum length of the prompt answer.
        prompt_vocab_ids (`set`, *optional*, defaults to `None`):
            Prompt vocabulary used in the decoding procedure.
    """
    model_type = "ernie-ctm"
    pretrained_init_configuration = ERNIE_CTM_PRETRAINED_INIT_CONFIGURATION

    def __init__(
        self,
        vocab_size: int = 23000,
        embedding_size: int = 128,
        hidden_size: int = 768,
        num_hidden_layers: int = 12,
        num_attention_heads: int = 12,
        intermediate_size: int = 3072,
        hidden_dropout_prob: float = 0.1,
        attention_probs_dropout_prob: float = 0.1,
        max_position_embeddings: int = 512,
        layer_norm_eps: float = 1e-12,
        type_vocab_size: int = 2,
        initializer_range: float = 0.02,
        use_content_summary: bool = True,
        content_summary_index: int = 1,
        cls_num: int = 2,
        pad_token_id: int = 0,
        num_prompt_placeholders: int = 5,
        prompt_vocab_ids: set = None,
        **kwargs
    ):
        super(ErnieCtmConfig, self).__init__(pad_token_id=pad_token_id, **kwargs)
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.layer_norm_eps = layer_norm_eps
        self.initializer_range = initializer_range
        self.use_content_summary = use_content_summary
        self.content_summary_index = content_summary_index
        self.cls_num = cls_num
        self.num_prompt_placeholders = num_prompt_placeholders
        self.prompt_vocab_ids = prompt_vocab_ids
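
A brief, hedged usage sketch for the configuration class above, assuming ErnieCtmModel accepts the config object as its constructor argument after this refactor; the overridden dropout value is illustrative only.

from paddlenlp.transformers import ErnieCtmModel
from paddlenlp.transformers.ernie_ctm.configuration import ErnieCtmConfig

# Start from the registered "ernie-ctm" defaults and override a single field.
config = ErnieCtmConfig.from_pretrained("ernie-ctm")
config.hidden_dropout_prob = 0.2

# Build a randomly initialized model from the config (no pretrained weights loaded).
model = ErnieCtmModel(config)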
