Commit 813e5c9

wj-Mcat and guoshengCS authored
[UnitTest] Add ernie gram unittest (#3059)
* add ernie gram unittest
* update ernie-gram unittest
* update local branch
* update ernie-gram
* update model_name

Co-authored-by: Guo Sheng <[email protected]>
1 parent 22bd3aa commit 813e5c9

5 files changed: +506 −1 lines changed

paddlenlp/transformers/ernie/tokenizer.py

Lines changed: 16 additions & 0 deletions
@@ -624,6 +624,14 @@ def get_special_tokens_mask(self,
                [0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

+    def get_vocab(self):
+        vocab = {
+            self.convert_ids_to_tokens(i): i
+            for i in range(self.vocab_size)
+        }
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+

class ErnieTinyTokenizer(PretrainedTokenizer):
    r"""
@@ -984,3 +992,11 @@ def get_special_tokens_mask(self,
            return [1] + ([0] * len(token_ids_0)) + [1] + (
                [0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]
+
+    def get_vocab(self):
+        vocab = {
+            self.convert_ids_to_tokens(i): i
+            for i in range(self.vocab_size)
+        }
+        vocab.update(self.added_tokens_encoder)
+        return vocab
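
The added get_vocab simply inverts the id-to-token mapping over vocab_size and merges in added_tokens_encoder. A minimal sanity check, not part of this commit (it assumes the standard "ernie-1.0" vocab can be fetched with from_pretrained):

from paddlenlp.transformers import ErnieTokenizer

# Sketch only: exercise the get_vocab() method added above.
tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
vocab = tokenizer.get_vocab()

# Every base token is present, and any added tokens are merged in on top.
assert len(vocab) >= tokenizer.vocab_size
assert vocab[tokenizer.cls_token] == tokenizer.convert_tokens_to_ids(tokenizer.cls_token)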

paddlenlp/transformers/ernie_gram/tokenizer.py

Lines changed: 6 additions & 1 deletion
@@ -89,6 +89,10 @@ class ErnieGramTokenizer(ErnieTokenizer):
            "do_lower_case": True
        },
    }
+    max_model_input_sizes = {
+        "ernie-gram-zh": 512,
+        "ernie-gram-zh-finetuned-dureader-robust": 512,
+    }

    def __init__(self,
                 vocab_file,
@@ -105,4 +109,5 @@ def __init__(self,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
-            mask_token=mask_token)
+            mask_token=mask_token,
+            **kwargs)
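
Two effects of this change, shown below as a hedged sketch (not part of the commit; it assumes the "ernie-gram-zh" vocab is downloadable via from_pretrained): the new max_model_input_sizes class attribute records the per-checkpoint sequence limit, and the **kwargs pass-through forwards extra keyword arguments to the ErnieTokenizer base __init__ instead of dropping them.

from paddlenlp.transformers import ErnieGramTokenizer

# The new class attribute records the longest input each checkpoint supports.
assert ErnieGramTokenizer.max_model_input_sizes["ernie-gram-zh"] == 512

# Sketch: extra init arguments saved alongside a checkpoint (or passed explicitly)
# now reach the base tokenizer through **kwargs.
tokenizer = ErnieGramTokenizer.from_pretrained("ernie-gram-zh")
print(tokenizer("百度飞桨")["input_ids"])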

tests/transformers/ernie_gram/__init__.py

Whitespace-only changes.

Lines changed: 262 additions & 0 deletions
@@ -0,0 +1,262 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
from typing import Optional, Tuple, Dict, Any
import paddle
from paddle import Tensor

from dataclasses import dataclass, asdict, fields, Field
from paddlenlp.transformers import (ErnieGramModel, ErnieGramPretrainedModel,
                                    ErnieGramForSequenceClassification,
                                    ErnieGramForTokenClassification,
                                    ErnieGramForQuestionAnswering)

from ..test_modeling_common import (ids_tensor, floats_tensor,
                                    random_attention_mask, ModelTesterMixin)
from ...testing_utils import slow


@dataclass
class ErnieGramTestModelConfig:
    """ernie-gram model config which keep consist with pretrained_init_configuration sub fields
    """
    attention_probs_dropout_prob: float = 0.1
    emb_size: int = 768
    hidden_act: str = "gelu"
    hidden_dropout_prob: float = 0.1
    hidden_size: int = 768
    initializer_range: float = 0.02
    max_position_embeddings: int = 512
    num_attention_heads: int = 12
    num_hidden_layers: int = 12
    type_vocab_size: int = 2
    vocab_size: int = 1801

    @property
    def model_kwargs(self) -> dict:
        """get the model kwargs configuration to init the model"""
        model_config_fields: Tuple[Field,
                                   ...] = fields(ErnieGramTestModelConfig)
        return {
            field.name: getattr(self, field.name)
            for field in model_config_fields
        }


@dataclass
class ErnieGramTestConfig(ErnieGramTestModelConfig):
    """all of ErnieGram Test configuration

    """
    batch_size: int = 2
    seq_length: int = 7

    is_training: bool = False
    use_token_type_ids: bool = True
    use_attention_mask: bool = True

    # used for sequence classification
    num_classes: int = 3

    test_resize_embeddings: bool = False


class ErnieGramModelTester:
    """Base ErnieGram Model tester which can test:
    """

    def __init__(self, parent, config: Optional[ErnieGramTestConfig] = None):
        self.parent = parent
        self.config: ErnieGramTestConfig = config or ErnieGramTestConfig()

        self.is_training = self.config.is_training

    def prepare_config_and_inputs(
            self) -> Tuple[Dict[str, Any], Tensor, Tensor, Tensor]:
        config = self.config
        input_ids = ids_tensor([config.batch_size, config.seq_length],
                               config.vocab_size)

        attention_mask = None
        if config.use_attention_mask:
            attention_mask = random_attention_mask(
                [config.batch_size, config.seq_length])

        token_type_ids = None
        if config.use_token_type_ids:
            token_type_ids = paddle.zeros_like(input_ids)

        return config.model_kwargs, input_ids, token_type_ids, attention_mask

    def prepare_config_and_inputs_for_common(self):
        config, input_ids, token_type_ids, attention_mask = self.prepare_config_and_inputs(
        )
        inputs_dict = {
            "input_ids": input_ids,
            "token_type_ids": token_type_ids,
            "attention_mask": attention_mask,
        }
        return config, inputs_dict

    def create_and_check_model(self, config: Dict[str, Any], input_ids: Tensor,
                               token_type_ids: Tensor, attention_mask: Tensor):
        model = ErnieGramModel(**config)
        model.eval()

        result = model(input_ids,
                       token_type_ids=token_type_ids,
                       attention_mask=attention_mask)
        self.parent.assertEqual(result[0].shape, [
            self.config.batch_size, self.config.seq_length,
            self.config.hidden_size
        ])
        self.parent.assertEqual(
            result[1].shape, [self.config.batch_size, self.config.hidden_size])

    def create_and_check_for_sequence_classification(self, config,
                                                     input_ids: Tensor,
                                                     token_type_ids: Tensor,
                                                     attention_mask: Tensor):
        model = ErnieGramForSequenceClassification(
            ErnieGramModel(**config), num_classes=self.config.num_classes)
        model.eval()
        result = model(input_ids,
                       token_type_ids=token_type_ids,
                       attention_mask=attention_mask)
        self.parent.assertEqual(
            result.shape, [self.config.batch_size, self.config.num_classes])

    def create_and_check_for_question_answering(self, config, input_ids: Tensor,
                                                token_type_ids: Tensor,
                                                attention_mask: Tensor):
        model = ErnieGramForQuestionAnswering(ErnieGramModel(**config))
        model.eval()
        result = model(input_ids,
                       token_type_ids=token_type_ids,
                       attention_mask=attention_mask)
        self.parent.assertEqual(result.shape, [
            self.config.batch_size, self.config.seq_length,
            self.config.num_classes
        ])

    def create_and_check_for_token_classification(self, config,
                                                  input_ids: Tensor,
                                                  token_type_ids: Tensor,
                                                  attention_mask: Tensor):
        model = ErnieGramForTokenClassification(
            ErnieGramModel(**config), num_classes=self.config.num_classes)
        model.eval()
        result = model(input_ids,
                       token_type_ids=token_type_ids,
                       attention_mask=attention_mask)
        self.parent.assertEqual(result.shape, [
            self.config.batch_size, self.config.seq_length,
            self.config.num_classes
        ])

    def get_config(self) -> dict:
        """get the base model kwargs

        Returns:
            dict: the values of kwargs
        """
        return self.config.model_kwargs


class ErnieGramModelTest(ModelTesterMixin, unittest.TestCase):
    base_model_class = ErnieGramModel

    all_model_classes = (ErnieGramModel, ErnieGramForSequenceClassification,
                         ErnieGramForTokenClassification,
                         ErnieGramForQuestionAnswering)

    def setUp(self):
        self.model_tester = ErnieGramModelTester(self)
        self.test_resize_embeddings = self.model_tester.config.test_resize_embeddings

    def get_config():
        pass

    def test_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    def test_for_sequence_classification(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_for_sequence_classification(
            *config_and_inputs)

    def test_for_token_classification(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_for_token_classification(
            *config_and_inputs)

    def test_for_question_answering(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_for_token_classification(
            *config_and_inputs)

    @slow
    def test_model_from_pretrained(self):
        for model_name in list(
                ErnieGramPretrainedModel.pretrained_init_configuration)[:1]:
            model = ErnieGramModel.from_pretrained(model_name)
            self.assertIsNotNone(model)


class ErnieGramModelIntegrationTest(unittest.TestCase):

    @slow
    def test_inference_no_attention(self):
        model = ErnieGramModel.from_pretrained("ernie-gram-zh")
        model.eval()
        input_ids = paddle.to_tensor(
            [[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]])
        with paddle.no_grad():
            output = model(input_ids)[0]
        expected_shape = [1, 11, 768]
        self.assertEqual(output.shape, expected_shape)

        expected_slice = paddle.to_tensor(
            [[[-0.43569842, -1.50805628, -2.24448967],
              [-0.12123521, -1.35024536, -1.76512492],
              [-0.14853711, -1.13618660, -2.87098265]]])
        self.assertTrue(
            paddle.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-5))

    @slow
    def test_inference_with_attention(self):
        model = ErnieGramModel.from_pretrained(
            "ernie-gram-zh-finetuned-dureader-robust")
        model.eval()
        input_ids = paddle.to_tensor(
            [[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]])
        attention_mask = paddle.to_tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
        with paddle.no_grad():
            output = model(input_ids, attention_mask=attention_mask)[0]
        expected_shape = [1, 11, 768]
        self.assertEqual(output.shape, expected_shape)

        expected_slice = paddle.to_tensor(
            [[[0.37543082, -2.94639230, -2.04799986],
              [0.14168003, -2.02873731, -2.34919119],
              [0.70280838, -2.40280604, -1.93488157]]])
        self.assertTrue(
            paddle.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4))


if __name__ == "__main__":
    unittest.main()
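
The two @slow integration tests above pin a 3×3 slice of the sequence output for a fixed input. A sketch of how such a reference slice can be regenerated, assuming the "ernie-gram-zh" weights are downloadable (it simply reuses the calls from test_inference_no_attention):

import paddle
from paddlenlp.transformers import ErnieGramModel

# Sketch, not part of the commit: recompute the expected_slice checked above
# (tokens 1-3, hidden dimensions 1-3 of the sequence output).
model = ErnieGramModel.from_pretrained("ernie-gram-zh")
model.eval()
input_ids = paddle.to_tensor(
    [[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]])
with paddle.no_grad():
    sequence_output = model(input_ids)[0]      # shape [1, 11, 768]
print(sequence_output[:, 1:4, 1:4].numpy())    # the 3x3 slice pinned by the test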
