Skip to content

Commit f152050

Browse files
Author: gongenlei
Fix mt5 config and add auto (#5129)
* Fix mt5 config and add auto
* Update unit test
1 parent 5d053f6 commit f152050

File tree

4 files changed: 9 additions and 97 deletions

paddlenlp/transformers/auto/modeling.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@
100100
("XLNet", "xlnet"),
101101
("XLM", "xlm"),
102102
("GPT", "gpt"),
103+
("MT5", "mt5"),
103104
("T5", "t5"),
104105
("Bert", "bert"),
105106
("Bart", "bart"),

paddlenlp/transformers/mt5/configuration.py

Lines changed: 2 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14+
1415
""" mT5 model configuration"""
1516
from __future__ import annotations
1617

@@ -20,98 +21,7 @@
2021

2122
__all__ = ["MT5_PRETRAINED_INIT_CONFIGURATION", "MT5Config"]
2223

23-
MT5_PRETRAINED_INIT_CONFIGURATION = {
24-
"mt5-small": {
25-
"tie_word_embeddings": False,
26-
"pad_token_id": 0,
27-
"bos_token_id": 0,
28-
"eos_token_id": 1,
29-
"vocab_size": 250112,
30-
"d_model": 512,
31-
"d_kv": 64,
32-
"d_ff": 1024,
33-
"num_layers": 8,
34-
"num_decoder_layers": 8,
35-
"num_heads": 6,
36-
"relative_attention_num_buckets": 32,
37-
"dropout_rate": 0.1,
38-
"layer_norm_epsilon": 1e-06,
39-
"initializer_factor": 1.0,
40-
"feed_forward_proj": "gated-gelu",
41-
},
42-
"mt5-base": {
43-
"tie_word_embeddings": False,
44-
"pad_token_id": 0,
45-
"bos_token_id": 0,
46-
"eos_token_id": 1,
47-
"vocab_size": 250112,
48-
"d_model": 768,
49-
"d_kv": 64,
50-
"d_ff": 2048,
51-
"num_layers": 12,
52-
"num_decoder_layers": 12,
53-
"num_heads": 12,
54-
"relative_attention_num_buckets": 32,
55-
"dropout_rate": 0.1,
56-
"layer_norm_epsilon": 1e-06,
57-
"initializer_factor": 1.0,
58-
"feed_forward_proj": "gated-gelu",
59-
},
60-
"mt5-large": {
61-
"tie_word_embeddings": False,
62-
"pad_token_id": 0,
63-
"bos_token_id": 0,
64-
"eos_token_id": 1,
65-
"vocab_size": 250112,
66-
"d_model": 1024,
67-
"d_kv": 64,
68-
"d_ff": 2816,
69-
"num_layers": 24,
70-
"num_decoder_layers": 24,
71-
"num_heads": 16,
72-
"relative_attention_num_buckets": 32,
73-
"dropout_rate": 0.1,
74-
"layer_norm_epsilon": 1e-06,
75-
"initializer_factor": 1.0,
76-
"feed_forward_proj": "gated-gelu",
77-
},
78-
"mt5-xl": {
79-
"tie_word_embeddings": False,
80-
"pad_token_id": 0,
81-
"bos_token_id": 0,
82-
"eos_token_id": 1,
83-
"vocab_size": 250112,
84-
"d_model": 2048,
85-
"d_kv": 64,
86-
"d_ff": 5120,
87-
"num_layers": 24,
88-
"num_decoder_layers": 24,
89-
"num_heads": 32,
90-
"relative_attention_num_buckets": 32,
91-
"dropout_rate": 0.1,
92-
"layer_norm_epsilon": 1e-06,
93-
"initializer_factor": 1.0,
94-
"feed_forward_proj": "gated-gelu",
95-
},
96-
"mt5-xxl": {
97-
"tie_word_embeddings": False,
98-
"pad_token_id": 0,
99-
"bos_token_id": 0,
100-
"eos_token_id": 1,
101-
"vocab_size": 250112,
102-
"d_model": 4096,
103-
"d_kv": 64,
104-
"d_ff": 10240,
105-
"num_layers": 24,
106-
"num_decoder_layers": 24,
107-
"num_heads": 64,
108-
"relative_attention_num_buckets": 32,
109-
"dropout_rate": 0.1,
110-
"layer_norm_epsilon": 1e-06,
111-
"initializer_factor": 1.0,
112-
"feed_forward_proj": "gated-gelu",
113-
},
114-
}
24+
MT5_PRETRAINED_INIT_CONFIGURATION = {}
11525

11626

11727
class MT5Config(PretrainedConfig):

paddlenlp/transformers/mt5/modeling.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -47,11 +47,11 @@
4747
]
4848

4949
MT5_PRETRAINED_MODEL_ARCHIVE_LIST = [
50-
"mt5-small",
51-
"mt5-base",
52-
"mt5-large",
53-
"mt5-xl",
54-
"mt5-xxl",
50+
"google/mt5-small",
51+
"google/mt5-base",
52+
"google/mt5-large",
53+
"google/mt5-xl",
54+
"google/mt5-xxl",
5555
]
5656

5757
DATA_TYPE_MAP = {

tests/transformers/mt5/test_modeling.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -489,6 +489,7 @@ class MT5ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
489489
test_model_parallel = True
490490
use_test_inputs_embeds = True
491491
is_encoder_decoder = True
492+
use_test_model_name_list = False
492493
# The small MT5 model needs higher percentages for CPU/MP tests
493494
model_split_percents = [0.8, 0.9]
494495

0 commit comments

Comments
 (0)