|
11 | 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 | 12 | # See the License for the specific language governing permissions and
|
13 | 13 | # limitations under the License.
|
| 14 | + |
14 | 15 | """ mT5 model configuration"""
|
15 | 16 | from __future__ import annotations
|
16 | 17 |
|
|
20 | 21 |
|
21 | 22 | __all__ = ["MT5_PRETRAINED_INIT_CONFIGURATION", "MT5Config"]
|
22 | 23 |
|
23 |
| -MT5_PRETRAINED_INIT_CONFIGURATION = { |
24 |
| - "mt5-small": { |
25 |
| - "tie_word_embeddings": False, |
26 |
| - "pad_token_id": 0, |
27 |
| - "bos_token_id": 0, |
28 |
| - "eos_token_id": 1, |
29 |
| - "vocab_size": 250112, |
30 |
| - "d_model": 512, |
31 |
| - "d_kv": 64, |
32 |
| - "d_ff": 1024, |
33 |
| - "num_layers": 8, |
34 |
| - "num_decoder_layers": 8, |
35 |
| - "num_heads": 6, |
36 |
| - "relative_attention_num_buckets": 32, |
37 |
| - "dropout_rate": 0.1, |
38 |
| - "layer_norm_epsilon": 1e-06, |
39 |
| - "initializer_factor": 1.0, |
40 |
| - "feed_forward_proj": "gated-gelu", |
41 |
| - }, |
42 |
| - "mt5-base": { |
43 |
| - "tie_word_embeddings": False, |
44 |
| - "pad_token_id": 0, |
45 |
| - "bos_token_id": 0, |
46 |
| - "eos_token_id": 1, |
47 |
| - "vocab_size": 250112, |
48 |
| - "d_model": 768, |
49 |
| - "d_kv": 64, |
50 |
| - "d_ff": 2048, |
51 |
| - "num_layers": 12, |
52 |
| - "num_decoder_layers": 12, |
53 |
| - "num_heads": 12, |
54 |
| - "relative_attention_num_buckets": 32, |
55 |
| - "dropout_rate": 0.1, |
56 |
| - "layer_norm_epsilon": 1e-06, |
57 |
| - "initializer_factor": 1.0, |
58 |
| - "feed_forward_proj": "gated-gelu", |
59 |
| - }, |
60 |
| - "mt5-large": { |
61 |
| - "tie_word_embeddings": False, |
62 |
| - "pad_token_id": 0, |
63 |
| - "bos_token_id": 0, |
64 |
| - "eos_token_id": 1, |
65 |
| - "vocab_size": 250112, |
66 |
| - "d_model": 1024, |
67 |
| - "d_kv": 64, |
68 |
| - "d_ff": 2816, |
69 |
| - "num_layers": 24, |
70 |
| - "num_decoder_layers": 24, |
71 |
| - "num_heads": 16, |
72 |
| - "relative_attention_num_buckets": 32, |
73 |
| - "dropout_rate": 0.1, |
74 |
| - "layer_norm_epsilon": 1e-06, |
75 |
| - "initializer_factor": 1.0, |
76 |
| - "feed_forward_proj": "gated-gelu", |
77 |
| - }, |
78 |
| - "mt5-xl": { |
79 |
| - "tie_word_embeddings": False, |
80 |
| - "pad_token_id": 0, |
81 |
| - "bos_token_id": 0, |
82 |
| - "eos_token_id": 1, |
83 |
| - "vocab_size": 250112, |
84 |
| - "d_model": 2048, |
85 |
| - "d_kv": 64, |
86 |
| - "d_ff": 5120, |
87 |
| - "num_layers": 24, |
88 |
| - "num_decoder_layers": 24, |
89 |
| - "num_heads": 32, |
90 |
| - "relative_attention_num_buckets": 32, |
91 |
| - "dropout_rate": 0.1, |
92 |
| - "layer_norm_epsilon": 1e-06, |
93 |
| - "initializer_factor": 1.0, |
94 |
| - "feed_forward_proj": "gated-gelu", |
95 |
| - }, |
96 |
| - "mt5-xxl": { |
97 |
| - "tie_word_embeddings": False, |
98 |
| - "pad_token_id": 0, |
99 |
| - "bos_token_id": 0, |
100 |
| - "eos_token_id": 1, |
101 |
| - "vocab_size": 250112, |
102 |
| - "d_model": 4096, |
103 |
| - "d_kv": 64, |
104 |
| - "d_ff": 10240, |
105 |
| - "num_layers": 24, |
106 |
| - "num_decoder_layers": 24, |
107 |
| - "num_heads": 64, |
108 |
| - "relative_attention_num_buckets": 32, |
109 |
| - "dropout_rate": 0.1, |
110 |
| - "layer_norm_epsilon": 1e-06, |
111 |
| - "initializer_factor": 1.0, |
112 |
| - "feed_forward_proj": "gated-gelu", |
113 |
| - }, |
114 |
| -} |
| 24 | +MT5_PRETRAINED_INIT_CONFIGURATION = {} |
115 | 25 |
|
116 | 26 |
|
117 | 27 | class MT5Config(PretrainedConfig):
|
|
0 commit comments