Skip to content

Commit 846eb3a

Browse files
authored
Add BERT, DistilBERT, and RoBERTa (#269)
Resolves #138, #152, and #229.
1 parent 1fb85a1 commit 846eb3a

17 files changed

+1121
-4
lines changed
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
{
  "model_type": "bert",
  "architectures": [
    "BertForMaskedLM"
  ],
  "pre_weights": [
    {
      "name": "bert.embeddings.position_embeddings.weight"
    },
    {
      "name": "bert.embeddings.token_type_embeddings.weight"
    },
    {
      "name": "bert.embeddings.word_embeddings.weight",
      "is_embed": true
    },
    {
      "name": "bert.embeddings.LayerNorm.bias",
      "aliases": [
        "bert.embeddings.LayerNorm.beta"
      ]
    },
    {
      "name": "bert.embeddings.LayerNorm.weight",
      "aliases": [
        "bert.embeddings.LayerNorm.gamma"
      ]
    },
    {
      "name": "bert.embeddings.position_ids",
      "optional": true,
      "force_dtype": "int64"
    }
  ],
  "post_weights": [
    {
      "name": "bert.pooler.dense.weight"
    },
    {
      "name": "bert.pooler.dense.bias"
    },
    {
      "name": "cls.predictions.bias"
    },
    {
      "name": "cls.predictions.decoder.weight",
      "aliases": [
        "bert.embeddings.word_embeddings.weight"
      ],
      "is_embed": true
    }
  ],
  "num_layers_config_key": "num_hidden_layers",
  "layer_templates": {
    "weights": [
      {
        "name": "bert.encoder.layer.${layer_index}.attention.self.query.weight"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.attention.self.query.bias"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.attention.self.key.weight"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.attention.self.key.bias"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.attention.self.value.weight"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.attention.self.value.bias"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.attention.output.dense.weight"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.attention.output.dense.bias"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.attention.output.LayerNorm.bias",
        "aliases": [
          "bert.encoder.layer.${layer_index}.attention.output.LayerNorm.beta"
        ]
      },
      {
        "name": "bert.encoder.layer.${layer_index}.attention.output.LayerNorm.weight",
        "aliases": [
          "bert.encoder.layer.${layer_index}.attention.output.LayerNorm.gamma"
        ]
      },
      {
        "name": "bert.encoder.layer.${layer_index}.intermediate.dense.weight"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.intermediate.dense.bias"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.output.dense.weight"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.output.dense.bias"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.output.LayerNorm.bias",
        "aliases": [
          "bert.encoder.layer.${layer_index}.output.LayerNorm.beta"
        ]
      },
      {
        "name": "bert.encoder.layer.${layer_index}.output.LayerNorm.weight",
        "aliases": [
          "bert.encoder.layer.${layer_index}.output.LayerNorm.gamma"
        ]
      }
    ]
  }
}
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
{
  "model_type": "bert",
  "architectures": [
    "BertForSequenceClassification",
    "BertForMultipleChoice",
    "BertForTokenClassification"
  ],
  "pre_weights": [
    {
      "name": "bert.embeddings.position_embeddings.weight"
    },
    {
      "name": "bert.embeddings.token_type_embeddings.weight"
    },
    {
      "name": "bert.embeddings.word_embeddings.weight",
      "is_embed": true
    },
    {
      "name": "bert.embeddings.LayerNorm.bias",
      "aliases": [
        "bert.embeddings.LayerNorm.beta"
      ]
    },
    {
      "name": "bert.embeddings.LayerNorm.weight",
      "aliases": [
        "bert.embeddings.LayerNorm.gamma"
      ]
    },
    {
      "name": "bert.embeddings.position_ids",
      "optional": true,
      "force_dtype": "int64"
    }
  ],
  "post_weights": [
    {
      "name": "bert.pooler.dense.weight",
      "optional": true
    },
    {
      "name": "bert.pooler.dense.bias",
      "optional": true
    },
    {
      "name": "classifier.bias"
    },
    {
      "name": "classifier.weight"
    }
  ],
  "num_layers_config_key": "num_hidden_layers",
  "layer_templates": {
    "weights": [
      {
        "name": "bert.encoder.layer.${layer_index}.attention.self.query.weight"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.attention.self.query.bias"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.attention.self.key.weight"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.attention.self.key.bias"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.attention.self.value.weight"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.attention.self.value.bias"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.attention.output.dense.weight"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.attention.output.dense.bias"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.attention.output.LayerNorm.bias",
        "aliases": [
          "bert.encoder.layer.${layer_index}.attention.output.LayerNorm.beta"
        ]
      },
      {
        "name": "bert.encoder.layer.${layer_index}.attention.output.LayerNorm.weight",
        "aliases": [
          "bert.encoder.layer.${layer_index}.attention.output.LayerNorm.gamma"
        ]
      },
      {
        "name": "bert.encoder.layer.${layer_index}.intermediate.dense.weight"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.intermediate.dense.bias"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.output.dense.weight"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.output.dense.bias"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.output.LayerNorm.bias",
        "aliases": [
          "bert.encoder.layer.${layer_index}.output.LayerNorm.beta"
        ]
      },
      {
        "name": "bert.encoder.layer.${layer_index}.output.LayerNorm.weight",
        "aliases": [
          "bert.encoder.layer.${layer_index}.output.LayerNorm.gamma"
        ]
      }
    ]
  }
}
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
{
  "model_type": "bert",
  "architectures": [
    "BertModel"
  ],
  "pre_weights": [
    {
      "name": "bert.embeddings.position_embeddings.weight"
    },
    {
      "name": "bert.embeddings.token_type_embeddings.weight"
    },
    {
      "name": "bert.embeddings.word_embeddings.weight",
      "is_embed": true
    },
    {
      "name": "bert.embeddings.LayerNorm.bias",
      "aliases": [
        "bert.embeddings.LayerNorm.beta"
      ]
    },
    {
      "name": "bert.embeddings.LayerNorm.weight",
      "aliases": [
        "bert.embeddings.LayerNorm.gamma"
      ]
    },
    {
      "name": "bert.embeddings.position_ids",
      "optional": true,
      "force_dtype": "int64"
    }
  ],
  "post_weights": [
    {
      "name": "pooler.dense.weight"
    },
    {
      "name": "pooler.dense.bias"
    }
  ],
  "num_layers_config_key": "num_hidden_layers",
  "layer_templates": {
    "weights": [
      {
        "name": "bert.encoder.layer.${layer_index}.attention.self.query.weight"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.attention.self.query.bias"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.attention.self.key.weight"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.attention.self.key.bias"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.attention.self.value.weight"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.attention.self.value.bias"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.attention.output.dense.weight"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.attention.output.dense.bias"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.attention.output.LayerNorm.bias",
        "aliases": [
          "bert.encoder.layer.${layer_index}.attention.output.LayerNorm.beta"
        ]
      },
      {
        "name": "bert.encoder.layer.${layer_index}.attention.output.LayerNorm.weight",
        "aliases": [
          "bert.encoder.layer.${layer_index}.attention.output.LayerNorm.gamma"
        ]
      },
      {
        "name": "bert.encoder.layer.${layer_index}.intermediate.dense.weight"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.intermediate.dense.bias"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.output.dense.weight"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.output.dense.bias"
      },
      {
        "name": "bert.encoder.layer.${layer_index}.output.LayerNorm.bias",
        "aliases": [
          "bert.encoder.layer.${layer_index}.output.LayerNorm.beta"
        ]
      },
      {
        "name": "bert.encoder.layer.${layer_index}.output.LayerNorm.weight",
        "aliases": [
          "bert.encoder.layer.${layer_index}.output.LayerNorm.gamma"
        ]
      }
    ]
  }
}

0 commit comments

Comments
 (0)