
Commit e4793b8

Fix Dropout model regression issue
1 parent d2fa9fd commit e4793b8

File tree: 8 files changed (+19, -22 lines)

DeBERTa/apps/multi_choice.py (1 addition, 1 deletion)

@@ -27,7 +27,7 @@ def __init__(self, config, num_labels = 2, drop_out=None, **kwargs):
     self.num_labels = num_labels
     self.classifier = nn.Linear(config.hidden_size, 1)
     drop_out = config.hidden_dropout_prob if drop_out is None else drop_out
-    self.dropout = StableDropout(drop_out)
+    self.dropout = StableDropout(drop_out) if config.use_xdropout else nn.Dropout(drop_out)
     self.apply(self.init_weights)

   def forward(self, input_ids, type_ids=None, input_mask=None, labels=None, position_ids=None, **kwargs):
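The same one-line toggle appears in ner.py, sequence_classification.py, bert.py, disentangled_attention.py, and pooling.py below: each dropout layer is now chosen at construction time from config.use_xdropout. A minimal, self-contained sketch of the pattern (ClassifierHead and the simplified StableDropout here are illustrative stand-ins, not the DeBERTa classes):

import torch
import torch.nn as nn

class StableDropout(nn.Module):
    """Simplified stand-in for DeBERTa's StableDropout (the real one uses a custom autograd op)."""
    def __init__(self, drop_prob):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        if self.training and self.drop_prob > 0:
            return nn.functional.dropout(x, p=self.drop_prob)
        return x

class ClassifierHead(nn.Module):
    """Illustrative head mirroring the construction-time toggle in the diff above."""
    def __init__(self, hidden_size, num_labels, drop_out, use_xdropout=True):
        super().__init__()
        self.classifier = nn.Linear(hidden_size, num_labels)
        # Same conditional as the commit: custom dropout when use_xdropout, plain nn.Dropout otherwise.
        self.dropout = StableDropout(drop_out) if use_xdropout else nn.Dropout(drop_out)

    def forward(self, hidden_states):
        return self.classifier(self.dropout(hidden_states))

head = ClassifierHead(hidden_size=768, num_labels=2, drop_out=0.1, use_xdropout=False)
print(head(torch.randn(4, 768)).shape)  # torch.Size([4, 2])

Existing configs keep the StableDropout path because use_xdropout defaults to True (see the config.py change below).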

DeBERTa/apps/ner.py (1 addition, 1 deletion)

@@ -27,7 +27,7 @@ def __init__(self, config, num_labels = 2, drop_out=None, **kwargs):
     self.proj = nn.Linear(config.hidden_size, config.hidden_size)
     self.classifier = nn.Linear(config.hidden_size, self.num_labels)
     drop_out = config.hidden_dropout_prob if drop_out is None else drop_out
-    self.dropout = StableDropout(drop_out)
+    self.dropout = StableDropout(drop_out) if config.use_xdropout else nn.Dropout(drop_out)
     self.apply(self.init_weights)

   def forward(self, input_ids, type_ids=None, input_mask=None, labels=None, position_ids=None, **kwargs):

DeBERTa/apps/sequence_classification.py (1 addition, 1 deletion)

@@ -35,7 +35,7 @@ def __init__(self, config, num_labels=2, drop_out=None, pre_trained=None):

     self.classifier = torch.nn.Linear(output_dim, num_labels)
     drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
-    self.dropout = StableDropout(drop_out)
+    self.dropout = StableDropout(drop_out) if config.use_xdropout else torch.nn.Dropout(drop_out)
     self.apply(self.init_weights)
     self.bert.apply_state()

DeBERTa/deberta/bert.py (3 additions, 3 deletions)

@@ -63,7 +63,7 @@ def __init__(self, config):
     super().__init__()
     self.dense = nn.Linear(config.hidden_size, config.hidden_size)
     self.LayerNorm = BertLayerNorm(config.hidden_size, config.layer_norm_eps)
-    self.dropout = StableDropout(config.hidden_dropout_prob)
+    self.dropout = StableDropout(config.hidden_dropout_prob) if config.use_xdropout else nn.Dropout(config.hidden_dropout_prob)
     self.config = config

   def forward(self, hidden_states, input_states, mask=None):

@@ -110,7 +110,7 @@ def __init__(self, config):
     super(BertOutput, self).__init__()
     self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
     self.LayerNorm = BertLayerNorm(config.hidden_size, config.layer_norm_eps)
-    self.dropout = StableDropout(config.hidden_dropout_prob)
+    self.dropout = StableDropout(config.hidden_dropout_prob) if config.use_xdropout else nn.Dropout(config.hidden_dropout_prob)
     self.config = config

   def forward(self, hidden_states, input_states, mask=None):

@@ -229,7 +229,7 @@ def __init__(self, config):
     if self.embedding_size != config.hidden_size:
       self.embed_proj = nn.Linear(self.embedding_size, config.hidden_size, bias=False)
     self.LayerNorm = BertLayerNorm(config.hidden_size, config.layer_norm_eps)
-    self.dropout = StableDropout(config.hidden_dropout_prob)
+    self.dropout = StableDropout(config.hidden_dropout_prob) if config.use_xdropout else nn.Dropout(config.hidden_dropout_prob)
     self.output_to_half = False
     self.config = config

DeBERTa/deberta/config.py (2 additions, 0 deletions)

@@ -15,6 +15,8 @@ def from_dict(cls, json_object):
       if isinstance(value, dict):
         value = AbsModelConfig.from_dict(value)
       config.__dict__[key] = value
+    config.use_xdropout = True
+    config.use_xsoftmax = True
     return config

   @classmethod
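Because the flags are set inside from_dict, every config deserialized from JSON carries them, and the default of True keeps the existing StableDropout path; code that needs the stock PyTorch ops can flip them after loading. A hedged usage sketch, assuming the package layout shown in the file paths above and a config JSON that lacks the new keys:

# Usage sketch (the import path is assumed from the repo layout, not shown in this diff).
from DeBERTa.deberta.config import AbsModelConfig

config = AbsModelConfig.from_dict({"hidden_size": 768, "hidden_dropout_prob": 0.1})
assert config.use_xdropout and config.use_xsoftmax   # defaults injected by this patch

# Switch to stock torch.nn.Dropout before building the model, e.g. for ONNX export.
config.use_xdropout = False
# use_xsoftmax presumably gates the XSoftmax op in the same way elsewhere in the
# codebase; its consumer is not part of this diff.
config.use_xsoftmax = False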

DeBERTa/deberta/disentangled_attention.py (7 additions, 9 deletions)

@@ -77,9 +77,9 @@ def __init__(self, config):
     self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
     self.all_head_size = self.num_attention_heads * self.attention_head_size
     self.in_proj = torch.nn.Linear(config.hidden_size, self.all_head_size*3, bias=False)
-    # Looks like params below are never updated and const, so removing them
-    #self.q_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
-    #self.v_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
+    # ONNX graph builder thinks params below are not used for loss calculation
+    self.q_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
+    self.v_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
     self.pos_att_type = [x.strip() for x in getattr(config, 'pos_att_type', 'none').lower().split('|')] # c2p|p2c

     self.relative_attention = getattr(config, 'relative_attention', False)

@@ -93,14 +93,14 @@ def __init__(self, config):
     self.max_relative_positions = getattr(config, 'max_relative_positions', -1)
     if self.max_relative_positions <1:
       self.max_relative_positions = config.max_position_embeddings
-    self.pos_dropout = StableDropout(config.hidden_dropout_prob)
+    self.pos_dropout = StableDropout(config.hidden_dropout_prob) if config.use_xdropout else torch.nn.Dropout(config.hidden_dropout_prob)

     if 'c2p' in self.pos_att_type or 'p2p' in self.pos_att_type:
       self.pos_proj = torch.nn.Linear(config.hidden_size, self.all_head_size, bias=False)
     if 'p2c' in self.pos_att_type or 'p2p' in self.pos_att_type:
       self.pos_q_proj = torch.nn.Linear(config.hidden_size, self.all_head_size)

-    self.dropout = StableDropout(config.attention_probs_dropout_prob)
+    self.dropout = StableDropout(config.attention_probs_dropout_prob) if config.use_xdropout else torch.nn.Dropout(config.attention_probs_dropout_prob)

   def transpose_for_scores(self, x):
     new_x_shape = x.size()[:-1] + (self.num_attention_heads, -1)

@@ -149,10 +149,8 @@ def linear(w,b,x):
     k,v = [linear(qkvw[i], qkvb[i], hidden_states) for i in range(1,3)]
     query_layer, key_layer, value_layer = [self.transpose_for_scores(x) for x in [q,k,v]]

-    q_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
-    v_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
-    query_layer += self.transpose_for_scores(q_bias.unsqueeze(0).unsqueeze(0))
-    value_layer += self.transpose_for_scores(v_bias.unsqueeze(0).unsqueeze(0))
+    query_layer += self.transpose_for_scores(self.q_bias.unsqueeze(0).unsqueeze(0))
+    value_layer += self.transpose_for_scores(self.v_bias.unsqueeze(0).unsqueeze(0))

     rel_att = None
     # Take the dot product between "query" and "key" to get the raw attention scores.
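The other half of the fix moves q_bias and v_bias back into __init__ as registered parameters instead of recreating them in forward. A tensor wrapped in torch.nn.Parameter inside forward is never registered on the module: it receives no gradient updates, is re-zeroed on every call, is not saved in state_dict, and shows up at best as a constant in an exported graph. A small sketch of the difference, using hypothetical module names rather than the DeBERTa attention class:

import torch
import torch.nn as nn

class BiasInForward(nn.Module):
    """Anti-pattern removed by this commit: the bias is rebuilt on every forward call."""
    def forward(self, x):
        bias = nn.Parameter(torch.zeros(x.shape[-1]))  # not registered, never trained
        return x + bias

class BiasInInit(nn.Module):
    """Pattern restored by this commit: the bias is registered once in __init__."""
    def __init__(self, size):
        super().__init__()
        self.bias = nn.Parameter(torch.zeros(size))    # appears in parameters() and state_dict()
    def forward(self, x):
        return x + self.bias

print(len(list(BiasInForward().parameters())))  # 0 -> an optimizer never sees the bias
print(len(list(BiasInInit(8).parameters())))    # 1 -> trained and exported with the model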

DeBERTa/deberta/ops.py (1 addition, 6 deletions)

@@ -6,7 +6,6 @@
 # Author: penhe@microsoft.com
 # Date: 01/15/2020
 #
-
 import math
 from packaging import version
 import torch

@@ -115,11 +114,7 @@ def backward(ctx, grad_output):
     else:
       return grad_output, None

-class StableDropout(torch.nn.Dropout):
-  def __init__(self, drop_prob):
-    super().__init__()
-
-class StableDropout1(torch.nn.Module):
+class StableDropout(torch.nn.Module):
   """ Optimized dropout module for stabilizing the training

   Args:
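The deleted StableDropout(torch.nn.Dropout) shim appears to be the regression named in the commit title: its __init__ accepted drop_prob but called super().__init__() with no arguments, so every layer built through it ran at nn.Dropout's default p=0.5 instead of the configured probability. A minimal reproduction of that failure mode (the class name is reused here only to show the bug):

import torch

class BrokenStableDropout(torch.nn.Dropout):
    """Shape of the removed shim: drop_prob is accepted but silently discarded."""
    def __init__(self, drop_prob):
        super().__init__()  # nn.Dropout() defaults to p=0.5

print(BrokenStableDropout(0.1).p)  # 0.5, not the 0.1 the caller asked for

Renaming StableDropout1 back to StableDropout restores the nn.Module implementation that honors the configured probability.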

DeBERTa/deberta/pooling.py (3 additions, 1 deletion)

@@ -58,6 +58,8 @@ def __init__(self, config=None):
     self.hidden_size = 768
     self.dropout = 0
     self.hidden_act = 'gelu'
+    self.use_xdropout = True
+    self.use_xsoftmax = True
     if config:
       pool_config = getattr(config, 'pooling', config)
       if isinstance(pool_config, dict):

@@ -70,7 +72,7 @@ class ContextPooler(nn.Module):
   def __init__(self, config):
     super().__init__()
     self.dense = nn.Linear(config.hidden_size, config.hidden_size)
-    self.dropout = StableDropout(config.dropout)
+    self.dropout = StableDropout(config.dropout) if config.use_xdropout else nn.Dropout(config.dropout)
     self.config = config

   def forward(self, hidden_states, mask = None):
