@@ -77,9 +77,9 @@ def __init__(self, config):
         self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
         self.all_head_size = self.num_attention_heads * self.attention_head_size
         self.in_proj = torch.nn.Linear(config.hidden_size, self.all_head_size * 3, bias=False)
-        # Looks like params below are never updated and const, so removing them
-        # self.q_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
-        # self.v_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
+        # ONNX graph builder thinks the params below are not used for loss calculation
+        self.q_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
+        self.v_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
         self.pos_att_type = [x.strip() for x in getattr(config, 'pos_att_type', 'none').lower().split('|')]  # c2p|p2c

         self.relative_attention = getattr(config, 'relative_attention', False)
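Note on the hunk above: moving q_bias and v_bias back into __init__ as nn.Parameter registers them in the module's state_dict and in the exported graph, instead of rebuilding fresh zero tensors on every forward pass (the matching forward-side change is in the last hunk below). A minimal sketch of that pattern with a hypothetical toy module, not the repo's attention class:

```python
# Minimal sketch (toy module, not the repo's class): biases registered in
# __init__ are tracked by state_dict()/optimizers and appear as initializers
# when the module is exported, unlike tensors created inside forward().
import torch

class ToyBiasedProjection(torch.nn.Module):
    def __init__(self, all_head_size):
        super().__init__()
        self.q_bias = torch.nn.Parameter(torch.zeros(all_head_size, dtype=torch.float))
        self.v_bias = torch.nn.Parameter(torch.zeros(all_head_size, dtype=torch.float))

    def forward(self, query_layer, value_layer):
        # Broadcast the (all_head_size,) biases over the leading dims.
        return query_layer + self.q_bias, value_layer + self.v_bias

m = ToyBiasedProjection(all_head_size=64)
print(sorted(m.state_dict().keys()))  # ['q_bias', 'v_bias']
```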
@@ -93,14 +93,14 @@ def __init__(self, config):
             self.max_relative_positions = getattr(config, 'max_relative_positions', -1)
             if self.max_relative_positions < 1:
                 self.max_relative_positions = config.max_position_embeddings
-            self.pos_dropout = StableDropout(config.hidden_dropout_prob)
+            self.pos_dropout = StableDropout(config.hidden_dropout_prob) if config.use_xdropout else torch.nn.Dropout(config.hidden_dropout_prob)

             if 'c2p' in self.pos_att_type or 'p2p' in self.pos_att_type:
                 self.pos_proj = torch.nn.Linear(config.hidden_size, self.all_head_size, bias=False)
             if 'p2c' in self.pos_att_type or 'p2p' in self.pos_att_type:
                 self.pos_q_proj = torch.nn.Linear(config.hidden_size, self.all_head_size)

-        self.dropout = StableDropout(config.attention_probs_dropout_prob)
+        self.dropout = StableDropout(config.attention_probs_dropout_prob) if config.use_xdropout else torch.nn.Dropout(config.attention_probs_dropout_prob)

     def transpose_for_scores(self, x):
         new_x_shape = x.size()[:-1] + (self.num_attention_heads, -1)
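Both dropout changes in this hunk apply the same toggle: use DeBERTa's StableDropout when config.use_xdropout is set, otherwise fall back to the stock torch.nn.Dropout. The standalone sketch below stubs StableDropout with a plain nn.Dropout subclass, since the real class lives elsewhere in the repo:

```python
# Minimal sketch of the selection pattern above. `StableDropout` is stubbed
# here so the snippet runs standalone; in the repo it is DeBERTa's custom
# dropout module.
import torch

class StableDropout(torch.nn.Dropout):  # stand-in for the real implementation
    pass

def build_dropout(p: float, use_xdropout: bool) -> torch.nn.Module:
    """Mirror the diff: custom dropout only when the config flag asks for it."""
    return StableDropout(p) if use_xdropout else torch.nn.Dropout(p)

print(type(build_dropout(0.1, use_xdropout=True)).__name__)   # StableDropout
print(type(build_dropout(0.1, use_xdropout=False)).__name__)  # Dropout
```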
@@ -149,10 +149,8 @@ def linear(w,b,x):
             k,v = [linear(qkvw[i], qkvb[i], hidden_states) for i in range(1,3)]
         query_layer, key_layer, value_layer = [self.transpose_for_scores(x) for x in [q,k,v]]

-        q_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
-        v_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
-        query_layer += self.transpose_for_scores(q_bias.unsqueeze(0).unsqueeze(0))
-        value_layer += self.transpose_for_scores(v_bias.unsqueeze(0).unsqueeze(0))
+        query_layer += self.transpose_for_scores(self.q_bias.unsqueeze(0).unsqueeze(0))
+        value_layer += self.transpose_for_scores(self.v_bias.unsqueeze(0).unsqueeze(0))

         rel_att = None
         # Take the dot product between "query" and "key" to get the raw attention scores.
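With forward now reading the registered self.q_bias / self.v_bias, unsqueeze(0).unsqueeze(0) followed by transpose_for_scores turns the flat (all_head_size,) bias into a (1, num_heads, 1, head_size) tensor that broadcasts over batch and sequence when added to query_layer / value_layer. A quick shape check under that assumption (the view/permute body of transpose_for_scores is not shown in this diff and is assumed here):

```python
# Shape check for the bias broadcast above. transpose_for_scores follows the
# first line shown in this file; the view + permute part is an assumption.
import torch

num_attention_heads, head_size = 12, 64
all_head_size = num_attention_heads * head_size

def transpose_for_scores(x):
    new_x_shape = x.size()[:-1] + (num_attention_heads, -1)
    return x.view(*new_x_shape).permute(0, 2, 1, 3)

q_bias = torch.zeros(all_head_size)
query_layer = torch.randn(2, num_attention_heads, 5, head_size)  # (batch, heads, seq, head_size)

bias = transpose_for_scores(q_bias.unsqueeze(0).unsqueeze(0))
print(bias.shape)                  # torch.Size([1, 12, 1, 64])
print((query_layer + bias).shape)  # torch.Size([2, 12, 5, 64])
```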