@@ -77,9 +77,9 @@ def __init__(self, config):
         self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
         self.all_head_size = self.num_attention_heads * self.attention_head_size
         self.in_proj = torch.nn.Linear(config.hidden_size, self.all_head_size * 3, bias=False)
-        # Looks like params below are never updated and const, so removing them
-        # self.q_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
-        # self.v_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
+        # ONNX graph builder thinks the params below are not used for loss calculation
+        self.q_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
+        self.v_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
         self.pos_att_type = [x.strip() for x in getattr(config, 'pos_att_type', 'none').lower().split('|')]  # c2p|p2c

         self.relative_attention = getattr(config, 'relative_attention', False)
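Note on the hunk above: moving q_bias and v_bias back into __init__ as nn.Parameter registers them in the module's state_dict and in the exported graph, instead of rebuilding fresh zero tensors on every forward pass (the matching forward-side change is in the last hunk below). A minimal sketch of that pattern with a hypothetical toy module, not the repo's attention class:

```python
# Minimal sketch (toy module, not the repo's class): biases registered in
# __init__ are tracked by state_dict()/optimizers and appear as initializers
# when the module is exported, unlike tensors created inside forward().
import torch

class ToyBiasedProjection(torch.nn.Module):
    def __init__(self, all_head_size):
        super().__init__()
        self.q_bias = torch.nn.Parameter(torch.zeros(all_head_size, dtype=torch.float))
        self.v_bias = torch.nn.Parameter(torch.zeros(all_head_size, dtype=torch.float))

    def forward(self, query_layer, value_layer):
        # Broadcast the (all_head_size,) biases over the leading dims.
        return query_layer + self.q_bias, value_layer + self.v_bias

m = ToyBiasedProjection(all_head_size=64)
print(sorted(m.state_dict().keys()))  # ['q_bias', 'v_bias']
```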
@@ -93,14 +93,14 @@ def __init__(self, config):
             self.max_relative_positions = getattr(config, 'max_relative_positions', -1)
             if self.max_relative_positions < 1:
                 self.max_relative_positions = config.max_position_embeddings
-            self.pos_dropout = StableDropout(config.hidden_dropout_prob)
+            self.pos_dropout = StableDropout(config.hidden_dropout_prob) if config.use_xdropout else torch.nn.Dropout(config.hidden_dropout_prob)

             if 'c2p' in self.pos_att_type or 'p2p' in self.pos_att_type:
                 self.pos_proj = torch.nn.Linear(config.hidden_size, self.all_head_size, bias=False)
             if 'p2c' in self.pos_att_type or 'p2p' in self.pos_att_type:
                 self.pos_q_proj = torch.nn.Linear(config.hidden_size, self.all_head_size)

-        self.dropout = StableDropout(config.attention_probs_dropout_prob)
+        self.dropout = StableDropout(config.attention_probs_dropout_prob) if config.use_xdropout else torch.nn.Dropout(config.attention_probs_dropout_prob)

     def transpose_for_scores(self, x):
         new_x_shape = x.size()[:-1] + (self.num_attention_heads, -1)
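Both dropout changes in this hunk apply the same toggle: use DeBERTa's StableDropout when config.use_xdropout is set, otherwise fall back to the stock torch.nn.Dropout. The standalone sketch below stubs StableDropout with a plain nn.Dropout subclass, since the real class lives elsewhere in the repo:

```python
# Minimal sketch of the selection pattern above. `StableDropout` is stubbed
# here so the snippet runs standalone; in the repo it is DeBERTa's custom
# dropout module.
import torch

class StableDropout(torch.nn.Dropout):  # stand-in for the real implementation
    pass

def build_dropout(p: float, use_xdropout: bool) -> torch.nn.Module:
    """Mirror the diff: custom dropout only when the config flag asks for it."""
    return StableDropout(p) if use_xdropout else torch.nn.Dropout(p)

print(type(build_dropout(0.1, use_xdropout=True)).__name__)   # StableDropout
print(type(build_dropout(0.1, use_xdropout=False)).__name__)  # Dropout
```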
@@ -149,10 +149,8 @@ def linear(w,b,x):
             k,v = [linear(qkvw[i], qkvb[i], hidden_states) for i in range(1,3)]
         query_layer, key_layer, value_layer = [self.transpose_for_scores(x) for x in [q,k,v]]

-        q_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
-        v_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
-        query_layer += self.transpose_for_scores(q_bias.unsqueeze(0).unsqueeze(0))
-        value_layer += self.transpose_for_scores(v_bias.unsqueeze(0).unsqueeze(0))
+        query_layer += self.transpose_for_scores(self.q_bias.unsqueeze(0).unsqueeze(0))
+        value_layer += self.transpose_for_scores(self.v_bias.unsqueeze(0).unsqueeze(0))

         rel_att = None
         # Take the dot product between "query" and "key" to get the raw attention scores.
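With forward now reading the registered self.q_bias / self.v_bias, unsqueeze(0).unsqueeze(0) followed by transpose_for_scores turns the flat (all_head_size,) bias into a (1, num_heads, 1, head_size) tensor that broadcasts over batch and sequence when added to query_layer / value_layer. A quick shape check under that assumption (the view/permute body of transpose_for_scores is not shown in this diff and is assumed here):

```python
# Shape check for the bias broadcast above. transpose_for_scores follows the
# first line shown in this file; the view + permute part is an assumption.
import torch

num_attention_heads, head_size = 12, 64
all_head_size = num_attention_heads * head_size

def transpose_for_scores(x):
    new_x_shape = x.size()[:-1] + (num_attention_heads, -1)
    return x.view(*new_x_shape).permute(0, 2, 1, 3)

q_bias = torch.zeros(all_head_size)
query_layer = torch.randn(2, num_attention_heads, 5, head_size)  # (batch, heads, seq, head_size)

bias = transpose_for_scores(q_bias.unsqueeze(0).unsqueeze(0))
print(bias.shape)                  # torch.Size([1, 12, 1, 64])
print((query_layer + bias).shape)  # torch.Size([2, 12, 5, 64])
```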