Commit 79dbe25

1. Fix issues with pre-training the BERT model
2. Add scripts for DeBERTa v3 fine-tuning
1 parent: 6e4d932

23 files changed: +286 −50 lines

DeBERTa/apps/models/masked_language_model.py

Lines changed: 16 additions & 13 deletions

@@ -31,6 +31,7 @@ class EnhancedMaskDecoder(torch.nn.Module):
   def __init__(self, config, vocab_size):
     super().__init__()
     self.config = config
+    self.position_biased_input = getattr(config, 'position_biased_input', True)
     self.lm_head = BertLMPredictionHead(config, vocab_size)
 
   def forward(self, ctx_layers, ebd_weight, target_ids, input_ids, input_mask, z_states, attention_mask, encoder, relative_pos=None):
@@ -56,19 +57,21 @@ def emd_context_layer(self, encoder_layers, z_states, attention_mask, encoder, t
     attention_mask = attention_mask.unsqueeze(1)
     target_mask = target_ids>0
     hidden_states = encoder_layers[-2]
-    layers = [encoder.layer[-1] for _ in range(2)]
-
-    z_states += hidden_states
-    query_mask = attention_mask
-    query_states = z_states
-    outputs = []
-    rel_embeddings = encoder.get_rel_embedding()
-
-    for layer in layers:
-      # TODO: pass relative pos ids
-      output = layer(hidden_states, query_mask, return_att=False, query_states = query_states, relative_pos=relative_pos, rel_embeddings = rel_embeddings)
-      query_states = output
-      outputs.append(query_states)
+    if not self.position_biased_input:
+      layers = [encoder.layer[-1] for _ in range(2)]
+      z_states += hidden_states
+      query_states = z_states
+      query_mask = attention_mask
+      outputs = []
+      rel_embeddings = encoder.get_rel_embedding()
+
+      for layer in layers:
+        # TODO: pass relative pos ids
+        output = layer(hidden_states, query_mask, return_att=False, query_states = query_states, relative_pos=relative_pos, rel_embeddings = rel_embeddings)
+        query_states = output
+        outputs.append(query_states)
+    else:
+      outputs = [encoder_layers[-1]]
 
     _mask_index = (target_ids>0).view(-1).nonzero().view(-1)
     def flatten_states(q_states):
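In short, the decoder now distinguishes two cases: when the backbone keeps absolute positions out of its input (`position_biased_input` is False), the enhanced mask decoder re-runs the last encoder layer twice with position-enhanced query states; otherwise it simply reuses the last encoder layer's output. A minimal sketch of that control flow, with `last_layer` standing in for any attention layer that accepts `query_states` (names and shapes are illustrative, not the repository's API):

```python
def emd_outputs(encoder_layers, z_states, last_layer, attention_mask,
                position_biased_input):
    """Sketch of the decoder's branching; not the repository's exact code."""
    if not position_biased_input:
        # Positions were not mixed into the input, so inject them now: the
        # second-to-last hidden states act as keys/values while the
        # position-enhanced states drive the query, applied twice.
        hidden_states = encoder_layers[-2]
        query_states = z_states + hidden_states
        outputs = []
        for _ in range(2):
            query_states = last_layer(hidden_states, attention_mask,
                                      query_states=query_states)
            outputs.append(query_states)
    else:
        # Positions are already in the input embeddings; the plain output of
        # the last encoder layer is enough.
        outputs = [encoder_layers[-1]]
    return outputs
```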

DeBERTa/apps/models/ner.py

Lines changed: 15 additions & 2 deletions

@@ -22,7 +22,8 @@
 class NERModel(NNModule):
   def __init__(self, config, num_labels = 2, drop_out=None, **kwargs):
     super().__init__(config)
-    self.bert = DeBERTa(config)
+    self._register_load_state_dict_pre_hook(self._pre_load_hook)
+    self.deberta = DeBERTa(config)
     self.num_labels = num_labels
     self.proj = nn.Linear(config.hidden_size, config.hidden_size)
     self.classifier = nn.Linear(config.hidden_size, self.num_labels)
@@ -31,7 +32,7 @@ def __init__(self, config, num_labels = 2, drop_out=None, **kwargs):
     self.apply(self.init_weights)
 
   def forward(self, input_ids, type_ids=None, input_mask=None, labels=None, position_ids=None, **kwargs):
-    outputs = self.bert(input_ids, token_type_ids=type_ids, attention_mask=input_mask, \
+    outputs = self.deberta(input_ids, token_type_ids=type_ids, attention_mask=input_mask, \
       position_ids=position_ids, output_all_encoded_layers=True)
     encoder_layers = outputs['hidden_states']
     cls = encoder_layers[-1]
@@ -52,3 +53,15 @@ def forward(self, input_ids, type_ids=None, input_mask=None, labels=None, positi
       'logits' : logits,
       'loss' : loss
     }
+
+  def _pre_load_hook(self, state_dict, prefix, local_metadata, strict,
+      missing_keys, unexpected_keys, error_msgs):
+    new_state = dict()
+    bert_prefix = prefix + 'bert.'
+    deberta_prefix = prefix + 'deberta.'
+    for k in list(state_dict.keys()):
+      if k.startswith(bert_prefix):
+        nk = deberta_prefix + k[len(bert_prefix):]
+        value = state_dict[k]
+        del state_dict[k]
+        state_dict[nk] = value
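Renaming the encoder attribute from `self.bert` to `self.deberta` would break checkpoints saved under the old name, so the pre-load hook rewrites legacy keys before `load_state_dict` checks them. The same pattern in a self-contained toy (the `Toy` module and its tensors are invented for illustration):

```python
import torch
from torch import nn

class Toy(nn.Module):
    def __init__(self):
        super().__init__()
        # Hook runs on every load_state_dict call, before strict key checking.
        self._register_load_state_dict_pre_hook(self._pre_load_hook)
        self.deberta = nn.Linear(4, 4)   # new attribute name

    def _pre_load_hook(self, state_dict, prefix, local_metadata, strict,
                       missing_keys, unexpected_keys, error_msgs):
        bert_prefix, deberta_prefix = prefix + 'bert.', prefix + 'deberta.'
        for k in list(state_dict.keys()):
            if k.startswith(bert_prefix):
                # rename bert.* -> deberta.* in place
                state_dict[deberta_prefix + k[len(bert_prefix):]] = state_dict.pop(k)

legacy = {'bert.weight': torch.zeros(4, 4), 'bert.bias': torch.zeros(4)}
Toy().load_state_dict(legacy)   # loads cleanly: keys are remapped by the hook
```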

DeBERTa/apps/run.py

Lines changed: 7 additions & 3 deletions

@@ -46,7 +46,7 @@ def create_model(args, num_labels, model_class_fn):
   logger.info(f'Total parameters: {sum([p.numel() for p in model.parameters()])}')
   return model
 
-def train_model(args, model, device, train_data, eval_data):
+def train_model(args, model, device, train_data, eval_data, run_eval_fn):
   total_examples = len(train_data)
   num_train_steps = int(len(train_data)*args.num_train_epochs / args.train_batch_size)
   logger.info(" Training batch size = %d", args.train_batch_size)
@@ -56,7 +56,7 @@ def data_fn(trainer):
     return train_data, num_train_steps, None
 
   def eval_fn(trainer, model, device, tag):
-    results = run_eval(trainer.args, model, device, eval_data, tag, steps=trainer.trainer_state.steps)
+    results = run_eval_fn(trainer.args, model, device, eval_data, tag, steps=trainer.trainer_state.steps)
     eval_metric = np.mean([v[0] for k,v in results.items() if 'train' not in k])
     return eval_metric
 
@@ -285,11 +285,15 @@ def main(args):
   if not isinstance(device, torch.device):
     return 0
   model.to(device)
+  run_eval_fn = task.run_eval_fn()
+  if run_eval_fn is None:
+    run_eval_fn = run_eval
+
   if args.do_eval:
     run_eval(args, model, device, eval_data, prefix=args.tag)
 
   if args.do_train:
-    train_model(args, model, device, train_data, eval_data)
+    train_model(args, model, device, train_data, eval_data, run_eval_fn)
 
   if args.do_predict:
     run_predict(args, model, device, test_data, prefix=args.tag)

DeBERTa/apps/tasks/mlm_task.py

Lines changed: 15 additions & 1 deletion

@@ -48,7 +48,7 @@ def __init__(self, tokenizer, mask_lm_prob=0.15, max_seq_len=512, max_preds_per_
     if max_preds_per_seq is None:
       self.max_preds_per_seq = math.ceil(max_seq_len*mask_lm_prob /10)*10
 
-    self.max_gram = max_gram
+    self.max_gram = max(max_gram, 1)
     self.mask_window = int(1/mask_lm_prob) # make ngrams per window sized context
     self.vocab_words = list(tokenizer.vocab.keys())
 
@@ -168,6 +168,20 @@ def metrics_fn(logits, labels):
       preds = np.argmax(logits, axis=-1)
       acc = (preds==labels).sum()/len(labels)
       metrics = OrderedDict(accuracy= acc)
+
+      logits = torch.tensor(logits).cuda()
+      labels = torch.tensor(labels).cuda().long()
+      chk = 1024
+      off = 0
+      loss_fn = torch.nn.CrossEntropyLoss(reduction="none")
+      losses = []
+      while off<labels.size(0):
+        loss = loss_fn(logits[off:off+chk, :], labels[off:off+chk])
+        losses.append(loss)
+        off += chk
+      loss = torch.cat(losses).mean()
+      ppl = loss.exp().cpu().item()
+      metrics['PPL'] = ppl
       return metrics
     return metrics_fn
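The new metric is masked-LM perplexity: the exponential of the mean token-level cross-entropy over all predicted positions, accumulated in fixed-size chunks so a large evaluation set never materializes one huge loss tensor (the other change in this file simply clamps `max_gram` to at least 1). A CPU-only restatement of the same computation; the function name and toy shapes are mine:

```python
import torch

def masked_lm_ppl(logits, labels, chunk=1024):
    """Perplexity = exp(mean cross-entropy), computed chunk by chunk."""
    logits = torch.as_tensor(logits)
    labels = torch.as_tensor(labels).long()
    loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
    losses, off = [], 0
    while off < labels.size(0):
        losses.append(loss_fn(logits[off:off + chunk], labels[off:off + chunk]))
        off += chunk
    return torch.cat(losses).mean().exp().item()

# e.g. 3000 masked positions over a toy 128-word vocabulary
print(masked_lm_ppl(torch.randn(3000, 128), torch.randint(0, 128, (3000,))))
```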

DeBERTa/apps/tasks/superglue_tasks.py

Lines changed: 1 addition & 1 deletion

@@ -733,7 +733,7 @@ def example_to_feature(self, tokenizer, example, max_seq_len=512, rng=None, mask
     # Max Enities spans 87 # 90
     max_entities = 110
     #max_entity_span = 110
-    max_entity_span = 90
+    max_entity_span = 180
     entities = example.entity_spans
     assert len(entities)<=max_entities, f'Entities number {len(entities)} exceeds the maxium allowed entities {max_entities}'
     entity_indice = []

DeBERTa/apps/tasks/task.py

Lines changed: 6 additions & 0 deletions

@@ -64,6 +64,12 @@ def label2id(self, labelstr):
     label_dict = {l:i for i,l in enumerate(self.get_labels())}
     return label_dict[labelstr] if labelstr in label_dict else -1
 
+  def run_eval_fn(self):
+    return None
+
+  def run_pred_fn(self):
+    return None
+
   def get_metrics_fn(self):
     """Calcuate metrics based on prediction results"""
     def metrics_fn(logits, labels):
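These two hooks let a task supply its own evaluation and prediction routines; run.py (above) falls back to the generic `run_eval` when `run_eval_fn()` returns None. A hedged sketch of what an override could return, inferred only from how run.py consumes the result (it averages `v[0]` over the non-'train' entries of the returned dict); all names below are illustrative:

```python
def custom_eval(args, model, device, eval_data, tag=None, steps=None):
    """Hypothetical task-specific evaluation; compute a real metric here."""
    metric = 0.0
    return {'my_eval_set': (metric,)}   # each value starts with a scalar metric

# A task subclass would expose it via the new hook:
#   def run_eval_fn(self):
#       return custom_eval
```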

DeBERTa/deberta/bert.py

Lines changed: 3 additions & 0 deletions

@@ -257,6 +257,9 @@ def forward(self, input_ids, token_type_ids=None, position_ids=None, mask = None
       token_type_embeddings = self.token_type_embeddings(token_type_ids)
       embeddings += token_type_embeddings
 
+    if self.position_biased_input:
+      embeddings += position_embeddings
+
     if self.embedding_size != self.config.hidden_size:
       embeddings = self.embed_proj(embeddings)
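With this change the embedding layer folds absolute position embeddings into the input sum only when `position_biased_input` is set; configurations built around relative positions typically leave the input position-free and let the attention layers (and the enhanced mask decoder above) handle position information. A toy restatement of the sum, with random stand-in tensors:

```python
import torch

batch, seq, hidden = 2, 4, 8
position_biased_input = False        # relative-position configs usually keep this off

word_embeddings = torch.randn(batch, seq, hidden)
token_type_embeddings = torch.randn(batch, seq, hidden)
position_embeddings = torch.randn(batch, seq, hidden)

embeddings = word_embeddings + token_type_embeddings
if position_biased_input:            # absolute positions enter the input only when configured
    embeddings = embeddings + position_embeddings
```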

DeBERTa/deberta/cache_utils.py

Lines changed: 8 additions & 4 deletions

@@ -37,10 +37,14 @@ def __init__(self, name, vocab, vocab_type, model='pytorch_model.bin', config='c
     'base-mnli': PretrainedModel('deberta-base-mnli', 'bpe_encoder.bin', 'gpt2'),
     'large-mnli': PretrainedModel('deberta-large-mnli', 'bpe_encoder.bin', 'gpt2'),
     'xlarge-mnli': PretrainedModel('deberta-xlarge-mnli', 'bpe_encoder.bin', 'gpt2'),
-    'xlarge-v2': PretrainedModel('deberta-xlarge-v2', 'spm.model', 'spm'),
-    'xxlarge-v2': PretrainedModel('deberta-xxlarge-v2', 'spm.model', 'spm'),
-    'xlarge-v2-mnli': PretrainedModel('deberta-xlarge-v2-mnli', 'spm.model', 'spm'),
-    'xxlarge-v2-mnli': PretrainedModel('deberta-xxlarge-v2-mnli', 'spm.model', 'spm')
+    'xlarge-v2': PretrainedModel('deberta-v2-xlarge', 'spm.model', 'spm'),
+    'xxlarge-v2': PretrainedModel('deberta-v2-xxlarge', 'spm.model', 'spm'),
+    'xlarge-v2-mnli': PretrainedModel('deberta-v2-xlarge-mnli', 'spm.model', 'spm'),
+    'xxlarge-v2-mnli': PretrainedModel('deberta-v2-xxlarge-mnli', 'spm.model', 'spm'),
+    'deberta-v3-small': PretrainedModel('deberta-v3-small', 'spm.model', 'spm'),
+    'deberta-v3-base': PretrainedModel('deberta-v3-base', 'spm.model', 'spm'),
+    'deberta-v3-large': PretrainedModel('deberta-v3-large', 'spm.model', 'spm'),
+    'mdeberta-v3-base': PretrainedModel('mdeberta-v3-base', 'spm.model', 'spm'),
   }
 
 def download_asset(url, name, tag=None, no_cache=False, cache_dir=None):
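Besides renaming the v2 assets to their published names (e.g. `deberta-xlarge-v2` → `deberta-v2-xlarge`), this registers cache entries for the DeBERTa-V3 and mDeBERTa-V3 checkpoints, all using the SentencePiece (`spm`) vocabulary. Assuming the `pre_trained` shortcut shown in the README for earlier models also accepts these new keys (an assumption on my part), usage would look roughly like:

```python
from DeBERTa import deberta

# Hedged sketch: 'deberta-v3-base' is one of the cache keys added above.
model = deberta.DeBERTa(pre_trained='deberta-v3-base')
model.apply_state()   # load the downloaded pre-trained weights, per the README pattern
```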

DeBERTa/deberta/disentangled_attention.py

Lines changed: 8 additions & 8 deletions

@@ -69,8 +69,8 @@ def transpose_for_scores(self, x, attention_heads):
   def forward(self, hidden_states, attention_mask, return_att=False, query_states=None, relative_pos=None, rel_embeddings=None):
     if query_states is None:
       query_states = hidden_states
-    query_layer = self.transpose_for_scores(self.query_proj(query_states), self.num_attention_heads)
-    key_layer = self.transpose_for_scores(self.key_proj(hidden_states), self.num_attention_heads)
+    query_layer = self.transpose_for_scores(self.query_proj(query_states), self.num_attention_heads).float()
+    key_layer = self.transpose_for_scores(self.key_proj(hidden_states), self.num_attention_heads).float()
     value_layer = self.transpose_for_scores(self.value_proj(hidden_states), self.num_attention_heads)
 
     rel_att = None
@@ -83,14 +83,14 @@ def forward(self, hidden_states, attention_mask, return_att=False, query_states=
     if 'p2p' in self.pos_att_type:
       scale_factor += 1
     scale = math.sqrt(query_layer.size(-1)*scale_factor)
-    attention_scores = torch.bmm(query_layer, key_layer.transpose(-1, -2))/scale
+    attention_scores = torch.bmm(query_layer, key_layer.transpose(-1, -2)/scale)
     if self.relative_attention:
       rel_embeddings = self.pos_dropout(rel_embeddings)
       rel_att = self.disentangled_attention_bias(query_layer, key_layer, relative_pos, rel_embeddings, scale_factor)
 
     if rel_att is not None:
       attention_scores = (attention_scores + rel_att)
-    attention_scores = attention_scores
+    attention_scores = (attention_scores - attention_scores.max(dim=-1, keepdim=True).values.detach()).to(hidden_states)
     attention_scores = attention_scores.view(-1, self.num_attention_heads, attention_scores.size(-2), attention_scores.size(-1))
 
     # bxhxlxd
@@ -140,10 +140,10 @@ def disentangled_attention_bias(self, query_layer, key_layer, relative_pos, rel_
     # content->position
     if 'c2p' in self.pos_att_type:
       scale = math.sqrt(pos_key_layer.size(-1)*scale_factor)
-      c2p_att = torch.bmm(query_layer, pos_key_layer.transpose(-1, -2))
+      c2p_att = torch.bmm(query_layer/scale, pos_key_layer.transpose(-1, -2).to(query_layer))
       c2p_pos = torch.clamp(relative_pos + att_span, 0, att_span*2-1)
       c2p_att = torch.gather(c2p_att, dim=-1, index=c2p_pos.squeeze(0).expand([query_layer.size(0), query_layer.size(1), relative_pos.size(-1)]))
-      score += c2p_att/scale
+      score += c2p_att
 
     # position->content
     if 'p2c' in self.pos_att_type or 'p2p' in self.pos_att_type:
@@ -159,11 +159,11 @@ def disentangled_attention_bias(self, query_layer, key_layer, relative_pos, rel_
       pos_index = relative_pos[:, :, :, 0].unsqueeze(-1)
 
     if 'p2c' in self.pos_att_type:
-      p2c_att = torch.bmm(key_layer, pos_query_layer.transpose(-1, -2))
+      p2c_att = torch.bmm(key_layer/scale, pos_query_layer.transpose(-1, -2).to(key_layer))
       p2c_att = torch.gather(p2c_att, dim=-1, index=p2c_pos.squeeze(0).expand([query_layer.size(0), key_layer.size(-2), key_layer.size(-2)])).transpose(-1,-2)
       if query_layer.size(-2) != key_layer.size(-2):
        p2c_att = torch.gather(p2c_att, dim=-2, index=pos_index.expand(p2c_att.size()[:2] + (pos_index.size(-2), key_layer.size(-2))))
-      score += p2c_att/scale
+      score += p2c_att
 
     # position->position
     if 'p2p' in self.pos_att_type:
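All of these edits target fp16 numerical stability rather than the attention math itself: the query/key projections are promoted to float32, the `1/scale` factor is folded into one `bmm` operand instead of dividing the much larger score matrix, and the row-wise maximum is subtracted (a softmax no-op) before casting scores back to the compute dtype so they cannot overflow. A standalone illustration of the same tricks, using toy shapes and a plain `sqrt(d)` scale rather than the repository's `scale_factor`:

```python
import math
import torch

q = torch.randn(2, 8, 16, dtype=torch.float16)   # (batch*heads, seq_len, head_dim)
k = torch.randn(2, 8, 16, dtype=torch.float16)
scale = math.sqrt(q.size(-1))

# 1) do the matmul in float32 and scale one operand, not the score matrix
scores = torch.bmm(q.float(), k.float().transpose(-1, -2) / scale)
# 2) subtracting the per-row max leaves softmax unchanged but keeps the
#    values small enough to cast back to fp16 without overflow
scores = (scores - scores.max(dim=-1, keepdim=True).values.detach()).to(q.dtype)
probs = torch.softmax(scores, dim=-1)
```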

README.md

Lines changed: 42 additions & 13 deletions

@@ -3,6 +3,10 @@
 This repository is the official implementation of [ **DeBERTa**: **D**ecoding-**e**nhanced **BERT** with Disentangled **A**ttention ](https://arxiv.org/abs/2006.03654)
 
 ## News
+### 11/16/2021
+- The models of our new work [DeBERTa V3: Improving DeBERTa using ELECTRA-Style Pre-Training with Gradient-Disentangled Embedding Sharing](https://arxiv.org/abs/2111.09543) are publicly available at [huggingface model hub](https://huggingface.co/models?other=deberta-v3) now. The new models are based on DeBERTa-V2 models by replacing MLM with ELECTRA-style objective plus gradient-disentangled embedding sharing which further improves the model efficiency.
+- Scripts for DeBERTa V3 model fine-tuning are added
+
 ### 3/31/2021
 - Masked language model task is added
 - SuperGLUE tasks is added
@@ -24,11 +28,6 @@ With DeBERTa 1.5B model, we surpass T5 11B model and human performance on SuperG
 ### 06/13/2020
 We released the pre-trained models, source code, and fine-tuning scripts to reproduce some of the experimental results in the paper. You can follow similar scripts to apply DeBERTa to your own experiments or applications. Pre-training scripts will be released in the next step.
 
-## TODOs
-- [x] Add SuperGLUE tasks
-- [x] Add SiFT code
-- [x] Add Pretraining code
-
 
 ## Introduction to DeBERTa
 DeBERTa (Decoding-enhanced BERT with disentangled attention) improves the BERT and RoBERTa models using two novel techniques. The first is the disentangled attention mechanism, where each word is represented using two vectors that encode its content and position, respectively, and the attention weights among words are computed using disentangled matrices on their contents and relative positions. Second, an enhanced mask decoder is used to replace the output softmax layer to predict the masked tokens for model pretraining. We show that these two techniques significantly improve the efficiency of model pre-training and performance of downstream tasks.
@@ -49,9 +48,15 @@ Our pre-trained models are packaged into zipped files. You can download them fro
 |[XLarge-MNLI](https://huggingface.co/microsoft/deberta-xlarge-mnli)|750M|1024|48|Fine-turned with MNLI|
 |[Large-MNLI](https://huggingface.co/microsoft/deberta-large-mnli)|400M|1024|24|Fine-turned with MNLI|
 |[Base-MNLI](https://huggingface.co/microsoft/deberta-base-mnli)|140M|768|12|Fine-turned with MNLI|
+|[DeBERTa-V3-Large](https://huggingface.co/microsoft/deberta-v3-large)<sup>2</sup>|418M|1024| 24| 128K new SPM vocab|
+|[DeBERTa-V3-Base](https://huggingface.co/microsoft/deberta-v3-base)<sup>2</sup>|183M|768| 12| 128K new SPM vocab|
+|[DeBERTa-V3-Small](https://huggingface.co/microsoft/deberta-v3-small)<sup>2</sup>|143M|768| 6| 128K new SPM vocab|
+|[mDeBERTa-V3-Base](https://huggingface.co/microsoft/mdeberta)<sup>2</sup>|280M|768| 12| 250K new SPM vocab, multi-lingual model with 102 languages|
 
 ## Note
 - 1 This is the model(89.9) that surpassed **T5 11B(89.3) and human performance(89.8)** on **SuperGLUE** for the first time. 128K new SPM vocab.
+- 2 These V3 DeBERTa models are deberta models pre-trained with ELECTRA-style objective plus gradient-disentangled embedding sharing which significantly improves the model efficiency.
+
 
 # Try the model
 
@@ -209,7 +214,20 @@ We present the dev results on SQuAD 1.1/2.0 and several GLUE benchmark tasks.
 | [DeBERTa-Large](https://huggingface.co/microsoft/deberta-large)<sup>1</sup> | 95.5/90.1 | 90.7/88.0 | 91.3/91.1| 96.5|95.3| 69.5| 91.0| 92.6/94.6| 92.3/- |92.8/92.5 |
 | [DeBERTa-XLarge](https://huggingface.co/microsoft/deberta-xlarge)<sup>1</sup> | -/- | -/- | 91.5/91.2| 97.0 | - | - | 93.1 | 92.1/94.3 | - |92.9/92.7|
 | [DeBERTa-V2-XLarge](https://huggingface.co/microsoft/deberta-v2-xlarge)<sup>1</sup>|95.8/90.8| 91.4/88.9|91.7/91.6| **97.5**| 95.8|71.1|**93.9**|92.0/94.2|92.3/89.8|92.9/92.9|
-|**[DeBERTa-V2-XXLarge](https://huggingface.co/microsoft/deberta-v2-xxlarge)<sup>1,2</sup>**|**96.1/91.4**|**92.2/89.7**|**91.7/91.9**|97.2|**96.0**|**72.0**| 93.5| **93.1/94.9**|**92.7/90.3** |**93.2/93.1** |
+|**[DeBERTa-V2-XXLarge](https://huggingface.co/microsoft/deberta-v2-xxlarge)<sup>1,2</sup>**|**96.1/91.4**|**92.2/89.7**|**91.7/91.9**|97.2|**96.0**|72.0| 93.5| **93.1/94.9**|**92.7/90.3** |**93.2/93.1** |
+|**[DeBERTa-V3-Large](https://huggingface.co/microsoft/deberta-v3-large)**|-/-|91.5/89.0|**91.8/91.9**|96.9|**96.0**|**75.3**| 92.7| 92.2/-|**93.0/-** |93.0/- |
+|[DeBERTa-V3-Base](https://huggingface.co/microsoft/deberta-v3-base)|-/-|88.4/85.4|90.6/90.7|-|-|-| -| -|- |- |
+|[DeBERTa-V3-Small](https://huggingface.co/microsoft/deberta-v3-base)|-/-|82.9/80.4|88.2/87.9|-|-|-| -| -|- |- |
+
+#### Fine-tuning on XNLI
+
+We present the dev results on XNLI with zero-shot crosslingual transfer setting, i.e. training with english data only, test on other languages.
+
+| Model |avg | en | fr| es | de | el | bg | ru |tr |ar |vi | th | zh | hi | sw | ur |
+|--------------| ----|----|----|---- |-- |-- |-- | -- |-- |-- |-- | -- | -- | -- | -- | -- |
+| XLM-R-base |76.2 |85.8|79.7|80.7 |78.7 |77.5 |79.6 |78.1 |74.2 |73.8 |76.5 |74.6 |76.7| 72.4| 66.5| 68.3|
+| [mDeBERTa-V3-Base](https://huggingface.co/microsoft/mdeberta-v3-base)|**79.8**+/-0.2|**88.2**|**82.6**|**84.4** |**82.7** |**82.3** |**82.4** |**80.8** |**79.5** |**78.5** |**78.1** |**76.4** |**79.5**| **75.9**| **73.9**| **72.4**|
+
 --------
 #### Notes.
 - <sup>1</sup> Following RoBERTa, for RTE, MRPC, STS-B, we fine-tune the tasks based on [DeBERTa-Large-MNLI](https://huggingface.co/microsoft/deberta-large-mnli), [DeBERTa-XLarge-MNLI](https://huggingface.co/microsoft/deberta-xlarge-mnli), [DeBERTa-V2-XLarge-MNLI](https://huggingface.co/microsoft/deberta-v2-xlarge-mnli), [DeBERTa-V2-XXLarge-MNLI](https://huggingface.co/microsoft/deberta-v2-xxlarge-mnli). The results of SST-2/QQP/QNLI/SQuADv2 will also be slightly improved when start from MNLI fine-tuned models, however, we only report the numbers fine-tuned from pretrained base models for those 4 tasks.
@@ -220,14 +238,25 @@ We present the dev results on SQuAD 1.1/2.0 and several GLUE benchmark tasks.
 Pengcheng He([email protected]), Xiaodong Liu([email protected]), Jianfeng Gao([email protected]), Weizhu Chen([email protected])
 
 # Citation
+``` latex
+@misc{he2021debertav3,
+title={DeBERTaV3: Improving DeBERTa using ELECTRA-Style Pre-Training with Gradient-Disentangled Embedding Sharing},
+author={Pengcheng He and Jianfeng Gao and Weizhu Chen},
+year={2021},
+eprint={2111.09543},
+archivePrefix={arXiv},
+primaryClass={cs.CL}
+}
 ```
-@misc{he2020deberta,
-title={DeBERTa: Decoding-enhanced BERT with Disentangled Attention},
-author={Pengcheng He and Xiaodong Liu and Jianfeng Gao and Weizhu Chen},
-year={2020},
-eprint={2006.03654},
-archivePrefix={arXiv},
-primaryClass={cs.CL}
+
+``` latex
+@inproceedings{
+he2021deberta,
+title={DEBERTA: DECODING-ENHANCED BERT WITH DISENTANGLED ATTENTION},
+author={Pengcheng He and Xiaodong Liu and Jianfeng Gao and Weizhu Chen},
+booktitle={International Conference on Learning Representations},
+year={2021},
+url={https://openreview.net/forum?id=XPZIaotutsD}
 }
 ```