This repository was archived by the owner on Sep 10, 2025. It is now read-only.

Commit 1020fae

Authored by Roman Shraga (rshraga) and a co-author
Add DistilRoberta Model to OSS (cherry picked commit) (#1998)
Summary: This diff adds DistilRoberta to torchtext OSS. The model is a distilled version of the full RoBERTa Base model. Weights for this model are taken from Hugging Face (https://huggingface.co/distilroberta-base). The state dict is loaded and modified to work with the internal RoBERTa implementation here: https://www.internalfb.com/intern/anp/view/?id=2794739

Comparison of DistilRoBERTa to RoBERTa Base on the GLUE benchmark (as reported in https://github.com/huggingface/transformers/blob/main/examples/research_projects/distillation/README.md): {F806809901}

DistilRoBERTa reaches 95% of RoBERTa Base's performance on GLUE while being twice as fast and 35% smaller.

Reviewed By: Nayef211

Differential Revision: D41590601

fbshipit-source-id: 394d10c45bbee5d2e71e14e30edf9b1a9d9380e6

Co-authored-by: Roman Shraga <[email protected]>
1 parent b1d9447 · commit 1020fae
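The summary notes that the Hugging Face state dict is loaded and modified to fit torchtext's RoBERTa implementation, with the actual conversion done in the internal notebook linked above. Purely as a hedged sketch of what such a conversion can look like (the key-prefix mapping and target names below are illustrative assumptions, not the mapping used in this commit):

# Hedged illustration only: download the Hugging Face DistilRoBERTa weights and
# rename state-dict keys so they match a torchtext-style RoBERTa encoder layout.
# The prefix mapping is a placeholder, NOT the mapping used in this commit
# (that conversion lives in the internal notebook linked in the summary).
import torch
from transformers import AutoModel

hf_state = AutoModel.from_pretrained("distilroberta-base").state_dict()

# Hypothetical old-prefix -> new-prefix renames.
PREFIX_MAP = {
    "embeddings.": "encoder.transformer.embeddings.",
    "encoder.layer.": "encoder.transformer.layers.",
}

def remap_keys(state_dict, prefix_map):
    """Rename each key's leading prefix according to prefix_map; copy tensors as-is."""
    out = {}
    for key, tensor in state_dict.items():
        new_key = key
        for old, new in prefix_map.items():
            if key.startswith(old):
                new_key = new + key[len(old):]
                break
        out[new_key] = tensor
    return out

torch.save(remap_keys(hf_state, PREFIX_MAP), "roberta.distilled.encoder.pt")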

File tree

5 files changed: +43 −7 lines

README.rst

Lines changed: 1 addition & 0 deletions

@@ -114,6 +114,7 @@ Models
 The library currently consist of following pre-trained models:

 * RoBERTa: `Base and Large Architecture <https://github.com/pytorch/fairseq/tree/main/examples/roberta#pre-trained-models>`_
+* `DistilRoBERTa <https://github.com/huggingface/transformers/blob/main/examples/research_projects/distillation/README.md>`_
 * XLM-RoBERTa: `Base and Large Architure <https://github.com/pytorch/fairseq/tree/main/examples/xlmr#pre-trained-models>`_

 Tokenizers

test/integration_tests/test_models.py

Lines changed: 3 additions & 7 deletions

@@ -4,6 +4,7 @@
 from torchtext.models import (
     ROBERTA_BASE_ENCODER,
     ROBERTA_LARGE_ENCODER,
+    ROBERTA_DISTILLED_ENCODER,
     XLMR_BASE_ENCODER,
     XLMR_LARGE_ENCODER,
 )
@@ -15,13 +16,7 @@
     "xlmr_large": XLMR_LARGE_ENCODER,
     "roberta_base": ROBERTA_BASE_ENCODER,
     "roberta_large": ROBERTA_LARGE_ENCODER,
-}
-
-BUNDLERS = {
-    "xlmr_base": XLMR_BASE_ENCODER,
-    "xlmr_large": XLMR_LARGE_ENCODER,
-    "roberta_base": ROBERTA_BASE_ENCODER,
-    "roberta_large": ROBERTA_LARGE_ENCODER,
+    "roberta_distilled": ROBERTA_DISTILLED_ENCODER,
 }


@@ -32,6 +27,7 @@
         ("xlmr_large",),
         ("roberta_base",),
         ("roberta_large",),
+        ("roberta_distilled",),
     ],
 )
 class TestRobertaEncoders(TorchtextTestCase):
Binary file (21.8 KB) not shown.

torchtext/models/roberta/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -1,6 +1,7 @@
 from .bundler import (
     ROBERTA_BASE_ENCODER,
     ROBERTA_LARGE_ENCODER,
+    ROBERTA_DISTILLED_ENCODER,
     RobertaBundle,
     XLMR_BASE_ENCODER,
     XLMR_LARGE_ENCODER,
@@ -16,4 +17,5 @@
     "XLMR_LARGE_ENCODER",
     "ROBERTA_BASE_ENCODER",
     "ROBERTA_LARGE_ENCODER",
+    "ROBERTA_DISTILLED_ENCODER",
 ]

torchtext/models/roberta/bundler.py

Lines changed: 37 additions & 0 deletions

@@ -294,3 +294,40 @@ def encoderConf(self) -> RobertaEncoderConf:

     Please refer to :func:`torchtext.models.RobertaBundle` for the usage.
     """
+
+
+ROBERTA_DISTILLED_ENCODER = RobertaBundle(
+    _path=urljoin(_TEXT_BUCKET, "roberta.distilled.encoder.pt"),
+    _encoder_conf=RobertaEncoderConf(
+        num_encoder_layers=6,
+        padding_idx=1,
+    ),
+    transform=lambda: T.Sequential(
+        T.GPT2BPETokenizer(
+            encoder_json_path=urljoin(_TEXT_BUCKET, "gpt2_bpe_encoder.json"),
+            vocab_bpe_path=urljoin(_TEXT_BUCKET, "gpt2_bpe_vocab.bpe"),
+        ),
+        T.VocabTransform(load_state_dict_from_url(urljoin(_TEXT_BUCKET, "roberta.vocab.pt"))),
+        T.Truncate(510),
+        T.AddToken(token=0, begin=True),
+        T.AddToken(token=2, begin=False),
+    ),
+)
+
+ROBERTA_DISTILLED_ENCODER.__doc__ = """
+    Roberta Encoder with Distilled Weights
+
+    DistilRoBERTa is trained using knowledge distillation, a technique to compress a large
+    model called the teacher into a smaller model called the student. By distillating RoBERTa,
+    a smaller and faster Transformer model is obtained while maintaining most of the performance.
+
+    DistilRoBERTa was pretrained solely on OpenWebTextCorpus, a reproduction of OpenAI's WebText dataset.
+    On average DistilRoBERTa is twice as fast as RoBERTa Base.
+
+    Originally published by Hugging Face under the Apache 2.0 License
+    and redistributed with the same license.
+    [`License <https://www.apache.org/licenses/LICENSE-2.0>`__,
+    `Source <https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation>`__]
+
+    Please refer to :func:`torchtext.models.RobertaBundle` for the usage.
+    """
