@@ -13,7 +13,9 @@
 # limitations under the License.
 """Perplexity Metric."""

+import numpy as np
 import torch
+from torch.nn import CrossEntropyLoss
 from transformers import AutoModelForCausalLM, AutoTokenizer

 import datasets
@@ -53,23 +55,30 @@
         >>> perplexity = datasets.load_metric("perplexity")
         >>> input_texts = ["lorem ipsum", "Happy Birthday!", "Bienvenue"]
         >>> results = perplexity.compute(model_id='gpt2',
-        ...                              input_texts=input_texts,
-        ...                              stride=1)
-        >>> round(results["perplexity"], 1)
-        78.2
+        ...                              add_start_token=False,
+        ...                              input_texts=input_texts) # doctest:+ELLIPSIS
+        >>> print(list(results.keys()))
+        ['perplexities', 'mean_perplexity']
+        >>> print(round(results["mean_perplexity"], 2))
+        78.22
+        >>> print(round(results["perplexities"][0], 2))
+        11.11

     Example 2:
         >>> perplexity = datasets.load_metric("perplexity")
         >>> input_texts = datasets.load_dataset("wikitext",
         ...                                     "wikitext-2-raw-v1",
-        ...                                     split="test")["text"][:10] # doctest:+ELLIPSIS
+        ...                                     split="test")["text"][:50] # doctest:+ELLIPSIS
         [...]
+        >>> input_texts = [s for s in input_texts if s!='']
         >>> results = perplexity.compute(model_id='gpt2',
-        ...                              input_texts=input_texts,
-        ...                              stride=256)
-        >>> round(results["perplexity"], 1)
-        117.9
-
+        ...                              input_texts=input_texts) # doctest:+ELLIPSIS
+        >>> print(list(results.keys()))
+        ['perplexities', 'mean_perplexity']
+        >>> print(round(results["mean_perplexity"], 2))
+        1977.55
+        >>> print(round(results["perplexities"][0], 2))
+        1349.56
 """


@@ -88,7 +97,7 @@ def _info(self):
             reference_urls=["https://huggingface.co/docs/transformers/perplexity"],
         )

-    def _compute(self, input_texts, model_id, stride=512, device=None):
+    def _compute(self, input_texts, model_id, batch_size: int = 16, add_start_token: bool = True, device=None):

         if device is not None:
             assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu."
@@ -100,51 +109,79 @@ def _compute(self, input_texts, model_id, stride=512, device=None):
         model = AutoModelForCausalLM.from_pretrained(model_id)
         model = model.to(device)

-        tokenizer = AutoTokenizer.from_pretrained(model_id, pad_token="<PAD>")
-
-        encodings = tokenizer(input_texts, padding=True, return_tensors="pt", return_special_tokens_mask=True).to(
-            device
-        )
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+        # if batch_size > 1 (which generally leads to padding being required), and
+        # if there is not an already assigned pad_token, assign an existing
+        # special token to also be the padding token
+        if tokenizer.pad_token is None and batch_size > 1:
+            existing_special_tokens = list(tokenizer.special_tokens_map_extended.values())
+            # check that the model already has at least one special token defined
+            assert (
+                len(existing_special_tokens) > 0
+            ), "If batch_size > 1, model must have at least one special token to use for padding. Please use a different model or set batch_size=1."
+            # assign one of the special tokens to also be the pad token
+            tokenizer.add_special_tokens({"pad_token": existing_special_tokens[0]})
+
+        if add_start_token:
+            # leave room for <BOS> token to be added:
+            assert (
+                tokenizer.bos_token is not None
+            ), "Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False"
+            max_tokenized_len = model.config.max_length - 1
+        else:
+            max_tokenized_len = model.config.max_length
+
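+        # tokenize the whole corpus in one call, padding to the longest sequence and truncating to max_tokenized_len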
+        encodings = tokenizer(
+            input_texts,
+            add_special_tokens=False,
+            padding=True,
+            truncation=True,
+            max_length=max_tokenized_len,
+            return_tensors="pt",
+            return_attention_mask=True,
+        ).to(device)

         encoded_texts = encodings["input_ids"]
-        special_tokens_masks = encodings["special_tokens_mask"]
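+        # the attention mask marks real tokens (1) versus padding (0)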
+        attn_masks = encodings["attention_mask"]

-        max_model_length = model.config.n_positions
+        # check that each input is long enough:
+        if add_start_token:
+            assert torch.all(torch.ge(attn_masks.sum(1), 1)), "Each input text must be at least one token long."
+        else:
+            assert torch.all(
+                torch.ge(attn_masks.sum(1), 2)
+            ), "When add_start_token=False, each input text must be at least two tokens long. Run with add_start_token=True if inputting strings of only one token, and remove all empty input strings."

         ppls = []
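+        # keep per-token losses (reduction="none") so padded positions can be masked out before averaging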
+        loss_fct = CrossEntropyLoss(reduction="none")

-        for text_index in logging.tqdm(range(0, len(encoded_texts))):
-            encoded_text = encoded_texts[text_index]
-            special_tokens_mask = special_tokens_masks[text_index]
-
-            encoded_text_length = len(encoded_text) - special_tokens_mask.sum()
-
-            nlls = []
-
-            target_index = max(1, min(stride - 1, encoded_text_length - 1))
-
-            while target_index < encoded_text_length:
-                start_index = max(0, target_index - (max_model_length - 1))
-
-                input_ids = encoded_text[start_index : target_index + 1]
-
-                target_ids = input_ids.clone()
-                target_ids[:-1] = -100
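+        # score the corpus in batches of batch_size sequences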
+        for start_index in logging.tqdm(range(0, len(encoded_texts), batch_size)):
+            end_index = min(start_index + batch_size, len(encoded_texts))
+            encoded_batch = encoded_texts[start_index:end_index]
+            attn_mask = attn_masks[start_index:end_index]

-                attn_mask = torch.ones(len(input_ids)).to(device)
-                attn_mask[-1] = 0
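+            # prepend a BOS token to every sequence and extend the attention mask so the model attends to it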
+            if add_start_token:
+                bos_tokens_tensor = torch.tensor([[tokenizer.bos_token_id]] * encoded_batch.size(dim=0)).to(device)
+                encoded_batch = torch.cat([bos_tokens_tensor, encoded_batch], dim=1)
+                attn_mask = torch.cat(
+                    [torch.ones(bos_tokens_tensor.size(), dtype=torch.int64).to(device), attn_mask], dim=1
+                )

-                with torch.no_grad():
-                    outputs = model(input_ids, labels=target_ids, attention_mask=attn_mask)
-                    neg_log_likelihood = outputs[0]
+            labels = encoded_batch

-                nlls.append(neg_log_likelihood)
+            with torch.no_grad():
+                out_logits = model(encoded_batch, attention_mask=attn_mask).logits

-                target_index += stride
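+            # shift by one position so the logits at position t are scored against the token at position t + 1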
+            shift_logits = out_logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            shift_attention_mask_batch = attn_mask[..., 1:].contiguous()

-            if len(nlls) > 0:
-                ppls.append(torch.exp2(torch.mean(torch.stack(nlls))))
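+            # per-sequence perplexity: 2 ** (sum of masked token losses / number of real tokens)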
+            perplexity_batch = torch.exp2(
+                (loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_attention_mask_batch).sum(1)
+                / shift_attention_mask_batch.sum(1)
+            )

-        ppl = torch.mean(torch.stack(ppls))
+            ppls += perplexity_batch.tolist()

-        return {"perplexity": float(ppl)}
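+        # return the perplexity of each input text along with the corpus mean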
+        return {"perplexities": ppls, "mean_perplexity": np.mean(ppls)}