
Commit 33ce0cf

add support to evaluate the model with attention sink
Differential Revision: [D66378384](https://our.internmc.facebook.com/intern/diff/D66378384/) [ghstack-poisoned]
1 parent 64d3437

1 file changed: +63 -0 lines changed

examples/models/llama/eval_llama_lib.py

Lines changed: 63 additions & 0 deletions
@@ -155,6 +155,44 @@ def _model_call(self, inps):
         pass
 
 
+class AttentionSinkEvalWrapper(EagerEvalWrapper):
+    """
+    A wrapper class for evaluating the model with attention sink.
+    """
+
+    def __init__(
+        self,
+        model: torch.nn.Module,
+        tokenizer: Union[SentencePieceTokenizer, Tiktoken],
+        sink_size: int,
+        window_size: int,
+        max_seq_length: Optional[int] = None,
+        use_kv_cache: bool = False,
+    ):
+        super().__init__(model, tokenizer, max_seq_length, use_kv_cache)
+        self.cache_size = sink_size + window_size
+        assert self._use_kv_cache, "Attention sink only works with kv cache."
+
+    def _model_call(self, inps):
+        # Given inps (tokens), return the logits.
+
+        # Example:
+        # inps: Tensor of shape (1, N)
+        # logits: Tensor of shape (1, N, vocab_size)
+        _, seq_len = inps.shape
+        result = self._model(
+            inps[:, : min(seq_len, self.cache_size)],
+            torch.tensor([0], dtype=torch.int64, device=self.device),
+        )
+        for pos in range(min(seq_len, self.cache_size), seq_len):
+            logits = self._model(
+                inps[:, pos : pos + 1],
+                torch.tensor([pos], dtype=torch.int64, device=self.device),
+            )
+            result = torch.cat([result, logits[:, -1:, :]], dim=1)
+        return result
+
+
 def gen_eval_wrapper(
     model_name: str,
     args: argparse.ArgumentParser,
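
For context on the hunk above: _model_call prefills at most cache_size = sink_size + window_size tokens in a single forward call, then feeds the remaining tokens one position at a time and appends each step's last-position logits along the sequence dimension, so the result keeps the (1, N, vocab_size) shape described in the comment. The sketch below mirrors that accumulation loop with a stand-in model; DummyModel and collect_logits are illustrative names and are not part of this commit.

import torch


class DummyModel(torch.nn.Module):
    # Stand-in for the eager Llama module: maps (tokens of shape (1, T), input_pos)
    # to logits of shape (1, T, vocab_size). Purely illustrative.
    def __init__(self, vocab_size: int = 32):
        super().__init__()
        self.vocab_size = vocab_size

    def forward(self, tokens: torch.Tensor, input_pos: torch.Tensor) -> torch.Tensor:
        return torch.randn(tokens.shape[0], tokens.shape[1], self.vocab_size)


def collect_logits(model: torch.nn.Module, inps: torch.Tensor, cache_size: int) -> torch.Tensor:
    # Same accumulation pattern as AttentionSinkEvalWrapper._model_call.
    _, seq_len = inps.shape
    # Prefill: everything that fits into the sink + window cache, in one call.
    result = model(
        inps[:, : min(seq_len, cache_size)],
        torch.tensor([0], dtype=torch.int64),
    )
    # Decode: remaining tokens one at a time, appending the last-position logits.
    for pos in range(min(seq_len, cache_size), seq_len):
        logits = model(
            inps[:, pos : pos + 1],
            torch.tensor([pos], dtype=torch.int64),
        )
        result = torch.cat([result, logits[:, -1:, :]], dim=1)
    return result


tokens = torch.randint(0, 32, (1, 10))
out = collect_logits(DummyModel(), tokens, cache_size=6)
assert out.shape == (1, 10, 32)  # one logits row per input token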
@@ -225,6 +263,25 @@ def gen_eval_wrapper(
     if args.output_eager_checkpoint_file is not None:  # pyre-ignore
         torch.save(model, args.output_eager_checkpoint_file)
 
+    if (use_attention_sink := args.use_attention_sink) is not None and (
+        attention_sink_eval_length := args.attention_sink_eval_length
+    ) is not None:  # pyre-ignore
+        attention_sink_params = use_attention_sink.split(",")
+        assert len(attention_sink_params) == 3
+        sink_size = int(attention_sink_params[0])
+        window_size = int(attention_sink_params[1])
+
+        assert args.max_seq_length == sink_size + window_size
+
+        return AttentionSinkEvalWrapper(
+            model=model,
+            tokenizer=tokenizer,
+            sink_size=sink_size,
+            window_size=window_size,
+            max_seq_length=attention_sink_eval_length,
+            use_kv_cache=args.use_kv_cache,
+        )
+
     return EagerEvalWrapper(
         model=model,
         tokenizer=tokenizer,
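
The dispatch above keys off two CLI values: --use_attention_sink, a comma-separated triple whose first two fields are the sink and window sizes (the third field is parsed but not consumed in this hunk), and --attention_sink_eval_length, which becomes the wrapper's max_seq_length. A small sketch with assumed example values, showing the parsing and the consistency check against args.max_seq_length:

# Assumed example values, not taken from the commit.
use_attention_sink = "4,2044,1024"   # sink_size, window_size, third field unused here
attention_sink_eval_length = 8192    # how many tokens to evaluate per sequence
max_seq_length = 2048                # must equal sink_size + window_size

params = use_attention_sink.split(",")
assert len(params) == 3
sink_size, window_size = int(params[0]), int(params[1])
assert max_seq_length == sink_size + window_size  # 4 + 2044 == 2048

# The wrapper's KV cache covers sink_size + window_size positions, while
# evaluation runs over sequences up to attention_sink_eval_length tokens.
cache_size = sink_size + window_size
print(cache_size, attention_sink_eval_length)  # 2048 8192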
@@ -279,6 +336,12 @@ def build_args_parser() -> argparse.ArgumentParser:
         default=None,
         help="Save the checkpoint after source transformations, for other evaluation platform to run the same checkpoint.",
     )
+    parser.add_argument(
+        "--attention_sink_eval_length",
+        type=int,
+        default=2048,
+        help="The maximum length of the sequence to evaluate with attention sink.",
+    )
 
     return parser
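
For reference, the new flag in isolation behaves like any other argparse integer option; a minimal sketch (the real build_args_parser defines many more arguments):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--attention_sink_eval_length",
    type=int,
    default=2048,
    help="The maximum length of the sequence to evaluate with attention sink.",
)

# Default when the flag is omitted.
print(parser.parse_args([]).attention_sink_eval_length)  # 2048
# Overridden on the command line.
print(parser.parse_args(["--attention_sink_eval_length", "16384"]).attention_sink_eval_length)  # 16384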