Commit 7d52002

Export model with KV cache + runner for Torchtune models
1 parent 37011d3

File tree

examples/models/llama/runner/generation.py
examples/models/llama3_2_vision/model.py
examples/models/model_factory.py

3 files changed: +17 -39 lines changed

examples/models/llama/runner/generation.py

Lines changed: 3 additions & 26 deletions
@@ -57,26 +57,12 @@ def __init__(
         max_batch_size: int,
         use_kv_cache: bool,
         vocab_size: int,
-        has_full_logits: bool = False,
         device: str = "cpu",
     ):
-        """
-        Constructor.
-
-        Args:
-            tokenizer_path: path to tokenizer.model file.
-            max_seq_len: max length of the output sequence, after which the output will be clipped.
-            max_batch_size: max batch size.
-            use_kv_cache: whether to use a KV cache.
-            vocab_size: number of items in the vocab.
-            has_full_logits: whether the model returns the full logits or only returns the last logit.
-            device: device to run the runner on.
-        """
         self.max_seq_len = max_seq_len
         self.max_batch_size = max_batch_size
         self.use_kv_cache = use_kv_cache
         self.tokenizer = get_tokenizer(tokenizer_path)
-        self.has_full_logits = has_full_logits
         self.device = device
         assert vocab_size == self.tokenizer.n_words

@@ -95,7 +81,7 @@ def generate( # noqa: C901
         top_p: float = 0.9,
         echo: bool = False,
     ) -> List[int]:
-        # prefill
+        # Prefill
         logits = self.forward(
             tokens=torch.tensor([prompt_tokens], dtype=torch.long, device=self.device),
             input_pos=(
@@ -105,11 +91,7 @@ def generate( # noqa: C901
             ),
         )

-        current_token = next_token(logits[:, -1, :], temperature, top_p)
-        if self.has_full_logits:
-            current_token = next_token(logits[:, -1, :], temperature, top_p)
-        else:
-            current_token = next_token(logits, temperature, top_p)
+        current_token = next_token(logits, temperature, top_p)
         tokens = prompt_tokens + [current_token]

         i = 0
@@ -129,12 +111,7 @@ def generate( # noqa: C901
                 tokens=torch.tensor([tokens], dtype=torch.long, device=self.device),
             )

-            # If the logits aren't already clipped to only contain the last logit, clip them.
-            if self.has_full_logits:
-                current_token = next_token(logits[:, -1, :], temperature, top_p)
-            else:
-                current_token = next_token(logits, temperature, top_p)
-
+            current_token = next_token(logits, temperature, top_p)
             if current_token == self.tokenizer.eos_id or (
                 hasattr(self.tokenizer, "stop_tokens")
                 and current_token in self.tokenizer.stop_tokens
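With has_full_logits gone, the runner assumes every exported model returns only the last position's logits, so both the prefill and decode paths call next_token(logits, ...) directly. For reference, here is a minimal sketch of what a next_token helper like the one called above typically does (temperature scaling plus top-p sampling); the name and details are assumptions for illustration, not the repo's exact implementation:

import torch


def next_token(logits: torch.Tensor, temperature: float, top_p: float) -> int:
    # Assumes `logits` holds only the last position's scores, e.g. shape
    # (1, vocab_size), which is what the runner expects after this change.
    if temperature <= 0:
        return int(torch.argmax(logits, dim=-1).item())
    probs = torch.softmax(logits / temperature, dim=-1).squeeze(0)
    # Top-p (nucleus) sampling: keep the smallest high-probability set whose
    # cumulative mass reaches top_p, renormalize, then sample from it.
    sorted_probs, sorted_idx = torch.sort(probs, descending=True)
    cumulative = torch.cumsum(sorted_probs, dim=-1)
    sorted_probs[cumulative - sorted_probs > top_p] = 0.0
    sorted_probs /= sorted_probs.sum()
    choice = torch.multinomial(sorted_probs, num_samples=1)
    return int(sorted_idx[choice].item())

A model that still produced full (batch, seq_len, vocab) logits would need the logits[:, -1, :] slice that the deleted branch used to apply.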

examples/models/llama3_2_vision/model.py

Lines changed: 13 additions & 12 deletions
@@ -134,12 +134,12 @@ def __init__(self, **kwargs):

         self.model_ = prune_output_vocab(self.model_, output_prune_map)

-        # if self.use_kv_cache:
-        #     print("Setting up KV cache on the model...")
-        #     self.model_.setup_caches(
-        #         batch_size=1,
-        #         dtype=self.dtype,
-        #     )
+        if self.use_kv_cache:
+            print("Setting up KV cache on the model...")
+            self.model_.setup_caches(
+                batch_size=1,
+                dtype=self.dtype,
+            )

     def get_eager_model(self) -> torch.nn.Module:
         if self.dtype:
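Re-enabling this block makes the torchtune decoder allocate its KV caches when the model is constructed, so the exported graph contains in-place cache reads and writes instead of recomputing attention over the whole history. As an illustration of what setup_caches allocates per attention layer, here is a stand-in cache in the spirit of torchtune's KVCache (the class and its update signature are assumptions, not torchtune's exact API):

import torch


class SketchKVCache:
    def __init__(self, batch_size: int, max_seq_len: int, n_heads: int,
                 head_dim: int, dtype: torch.dtype):
        # Fixed-size key/value buffers, sized once up front.
        shape = (batch_size, n_heads, max_seq_len, head_dim)
        self.k = torch.zeros(shape, dtype=dtype)
        self.v = torch.zeros(shape, dtype=dtype)

    def update(self, input_pos: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
        # Write the new keys/values at their absolute positions, then return
        # the full buffers so attention can cover everything cached so far.
        self.k[:, :, input_pos] = k
        self.v[:, :, input_pos] = v
        return self.k, self.v

With batch_size=1, each forward pass of the exported model then only needs the new tokens plus their positions, which is what the example inputs below provide.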
@@ -148,15 +148,16 @@ def get_eager_model(self) -> torch.nn.Module:
         return self.model_.to(torch.float16)

     def get_example_inputs(self):
-        return (torch.ones(1, 64, dtype=torch.long),)
+        return (torch.ones(1, 32, dtype=torch.long),)

     def get_example_kwarg_inputs(self):
-        # TODO: add input_pos and mask when after making cache work.
+        # For export we must use the prefill versions of the
+        # causal mask and input_pos.
         return {
-            # "mask": self.causal_mask[None, 64, None, :],
+            "mask": self.causal_mask[None, :32],
             # "encoder_input": None,
             # "encoder_mask": None,
-            # "input_pos": self.input_pos[None, 64]
+            "input_pos": self.input_pos[None, :32]
         }

     def get_dynamic_shapes(self):
@@ -166,7 +167,7 @@ def get_dynamic_shapes(self):
             "tokens": {0: batch_size, 1: dim_seq_len},
             # "encoder_input": {0: 1, 1: dim_enc, 2: 4096},
             # "encoder_mask": {0: 1, 1: dim, 2: dim_enc},
-            # "mask": {0: batch_size, 1: dim_seq_len, 2: self.max_seq_len},
-            # "input_pos" : {0: batch_size, 1: dim_seq_len},
+            "mask": {0: batch_size, 1: dim_seq_len, 2: dim_seq_len},
+            "input_pos" : {0: batch_size, 1: dim_seq_len},
         }
         return dynamic_shapes
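Because mask and input_pos are now real keyword inputs rather than comments, their sequence dimensions must be registered as dynamic alongside tokens, which is what the uncommented entries above do. Below is a self-contained sketch of the same pattern with torch.export; TinyDecoder, the Dim bounds, and all sizes are stand-ins (the real batch_size and dim_seq_len objects are defined earlier in this file), and the example mask is square here so both of its sequence dims can share one Dim:

import torch
from torch.export import Dim, export


class TinyDecoder(torch.nn.Module):
    # Stand-in for the torchtune decoder; only the call signature matters here.
    def __init__(self, vocab: int = 128, dim: int = 16, max_seq_len: int = 128):
        super().__init__()
        self.emb = torch.nn.Embedding(vocab, dim)
        self.pos = torch.nn.Embedding(max_seq_len, dim)
        self.out = torch.nn.Linear(dim, vocab)

    def forward(self, tokens, *, mask=None, input_pos=None):
        h = self.emb(tokens) + self.pos(input_pos)           # (1, s, dim)
        h = h * mask.to(h.dtype).mean(dim=-1, keepdim=True)  # exercise the mask input
        return self.out(h)                                   # (1, s, vocab)


max_seq_len = 128
causal_mask = torch.tril(torch.ones(max_seq_len, max_seq_len, dtype=torch.bool))
input_pos = torch.arange(max_seq_len)

seq_len = 32  # prefill length used for the example inputs
example_args = (torch.ones(1, seq_len, dtype=torch.long),)
example_kwargs = {
    "mask": causal_mask[None, :seq_len, :seq_len],
    "input_pos": input_pos[None, :seq_len],
}
dim_seq_len = Dim("token_dim", min=2, max=max_seq_len)
dynamic_shapes = {
    "tokens": {1: dim_seq_len},
    "mask": {1: dim_seq_len, 2: dim_seq_len},
    "input_pos": {1: dim_seq_len},
}

ep = export(TinyDecoder(), example_args, kwargs=example_kwargs,
            dynamic_shapes=dynamic_shapes)
print(ep)  # the exported program now accepts any prefill length in range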

examples/models/model_factory.py

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@ def create_model(
        model = model_class(**kwargs)
        example_kwarg_inputs = None
        dynamic_shapes = None
-       if hasattr(model, "get_example_kwarg_inputs()"):
+       if hasattr(model, "get_example_kwarg_inputs"):
            example_kwarg_inputs = model.get_example_kwarg_inputs()
        if hasattr(model, "get_dynamic_shapes"):
            dynamic_shapes = model.get_dynamic_shapes()
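The old string included parentheses, so hasattr looked for an attribute literally named "get_example_kwarg_inputs()", which can never exist; the check always returned False and example kwargs were silently skipped. A quick illustration:

class Model:
    def get_example_kwarg_inputs(self):
        return {"mask": None}


m = Model()
print(hasattr(m, "get_example_kwarg_inputs()"))  # False: no attribute has that literal name
print(hasattr(m, "get_example_kwarg_inputs"))    # True: the method exists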
