Commit 41ff413 (1 parent: de10852)

Add kv cache args in get_example_inputs_kvcache_sdpa

1 file changed: examples/models/llama/model.py (13 additions, 12 deletions)
```diff
@@ -300,7 +300,7 @@ def get_example_inputs(self):
     # assumption is the custom op doesnt support dynamic shape right now. It might but its untested so lets first get static shape working
     def get_example_inputs_kvcache_sdpa(self):
         if self.enable_dynamic_shape:
-            return (
+            args = (
                 torch.tensor(
                     [[0 for _ in range(self.static_seq_length)]], dtype=torch.long
                 ),
@@ -315,18 +315,19 @@ def get_example_inputs_kvcache_sdpa(self):
                     [0], dtype=torch.long
                 ),  # start_pos, what token of output are we on.
             )
-        if self.decode_kv_cache_as_io:
-            args = args + (
-                # (n_layers, max_batch_size, n_heads, max_seq_length, head_dim)
-                torch.zeros(self._cache_shape, dtype=torch.float16),  # k-cache
-                torch.zeros(self._cache_shape, dtype=torch.float16),  # v-cache
-            )
+
+            if self.decode_kv_cache_as_io:
+                args = args + (
+                    # (n_layers, max_batch_size, n_heads, max_seq_length, head_dim)
+                    torch.zeros(self._cache_shape, dtype=torch.float16),  # k-cache
+                    torch.zeros(self._cache_shape, dtype=torch.float16),  # v-cache
+                )
 
-        if self.use_additive_kv_cache_update:
-            args = args + (
-                torch.zeros(self._cache_pos_mask_shape, dtype=torch.float16),
-            )
-        return args
+            if self.use_additive_kv_cache_update:
+                args = args + (
+                    torch.zeros(self._cache_pos_mask_shape, dtype=torch.float16),
+                )
+            return args
 
     def _transform_for_pre_quantization(self, checkpoint, model_args):
         assert hasattr(self.args, "preq_mode"), "preq_mode must be specified"
```
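Read as a whole, the patched method now builds the example-input tuple once (`args = (` instead of an early `return (`) and then conditionally appends the cache tensors before returning. Below is a minimal, self-contained sketch of that control flow. The module-level flags and shapes stand in for the `self.*` attributes seen in the diff; the concrete values of `cache_shape` and `cache_pos_mask_shape` are placeholder assumptions, and the tensor elements elided between the two hunks are omitted.

```python
import torch

# Stand-ins for the self.* attributes referenced in the diff; real values
# come from the model config. Shapes here are illustrative placeholders.
enable_dynamic_shape = True
static_seq_length = 1
decode_kv_cache_as_io = True
use_additive_kv_cache_update = False
cache_shape = (2, 1, 4, 16, 8)  # (n_layers, max_batch_size, n_heads, max_seq_length, head_dim)
cache_pos_mask_shape = (1, 16)  # placeholder; the real shape is model-defined

def get_example_inputs_kvcache_sdpa():
    if enable_dynamic_shape:
        args = (
            torch.tensor([[0] * static_seq_length], dtype=torch.long),  # tokens
            torch.tensor([0], dtype=torch.long),  # start_pos, what token of output we are on
        )
        if decode_kv_cache_as_io:
            # The commit's key change: the k/v caches are appended to the
            # example inputs so they travel through export as explicit IO.
            args = args + (
                torch.zeros(cache_shape, dtype=torch.float16),  # k-cache
                torch.zeros(cache_shape, dtype=torch.float16),  # v-cache
            )
        if use_additive_kv_cache_update:
            # One extra mask tensor for the additive cache-position update.
            args = args + (
                torch.zeros(cache_pos_mask_shape, dtype=torch.float16),
            )
        return args

example_inputs = get_example_inputs_kvcache_sdpa()
print([tuple(t.shape) for t in example_inputs])
```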

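Example inputs like these exist to drive tracing: the tuple's arity and dtypes must match the model's `forward` signature, which is why the k/v cache tensors have to be appended whenever the caches are treated as inputs/outputs. A toy sketch with `torch.export` illustrating this; the `ToyDecodeStep` module is a hypothetical stand-in, not the llama model.

```python
import torch

class ToyDecodeStep(torch.nn.Module):
    # Hypothetical stand-in for the decode step: it consumes the same four
    # positional inputs the tuple provides when decode_kv_cache_as_io is set.
    def forward(self, tokens, start_pos, k_cache, v_cache):
        return tokens.float().sum() + start_pos.float() + k_cache.sum() + v_cache.sum()

cache_shape = (2, 1, 4, 16, 8)  # (n_layers, max_batch_size, n_heads, max_seq_length, head_dim)
example_inputs = (
    torch.tensor([[0]], dtype=torch.long),          # tokens
    torch.tensor([0], dtype=torch.long),            # start_pos
    torch.zeros(cache_shape, dtype=torch.float16),  # k-cache
    torch.zeros(cache_shape, dtype=torch.float16),  # v-cache
)

# torch.export traces forward with exactly these positional inputs, so the
# tuple's arity must match the signature -- the point of appending cache args.
exported = torch.export.export(ToyDecodeStep(), example_inputs)
print(exported.graph_signature)
```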