
Commit 58245af

Update on "[Executorch][Llama] Decouple input sequence length from kv cache context length"
Decouple the maximum sequence length used for shape dynamism in torch.export from the context length used to size the KV cache. Differential Revision: [D68448334](https://our.internmc.facebook.com/intern/diff/D68448334/) [ghstack-poisoned]
2 parents (90f75b3 + 67bd5d9) · commit 58245af
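
The idea behind this stack is easiest to see with a small sketch: the sequence-length dimension that torch.export treats as dynamic is bounded by a maximum input sequence length, while the KV cache buffers are sized by a separate, typically larger, context length. The toy module, dimensions, and concrete bounds below are illustrative assumptions, not the ExecuTorch Llama code.

```python
import torch
from torch.export import Dim, export


class ToyKVCacheModel(torch.nn.Module):
    """Toy stand-in for a decoder block with a fixed-size KV cache (assumption, not the real model)."""

    def __init__(self, dim: int, max_context_len: int):
        super().__init__()
        # The cache buffer is sized by the context length, independent of the
        # maximum input sequence length used at export time.
        self.register_buffer("k_cache", torch.zeros(1, max_context_len, dim))
        self.proj = torch.nn.Linear(dim, dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: [batch, seq_len, dim]; seq_len is the dynamic dimension.
        # A real model would write new keys/values into the cache at the
        # current position; here we only read from it to keep the sketch small.
        return self.proj(x) + self.k_cache[:, :1, :]


max_seq_len = 128       # bound on the dynamic input length (assumed value)
max_context_len = 2048  # KV cache capacity, chosen independently (assumed value)

model = ToyKVCacheModel(dim=64, max_context_len=max_context_len)
seq_dim = Dim("seq_len", max=max_seq_len)
ep = export(model, (torch.randn(1, 16, 64),), dynamic_shapes={"x": {1: seq_dim}})
print(ep)
```

With the two lengths decoupled, the exported program can accept short dynamic inputs (bounded by max_seq_len) while the cache still covers the full generation context.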

1 file changed (+1, -1)


examples/models/llama/source_transformation/attention_sink.py

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ def __init__(
         else:
             self.apply_rotary_emb_to_k = apply_rotary_emb_to_k
         self.max_context_length = window_size + sink_size
-        assert self.max_context_length == self.params.max_context_length
+        assert self.max_context_length == self.params.max_context_len
         self.eviction_batch_size = eviction_batch_size
         self.position_shift = 0
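
The one-line fix corrects the name of the field read from self.params (it is max_context_len, as the updated assert shows). The invariant being enforced is that the attention-sink cache, which keeps sink_size initial tokens plus a sliding window of window_size tokens, spans exactly the model's configured context length. A minimal numeric illustration, with assumed values:

```python
# Assumed example values, not taken from the commit.
sink_size = 4
window_size = 2044
params_max_context_len = 2048  # stands in for self.params.max_context_len

# Mirrors the lines in the diff above: the attention-sink cache length is
# window_size + sink_size, and it must equal the model's context length.
max_context_length = window_size + sink_size
assert max_context_length == params_max_context_len
```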

0 commit comments
