Commit 63c5534

[Llava] Add max_context_len CLI arg
1 parent a1daab9 commit 63c5534

2 files changed: +11 -1 lines changed


examples/models/llava/export_llava.py

Lines changed: 9 additions & 1 deletion
@@ -281,6 +281,7 @@ def create_llava_config_from_args(args):
     llm_config = LlmConfig()
 
     llm_config.model.use_sdpa_with_kv_cache = args.use_sdpa_with_kv_cache
+    llm_config.export.max_context_length = args.max_context_len
     llm_config.export.max_seq_length = args.max_seq_len
     llm_config.export.output_name = args.pte_name
     llm_config.debug.profile_memory = args.profile_memory
@@ -296,6 +297,12 @@ def main():
         action=BooleanOptionalAction,
         help="Use sdpa_with_kv_cache custom op in LLava text model.",
     )
+    parser.add_argument(
+        "--max-context-len",
+        default=768,
+        type=int,
+        help="Maximum context length for the text model.",
+    )
     parser.add_argument(
         "--max-seq-len",
         default=768,
@@ -325,12 +332,13 @@ def main():
     llm_config = create_llava_config_from_args(args)
 
     logging.info(
-        f"Exporting Llava model to ExecuTorch with sdpa_with_kv_cache: {llm_config.model.use_sdpa_with_kv_cache}, max_seq_len: {llm_config.export.max_seq_length}"
+        f"Exporting Llava model to ExecuTorch with sdpa_with_kv_cache: {llm_config.model.use_sdpa_with_kv_cache}, max_seq_len: {llm_config.export.max_seq_length}, max_context_len: {llm_config.export.max_context_length}"
     )
 
     llava_model = LlavaModel(
         use_sdpa_with_kv_cache_op=llm_config.model.use_sdpa_with_kv_cache,
         max_seq_len=llm_config.export.max_seq_length,
+        max_context_len=llm_config.export.max_context_length,
     )
 
     executorch_program = export_all(llava_model)
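For reference, a minimal sketch of the new flag in isolation (not part of the commit): it reproduces the parser.add_argument call from the hunk above so the dash-to-underscore name mapping and the 768 default are visible. The parsed value is what create_llava_config_from_args copies onto llm_config.export.max_context_length.

import argparse

# Standalone reproduction of the CLI flag added in this commit; everything
# outside the two add_argument calls is illustrative scaffolding, not commit code.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--max-context-len",
    default=768,
    type=int,
    help="Maximum context length for the text model.",
)
parser.add_argument("--max-seq-len", default=768, type=int)

args = parser.parse_args(["--max-context-len", "2048"])
print(args.max_context_len)  # 2048 -> copied to llm_config.export.max_context_length
print(args.max_seq_len)      # 768  -> copied to llm_config.export.max_seq_length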

examples/models/llava/model.py

Lines changed: 2 additions & 0 deletions
@@ -66,6 +66,7 @@ def __init__(
         llava_model: LlavaForConditionalGeneration,
         image_processor: CLIPImageProcessor,
         use_sdpa_with_kv_cache_op: bool = True,
+        max_context_len: int = 768,
         max_seq_len: int = 768,
     ):
         super().__init__()
@@ -87,6 +88,7 @@ def __init__(
             enable_dynamic_shape=True,  # allow parallel prefill
             use_sdpa_with_kv_cache_op=use_sdpa_with_kv_cache_op,  # use sdpa_with_kv_cache op
             use_hf_rope=True,
+            max_context_len=max_context_len,
             max_seq_len=max_seq_len,
         )
         self.text_model = construct_transformer(self.text_model_args)
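Below is a hedged usage sketch (not part of the commit) mirroring the LlavaModel(...) call in export_llava.py above, with the new kwarg passed explicitly. The import path is an assumption based on the file location examples/models/llava/model.py.

# Assumed import path, derived from the file's location in the executorch repo;
# adjust to your checkout / package layout.
from executorch.examples.models.llava.model import LlavaModel

# Mirrors the export script's call above: max_context_len is forwarded into
# the text model args alongside max_seq_len (see the second hunk).
llava_model = LlavaModel(
    use_sdpa_with_kv_cache_op=True,
    max_seq_len=768,
    max_context_len=2048,
)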
