
Commit bc755c6

[Llava] Add max_context_len CLI arg (#14599)
### Summary

Add a required `max_context_len` argument to the Llava example model export. When set to 768, this reduces memory consumption (~6 GiB -> ~4.8 GiB RSS) at the cost of a smaller context length, and thus fixes #14474.

### Test plan

Ran `./test_llava.sh` and validated the reported memory consumption on an x86 Linux machine.

```
I 00:00:18.433471 executorch:main.cpp:172] Starting generation...
I 00:00:18.433500 executorch:multimodal_runner.cpp:95] RSS after loading model: 4746.726562 MiB (0 if unsupported)
I 00:00:18.433554 executorch:multimodal_runner.cpp:119] Prefilling input 0/3, type: text
I 00:00:19.484581 executorch:multimodal_runner.cpp:119] Prefilling input 1/3, type: image
I 00:00:19.484710 executorch:multimodal_prefiller.cpp:83] Image tensor dim: 3, dtype: Byte
I 00:00:30.442685 executorch:multimodal_runner.cpp:119] Prefilling input 2/3, type: text
I 00:00:30.951938 executorch:multimodal_runner.cpp:138] RSS after multimodal input processing: 4847.933594 MiB (0 if unsupported)
I 00:00:30.952000 executorch:multimodal_runner.cpp:148] Max new tokens resolved: 153, pos_ 615, max_context_len 768
```
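For context on where the savings come from: the text model's KV cache scales linearly with `max_context_len`. Below is a back-of-envelope sketch, assuming Llama-2-7B-style dimensions for the llava-1.5-7b text backbone (32 layers, 32 heads, head dim 128) and an fp32 cache; these numbers are assumptions, not values read from the export code.

```python
# Rough KV-cache footprint for the Llava text model as a function of
# max_context_len. All dimensions below are assumptions (Llama-2-7B-style
# backbone for llava-1.5-7b), not values taken from this diff.
N_LAYERS = 32        # transformer layers (assumption)
N_HEADS = 32         # attention heads (assumption)
HEAD_DIM = 128       # per-head dimension (assumption)
BYTES_PER_ELEM = 4   # fp32 cache (assumption; a quantized cache is smaller)

def kv_cache_bytes(max_context_len: int) -> int:
    # One K and one V cache per layer, each [max_context_len, n_heads * head_dim].
    return 2 * N_LAYERS * max_context_len * N_HEADS * HEAD_DIM * BYTES_PER_ELEM

for ctx in (768, 2048):
    print(f"max_context_len={ctx}: {kv_cache_bytes(ctx) / 2**20:.0f} MiB")
# max_context_len=768: 768 MiB
# max_context_len=2048: 2048 MiB
```

If the previous export sized the cache for something like 2048 tokens (an assumption; the old value is not shown in this diff), the ~1.25 GiB difference would line up with the ~1.2 GiB RSS drop reported in the test plan.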
1 parent bef9555 commit bc755c6

File tree

4 files changed (+18, -4 lines)


.ci/scripts/test_llava.sh

Lines changed: 1 addition & 1 deletion
```diff
@@ -107,7 +107,7 @@ cmake_build_llava_runner_for_android() {
 # only export the one without custom op for now since it's
 export_llava() {
   echo "Starting to export Llava. This will take about 6 mins"
-  $PYTHON_EXECUTABLE -m executorch.examples.models.llava.export_llava --pte-name llava.pte --with-artifacts
+  $PYTHON_EXECUTABLE -m executorch.examples.models.llava.export_llava --pte-name llava.pte --with-artifacts --max-context-len 768
 }
 
 # Download a new image
```

examples/models/llava/README.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -48,7 +48,7 @@ Prerequisite: run `install_executorch.sh` to install ExecuTorch and run
 `examples/models/llava/install_requirements.sh` to install dependencies.
 
 ```bash
-python -m executorch.examples.models.llava.export_llava --pte-name llava.pte --with-artifacts
+python -m executorch.examples.models.llava.export_llava --pte-name llava.pte --with-artifacts --max-context-len=768
 ```
 
 Currently the whole export process takes about 6 minutes. We also provide a
````

examples/models/llava/export_llava.py

Lines changed: 9 additions & 1 deletion
```diff
@@ -281,6 +281,7 @@ def create_llava_config_from_args(args):
     llm_config = LlmConfig()
 
     llm_config.model.use_sdpa_with_kv_cache = args.use_sdpa_with_kv_cache
+    llm_config.export.max_context_length = args.max_context_len
     llm_config.export.max_seq_length = args.max_seq_len
     llm_config.export.output_name = args.pte_name
     llm_config.debug.profile_memory = args.profile_memory
@@ -296,6 +297,12 @@ def main():
         action=BooleanOptionalAction,
         help="Use sdpa_with_kv_cache custom op in LLava text model.",
     )
+    parser.add_argument(
+        "--max-context-len",
+        required=True,
+        type=int,
+        help="Maximum context length for the text model.",
+    )
     parser.add_argument(
         "--max-seq-len",
         default=768,
@@ -325,12 +332,13 @@
     llm_config = create_llava_config_from_args(args)
 
     logging.info(
-        f"Exporting Llava model to ExecuTorch with sdpa_with_kv_cache: {llm_config.model.use_sdpa_with_kv_cache}, max_seq_len: {llm_config.export.max_seq_length}"
+        f"Exporting Llava model to ExecuTorch with sdpa_with_kv_cache: {llm_config.model.use_sdpa_with_kv_cache}, max_seq_len: {llm_config.export.max_seq_length}, max_context_len: {llm_config.export.max_context_length}"
     )
 
     llava_model = LlavaModel(
         use_sdpa_with_kv_cache_op=llm_config.model.use_sdpa_with_kv_cache,
         max_seq_len=llm_config.export.max_seq_length,
+        max_context_len=llm_config.export.max_context_length,
     )
 
     executorch_program = export_all(llava_model)
```
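Since the new flag is declared with `required=True`, an invocation that omits it now fails at argument parsing instead of partway through a roughly six-minute export. A minimal, self-contained sketch of that argparse behavior (the parser below is illustrative, not the actual exporter):

```python
from argparse import ArgumentParser

# Illustrative parser mirroring the new required flag.
parser = ArgumentParser(prog="export_llava_sketch")
parser.add_argument(
    "--max-context-len",
    required=True,  # omitting the flag makes parse_args() exit with an error
    type=int,
    help="Maximum context length for the text model.",
)

args = parser.parse_args(["--max-context-len", "768"])
assert args.max_context_len == 768  # dashes become underscores in the namespace

# parser.parse_args([]) would exit with:
#   error: the following arguments are required: --max-context-len
```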

examples/models/llava/model.py

Lines changed: 7 additions & 1 deletion
```diff
@@ -66,6 +66,7 @@ def __init__(
         llava_model: LlavaForConditionalGeneration,
         image_processor: CLIPImageProcessor,
         use_sdpa_with_kv_cache_op: bool = True,
+        max_context_len: int = 768,
         max_seq_len: int = 768,
     ):
         super().__init__()
@@ -87,6 +88,7 @@ def __init__(
             enable_dynamic_shape=True,  # allow parallel prefill
             use_sdpa_with_kv_cache_op=use_sdpa_with_kv_cache_op,  # use sdpa_with_kv_cache op
             use_hf_rope=True,
+            max_context_len=max_context_len,
             max_seq_len=max_seq_len,
         )
         self.text_model = construct_transformer(self.text_model_args)
@@ -300,8 +302,11 @@ def forward(
 
 
 class LlavaModel(EagerModelBase):
-    def __init__(self, use_sdpa_with_kv_cache_op=True, max_seq_len=768):
+    def __init__(
+        self, use_sdpa_with_kv_cache_op=True, max_seq_len=768, max_context_len=768
+    ):
         self.use_sdpa_with_kv_cache_op = use_sdpa_with_kv_cache_op
+        self.max_context_len = max_context_len
         self.max_seq_len = max_seq_len
         self.model = LlavaForConditionalGeneration.from_pretrained(
             "llava-hf/llava-1.5-7b-hf",
@@ -348,6 +353,7 @@ def get_eager_model(self):
             self.model,
             self.image_processor,
             self.use_sdpa_with_kv_cache_op,
+            self.max_context_len,
             self.max_seq_len,
         )
         model.to(dtype=torch.float32)
```
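The model now threads through two related limits: `max_seq_len` bounds a single input sequence, while `max_context_len` bounds prompt plus generated tokens held in the KV cache. The runner log in the test plan reflects the latter: with prefill ending at position 615 in a 768-token context, 153 new tokens remain. A small illustrative check of that arithmetic (the helper name here is made up for this sketch, not taken from the runner):

```python
def resolve_max_new_tokens(pos: int, max_context_len: int) -> int:
    # Tokens that still fit in the KV cache after prefill; mirrors the
    # "Max new tokens resolved" line in the runner log above.
    assert pos < max_context_len, "prompt already fills the context"
    return max_context_len - pos

# From the test plan: pos_ 615, max_context_len 768 -> 153 new tokens.
assert resolve_max_new_tokens(615, 768) == 153
```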
