
Commit 22472c1

Update on "[Executorch][llm] Enable local global attention in export_llama script"
Added a new option, --local_global_attention, that takes a pattern of window sizes determining which layers use local sliding-window attention. For example, [0, 256, 256, 0, 256, 256] can be used for a 6-layer transformer, or you can pass [0, 256, 256] as the pattern to repeat across layers.

Differential Revision: [D73891423](https://our.internmc.facebook.com/intern/diff/D73891423/)

cc larryliu0820 mergennachin cccclai helunwencser jackzhxng

[ghstack-poisoned]
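To make the repeat semantics concrete, here is a minimal illustrative sketch (expand_pattern is a hypothetical helper, not the actual implementation; the real pattern is consumed by the replace_kv_cache_with_ring_kv_cache transform shown in the diff below). A 0 entry means regular global attention, and a positive entry is that layer's sliding-window size:

    # Hypothetical helper, for illustration only: tile a local/global
    # attention pattern across n_layers.
    def expand_pattern(pattern: list[int], n_layers: int) -> list[int]:
        # 0 = global attention; positive value = sliding-window size.
        return [pattern[i % len(pattern)] for i in range(n_layers)]

    # [0, 256, 256] repeated over a 6-layer transformer matches spelling
    # out [0, 256, 256, 0, 256, 256] directly.
    assert expand_pattern([0, 256, 256], 6) == [0, 256, 256, 0, 256, 256]
    assert expand_pattern([16], 4) == [16, 16, 16, 16]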
2 parents: e294da2 + e301bfc

File tree

2 files changed (+20, -2 lines)


examples/models/llama/export_llama_lib.py

Lines changed: 4 additions & 2 deletions
@@ -385,7 +385,9 @@ def build_args_parser() -> argparse.ArgumentParser:
         "--local_global_attention",
         type=parse_list_of_ints,
         default=None,
-        help="List of integers specifying local and global attention pattern, e.g., [0, 16, 0, 16].",
+        help="List of integers specifying local and global attention pattern, e.g., [0, 16, 0, 16] to specify that every other layer uses a sliding window of 16."
+        " A [0, 16, 32] pattern specifies that the 2nd and 3rd layers have sliding windows of 16 and 32 respectively."
+        " A [16] pattern specifies that all layers have a sliding window of 16.",
     )
 
     parser.add_argument("-2", "--fairseq2", action="store_true")
@@ -1332,7 +1334,7 @@ def _get_source_transforms(  # noqa
     if args.vulkan:
         transforms.append(replace_with_vulkan_rotary_emb)
 
-    if args.local_global_attention:
+    if getattr(args, "local_global_attention", None) is not None:
         transforms.append(
             partial(
                 replace_kv_cache_with_ring_kv_cache,
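The guard change uses getattr so that an args namespace created without this attribute (for example, one constructed programmatically in tests) does not raise AttributeError, while the explicit is-not-None check keeps the default of None from enabling the transform. A hypothetical invocation with the new flag might look like the following (the export_llama entry point and the quoted-list syntax are assumptions based on the help text above; all other required flags are elided):

    python -m examples.models.llama.export_llama --local_global_attention "[0, 16, 0, 16]" ...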

examples/models/llama/tests/TARGETS

Lines changed: 16 additions & 0 deletions
@@ -85,3 +85,19 @@ python_unittest(
         "//executorch/examples/models/llama:sdpa",
     ],
 )
+
+python_unittest(
+    name = "test_export_llama_lib",
+    srcs = [
+        "test_export_llama_lib.py",
+    ],
+    preload_deps = [
+        "//executorch/extension/llm/custom_ops:custom_ops_aot_lib",
+    ],
+    deps = [
+        "//caffe2:torch",
+        "//executorch/examples/models/llama:export_library",
+        "//executorch/examples/models/llama:llama_transformer",
+        "//executorch/extension/pybindings:portable_lib",
+    ],
+)
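With this TARGETS entry, the new unit test can be run with Buck; assuming the standard executorch cell layout implied by the //executorch/... labels above, the command would be:

    buck2 test //executorch/examples/models/llama/tests:test_export_llama_lib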
