Commit 56f84cc (2 parents: 9d1824d + b3944b3)

Update
[ghstack-poisoned]

12 files changed: +178 −26 lines changed

CMakeLists.txt

Lines changed: 2 additions & 14 deletions

@@ -647,18 +647,14 @@ if(EXECUTORCH_BUILD_EXTENSION_LLM)
   list(APPEND _executorch_extensions tokenizers)
 endif()

-if(EXECUTORCH_BUILD_EXTENSION_LLM_APPLE)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/apple)
-  list(APPEND _executorch_extensions extension_llm_apple)
-endif()
-
 if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/runner)
   list(APPEND _executorch_extensions extension_llm_runner)
 endif()

 if(EXECUTORCH_BUILD_EXTENSION_LLM_APPLE)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/apple)
+  list(APPEND _executorch_extensions extension_llm_apple)
 endif()

 if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL)

@@ -802,7 +798,6 @@ if(EXECUTORCH_BUILD_VGF)
   list(APPEND _executorch_backends vgf_backend)
 endif()

-
 # Top-level interface targets.

 # A target containing all configured backends.

@@ -869,17 +864,10 @@ else()
 endif()
 target_link_libraries(executorch_kernels INTERFACE ${_executorch_kernels})

-install(
-  TARGETS executorch_backends executorch_extensions executorch_kernels
-  INCLUDES
-  DESTINATION ${_common_include_directories}
-)
-
 if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
   # Baseline libraries that executor_runner will link against.
   set(_executor_runner_libs executorch extension_evalue_util
-                            extension_runner_util gflags
-                            executorch_backends
+                            extension_runner_util gflags executorch_backends
   )

   if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)

backends/xnnpack/operators/node_visitor.py

Lines changed: 4 additions & 3 deletions

@@ -622,9 +622,10 @@ def get_serialized_buffer_index(
         )

         external_tag = tensor.meta.get("delegate_constant_tag", None)
-        logging.info(
-            f"Adding constant data with name {tensor.name}, key {named_key} and external_tag {external_tag} to named_data_store"
-        )
+        if external_tag is not None:
+            logging.info(
+                f"Adding constant data with name {tensor.name}, key {named_key} and external_tag {external_tag} to named_data_store"
+            )
         self._named_data_store.add_named_data(
             named_key,
             bytes(array),

devtools/etrecord/tests/etrecord_test.py

Lines changed: 11 additions & 0 deletions

@@ -92,6 +92,17 @@ def check_graph_closeness(self, graph_a, graph_b):
             self.assertEqual(
                 node_a.meta.get("debug_handle"), node_b.meta.get("debug_handle")
             )
+            from_node_a = node_a.meta.get("from_node")
+            from_node_b = node_b.meta.get("from_node")
+
+            if from_node_a is None:
+                self.assertIsNone(from_node_b)
+            else:
+                self.assertIsNotNone(from_node_b)
+                for node_source_a, node_source_b in zip(from_node_a, from_node_b):
+                    self.assertEqual(
+                        node_source_a.to_dict(), node_source_b.to_dict()
+                    )

     def test_etrecord_generation(self):
         captured_output, edge_output, et_output = self.get_test_model()

examples/models/llama/export_llama_lib.py

Lines changed: 12 additions & 0 deletions

@@ -239,6 +239,18 @@ def build_args_parser() -> argparse.ArgumentParser:
         help="checkpoint directory. Use with a sharded checkpoint, not for the standard llama2 model. Note, checkpoint_dir takes precedence over checkpoint if both are set.",
     )

+    parser.add_argument(
+        "--adapter_checkpoint",
+        required=False,
+        help="Path to the adapter.pt file from torchtune. Used if the model has trained LoRA adapters. Must provide adapter_config.json",
+    )
+
+    parser.add_argument(
+        "--adapter_config",
+        required=False,
+        help="Path to the adapter_config.json file. Used if the model has trained LoRA adapters. Must provide adapter_checkpoint.",
+    )
+
     parser.add_argument(
         "--use_qnn_sha",
         action="store_true",

examples/models/llama/model.py

Lines changed: 22 additions & 0 deletions

@@ -46,6 +46,13 @@ def __init__(self, llm_config: Optional[LlmConfig] = None):
         checkpoint_dir = self.llm_config.base.checkpoint_dir
         params_path = self.llm_config.base.params

+        # Adapter checkpoint and config.
+        adapter_checkpoint_path = self.llm_config.base.adapter_checkpoint
+        adapter_config_path = self.llm_config.base.adapter_config
+        assert (adapter_checkpoint_path is None and adapter_config_path is None) or (
+            adapter_checkpoint_path is not None and adapter_config_path is not None
+        ), "Both adapter_checkpoint_path and adapter_config_path must be specified or neither must be specified."
+
         self.use_kv_cache = self.llm_config.model.use_kv_cache
         self.use_sdpa_with_kv_cache_op = self.llm_config.model.use_sdpa_with_kv_cache
         self.generate_full_logits = self.llm_config.debug.generate_full_logits

@@ -129,6 +136,20 @@ def __init__(self, llm_config: Optional[LlmConfig] = None):
         with open(params_path, "r") as f:
             params = json.loads(f.read())

+        # Get adapter checkpoint and config.
+        adapter_checkpoint = {}
+        adapter_config = {}
+        if adapter_checkpoint_path:
+            adapter_checkpoint = torch.load(
+                adapter_checkpoint_path, map_location=device, mmap=True
+            )
+            from torchtune.models import convert_weights
+
+            adapter_checkpoint = convert_weights.tune_to_meta(adapter_checkpoint)
+            with open(adapter_config_path, "r") as f:
+                adapter_config = json.loads(f.read())
+            checkpoint.update(adapter_checkpoint)
+
         output_prune_map = None
         if self.output_prune_map_path is not None:
             with open(self.output_prune_map_path, "r") as f:

@@ -153,6 +174,7 @@ def __init__(self, llm_config: Optional[LlmConfig] = None):
             output_prune_map=output_prune_map,
             enable_dynamic_shape=self.enable_dynamic_shape,
             **params,
+            **adapter_config,
         )

         if model_args.use_scaled_rope:

examples/models/llama/model_args.py

Lines changed: 1 addition & 1 deletion

@@ -59,7 +59,7 @@ class ModelArgs:
     lora_args: Optional[dict] = None

     # LoRA arguments to set up a LoRA inference model.
-    # These arguments come directly from a torchtune LoRA config.
+    # These arguments come directly from a torchtune adapter_config.json file.
     r: Optional[int] = None  # Rank.
     lora_alpha: Optional[int] = None  # Alpha.
     # Eg. q_proj, k_proj, v_proj, output_proj
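
Because model.py splats the parsed adapter_config.json into ModelArgs via **adapter_config, the file's keys must match the field names above. A hypothetical minimal file, shown as the dict it parses to; only r and lora_alpha appear in this diff, and any other keys a real torchtune file carries (such as the projection-module list hinted at by the last comment) are omitted here:

import json

# Hypothetical contents of adapter_config.json; a real torchtune file may
# include additional keys, which must also be valid ModelArgs fields.
adapter_config = json.loads('{"r": 8, "lora_alpha": 16}')
# ModelArgs(..., **params, **adapter_config) then receives r=8, lora_alpha=16.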

examples/models/llama/runner/static_attention_io_manager.h

Lines changed: 38 additions & 0 deletions

@@ -602,6 +602,39 @@ class StaticAttentionIOManager {
     }
   }

+  /**
+   * Prefill helper. Run multiple inferences as needed depending on the length
+   * of the prompt and method's input length. Returns the position in the output
+   * that corresponds to the end of the prompt during the last inference.
+   */
+  template <typename TokenT>
+  size_t prefill(
+      executorch::runtime::Span<TokenT> tokens,
+      executorch::runtime::Span<TokenT> input_buffer,
+      executorch::runtime::Method& method) {
+    size_t input_len = input_buffer.size();
+    get_mask(input_buffer.size()).set_causal_mask();
+
+    size_t batch_len = 0;
+    for (size_t i = 0; i < tokens.size(); i += input_len) {
+      batch_len = std::min(input_len, tokens.size() - i);
+      std::copy(&tokens[i], &tokens[i + batch_len], input_buffer.begin());
+      prepare(method);
+      ET_CHECK(method.execute() == executorch::runtime::Error::Ok);
+      update(
+          method,
+          config_.k_cache_output_indices,
+          config_.v_cache_output_indices,
+          batch_len);
+    }
+    return batch_len - 1;
+  }
+
+  /**
+   * Decode helper. The `sample` argument is called after each inference and
+   * should retrieve the logits from the `method` argument's output and return
+   * the sampled token.
+   */
   template <typename TokenT>
   std::vector<TokenT> decode(
       TokenT prev_tok,

@@ -632,6 +665,11 @@ class StaticAttentionIOManager {
     return generated_tokens;
   }

+  /**
+   * Lookahead decode helper. The `sample` argument is called after each
+   * inference and should retrieve the logits from the `method` argument's
+   * output and return the sampled token for all output positions.
+   */
   template <typename TokenT>
   std::vector<TokenT> lookahead_decode(
       TokenT prev_tok,

exir/serde/serialize.py

Lines changed: 31 additions & 0 deletions

@@ -41,6 +41,7 @@
 )
 from torch._export.verifier import load_verifier
 from torch.fx.experimental import symbolic_shapes
+from torch.fx.traceback import NodeSource

 log: logging.Logger = logging.getLogger(__name__)


@@ -141,8 +142,24 @@ def serialize_metadata(self, node: torch.fx.Node) -> Dict[str, str]:
             debug_handle = node.meta["debug_handle"]
             meta["debug_handle"] = str(debug_handle)

+        if "from_node" in node.meta:
+            from_node = node.meta["from_node"]
+            # Serialize from_node as JSON since it's a complex nested structure
+            meta["from_node"] = json.dumps(self._make_from_node_json_acceptable(from_node))
+
         return meta

+    def _make_from_node_json_acceptable(self, from_node: Optional[List[NodeSource]]):
+        """
+        Serialize from_node metadata from a list of NodeSource objects to a list of dictionaries.
+        """
+        if from_node is None:
+            return None
+
+        json_acceptable_from_node = [node_source.to_dict() for node_source in from_node if isinstance(node_source, NodeSource)]
+
+        return json_acceptable_from_node
+
     def serialize_alloc_inputs(
         self, inputs  # pyre-ignore
     ) -> List[schema.NamedArgument]:

@@ -473,8 +490,22 @@ def deserialize_metadata(self, metadata: Dict[str, str]) -> Dict[str, Any]:
         if debug_handle := metadata.get("debug_handle"):
             res["debug_handle"] = int(debug_handle)

+        if from_node_str := metadata.get("from_node"):
+            res["from_node"] = self._deserialize_from_node(json.loads(from_node_str))
+
         return res

+    def _deserialize_from_node(self, from_node_data: Optional[List[Dict[str, Any]]]) -> Optional[List[NodeSource]]:
+        """
+        Recursively deserialize from_node metadata from JSON data.
+        """
+        if from_node_data is None:
+            return None
+
+        assert isinstance(from_node_data, list)
+
+        return [NodeSource._from_dict(fn_dict) for fn_dict in from_node_data]
+
     # pyre-ignore
     def deserialize_alloc_inputs(self, serialized_inputs: List[schema.NamedArgument]):
         def deserialize_alloc_spec(serialized_alloc_spec: str) -> memory.AllocSpec:
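
For context, a rough round-trip sketch of what the new from_node handling amounts to, built from the same to_dict()/_from_dict() calls used above; the executorch.exir import path and the tiny module are illustrative assumptions, not part of this commit:

# Minimal round-trip sketch of the from_node metadata, not the repo's code.
import json

import torch
from torch.export import export
from torch.fx.traceback import NodeSource

from executorch.exir import to_edge  # assumed import path


class TinyModule(torch.nn.Module):
    def forward(self, x):
        return x + 1


edge = to_edge(export(TinyModule(), (torch.randn(2),), strict=True))
for node in edge.exported_program().graph_module.graph.nodes:
    from_node = node.meta.get("from_node")
    if not from_node:
        continue
    # Encode: NodeSource objects -> list of dicts -> JSON string (what serialize_metadata stores).
    encoded = json.dumps([ns.to_dict() for ns in from_node])
    # Decode: JSON string -> list of dicts -> NodeSource objects (what deserialize_metadata rebuilds).
    decoded = [NodeSource._from_dict(d) for d in json.loads(encoded)]
    assert [ns.to_dict() for ns in decoded] == [ns.to_dict() for ns in from_node]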

exir/tests/test_serde.py

Lines changed: 34 additions & 0 deletions

@@ -275,3 +275,37 @@ def forward(self, x):
         )
         self.assertEqual(metadata[0], metadata_serde[0])
         self.assertEqual(list(metadata[1].keys()), list(metadata_serde[1].keys()))
+
+    def test_meta_debug_handle_and_from_node(self) -> None:
+        class Model(nn.Module):
+            def __init__(self):
+                super(Model, self).__init__()
+                self.conv_layer = nn.Conv2d(
+                    in_channels=1, out_channels=64, kernel_size=3, padding=1
+                )
+
+            def forward(self, x):
+                return self.conv_layer(x)
+
+        m = Model()
+        inputs = (torch.randn(1, 1, 32, 32),)
+
+        edge = to_edge(export(m, inputs, strict=True))
+        edge_new = deserialize(serialize(edge.exported_program()))
+        for node, node_new in zip(
+            edge.exported_program().graph_module.graph.nodes,
+            edge_new.graph_module.graph.nodes,
+        ):
+            if node.op not in {"placeholder", "output"}:
+                self.assertIsNotNone(node.meta.get("debug_handle"))
+                self.assertIsNotNone(node.meta.get("from_node"))
+                self.assertEqual(
+                    node.meta.get("debug_handle"), node_new.meta.get("debug_handle")
+                )
+                self.assertEqual(
+                    len(node.meta.get("from_node")), len(node_new.meta.get("from_node"))
+                )
+                for node_source, node_source_new in zip(
+                    node.meta.get("from_node"), node_new.meta.get("from_node")
+                ):
+                    self.assertEqual(node_source.to_dict(), node_source_new.to_dict())

extension/llm/export/config/llm_config.py

Lines changed: 9 additions & 1 deletion

@@ -73,10 +73,16 @@ class BaseConfig:
            if it is a Llama model or the weights will be downloaded from HuggingFace
            if it is a non-Llama model.
        checkpoint_dir: Path to directory containing sharded checkpoint files.
+       adapter_checkpoint: Path to the adapter.pt file from torchtune. Used if
+           the model has trained LoRA adapters. Must provide
+           adapter_config.json.
+       adapter_config: Path to the adapter_config.json file from torchtune.
+           Used if the model has trained LoRA adapters. Must provide adapter.pt.
        tokenizer_path: Path to the tokenizer file.
        metadata: Json string containing metadata information.
            e.g. '"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"'
-       use_lora: Rank of the LoRA, if set to 0 then this means no LoRA. For use with QAT.
+       use_lora: Only for use with QAT. Rank of the LoRA adapter, disabled
+           if set to 0.
        fairseq2: For legacy internal use cases, this is safe to ignore.
        preq_mode: Legacy option to specify how prequantized weights are loaded.
            Going forward, ExecuTorch supports loading weights prequantized through

@@ -90,6 +96,8 @@ class BaseConfig:
     params: Optional[str] = None
     checkpoint: Optional[str] = None
     checkpoint_dir: Optional[str] = None
+    adapter_checkpoint: Optional[str] = None
+    adapter_config: Optional[str] = None
     tokenizer_path: Optional[str] = None
     metadata: Optional[str] = None
     use_lora: int = 0
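
Taken together with examples/models/llama/model.py above, which reads llm_config.base.adapter_checkpoint and llm_config.base.adapter_config, the new fields are set like any other BaseConfig path. A minimal illustration; the import path and default construction of LlmConfig are assumptions, only the field names come from this diff:

# Illustrative sketch only, not part of this commit.
from executorch.extension.llm.export.config.llm_config import LlmConfig  # assumed path

llm_config = LlmConfig()
llm_config.base.checkpoint = "/path/to/consolidated.00.pth"
llm_config.base.params = "/path/to/params.json"
# Set both adapter fields together, or neither (enforced by the assert in model.py).
llm_config.base.adapter_checkpoint = "/path/to/adapter.pt"
llm_config.base.adapter_config = "/path/to/adapter_config.json"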
