pytorch
diff --git a/‎CMakeLists.txt‎
Lines changed: 47 additions & 17 deletions b/‎CMakeLists.txt‎
Lines changed: 47 additions & 17 deletions
diff --git a/‎backends/vulkan/CMakeLists.txt‎
Lines changed: 0 additions & 21 deletions b/‎backends/vulkan/CMakeLists.txt‎
Lines changed: 0 additions & 21 deletions
diff --git a/‎backends/xnnpack/operators/node_visitor.py‎
Lines changed: 4 additions & 3 deletions b/‎backends/xnnpack/operators/node_visitor.py‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎devtools/etrecord/tests/etrecord_test.py‎
Lines changed: 11 additions & 0 deletions b/‎devtools/etrecord/tests/etrecord_test.py‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎examples/models/llama/export_llama_lib.py‎
Lines changed: 12 additions & 0 deletions b/‎examples/models/llama/export_llama_lib.py‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎examples/models/llama/model.py‎
Lines changed: 22 additions & 0 deletions b/‎examples/models/llama/model.py‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎examples/models/llama/model_args.py‎
Lines changed: 1 addition & 1 deletion b/‎examples/models/llama/model_args.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎examples/models/llama/runner/static_attention_io_manager.h‎
Lines changed: 38 additions & 0 deletions b/‎examples/models/llama/runner/static_attention_io_manager.h‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎examples/xnnpack/aot_compiler.py‎
Lines changed: 2 additions & 2 deletions b/‎examples/xnnpack/aot_compiler.py‎
Lines changed: 2 additions & 2 deletions
@@ -380,6 +380,12 @@ add_library(executorch_core ${_executorch_core__srcs})
 # Legacy name alias.
 add_library(executorch_no_prim_ops ALIAS executorch_core)
 
+# A list of all configured backends.
+set(_executorch_backends "")
+
+# A list of all configured extensions.
+set(_executorch_extensions "")
+
 target_link_libraries(executorch_core PRIVATE program_schema)
 if(ANDROID)
   target_link_libraries(executorch_core PUBLIC log)
@@ -524,6 +530,7 @@ install(FILES tools/cmake/executorch-config.cmake
 
 if(EXECUTORCH_BUILD_ARM_BAREMETAL)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm)
+  list(APPEND _executorch_backends executorch_delegate_ethos_u)
 endif()
 
 if(EXECUTORCH_BUILD_CADENCE)
@@ -532,30 +539,37 @@ endif()
 
 if(EXECUTORCH_BUILD_NXP_NEUTRON)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/nxp)
+  list(APPEND _executorch_backends executorch_delegate_neutron)
 endif()
 
 if(EXECUTORCH_BUILD_COREML)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/coreml)
+  list(APPEND _executorch_backends coremldelegate)
 endif()
 
 if(EXECUTORCH_BUILD_MPS)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/mps)
+  list(APPEND _executorch_backends mpsdelegate)
 endif()
 
 if(EXECUTORCH_BUILD_NEURON)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/mediatek)
+  list(APPEND _executorch_backends neuron_backend)
 endif()
 
 if(EXECUTORCH_BUILD_OPENVINO)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/openvino)
+  list(APPEND _executorch_backends openvino_backend)
 endif()
 
 if(EXECUTORCH_BUILD_QNN)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/qualcomm)
+  list(APPEND _executorch_backends qnn_executorch_backend)
 endif()
 
 if(EXECUTORCH_BUILD_XNNPACK)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/xnnpack)
+  list(APPEND _executorch_backends xnnpack_backend)
 endif()
 
 if(EXECUTORCH_BUILD_CORTEX_M)
@@ -568,6 +582,7 @@ endif()
 
 if(EXECUTORCH_BUILD_EXTENSION_APPLE)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/apple)
+  list(APPEND _executorch_extensions apple_extension)
 endif()
 
 if(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER)
@@ -578,6 +593,7 @@ if(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER)
     FILES_MATCHING
     PATTERN "*.h"
   )
+  list(APPEND _executorch_extensions extension_data_loader)
 endif()
 
 if(EXECUTORCH_BUILD_EXTENSION_EVALUE_UTIL)
@@ -592,6 +608,7 @@ endif()
 
 if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/flat_tensor)
+  list(APPEND _executorch_extensions extension_flat_tensor)
 endif()
 
 if(EXECUTORCH_BUILD_EXTENSION_MODULE)
@@ -602,6 +619,7 @@ if(EXECUTORCH_BUILD_EXTENSION_MODULE)
     FILES_MATCHING
     PATTERN "*.h"
   )
+  list(APPEND _executorch_extensions extension_module_static)
 endif()
 
 if(EXECUTORCH_BUILD_EXTENSION_LLM)
@@ -621,14 +639,17 @@ if(EXECUTORCH_BUILD_EXTENSION_LLM)
         ${ORIGINAL_CMAKE_POSITION_INDEPENDENT_CODE_FLAG}
     )
   endif()
+  list(APPEND _executorch_extensions tokenizers)
 endif()
 
 if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/runner)
+  list(APPEND _executorch_extensions extension_llm_runner)
 endif()
 
 if(EXECUTORCH_BUILD_EXTENSION_LLM_APPLE)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/apple)
+  list(APPEND _executorch_extensions extension_llm_apple)
 endif()
 
 if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL)
@@ -639,10 +660,12 @@ if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL)
     FILES_MATCHING
     PATTERN "*.h"
   )
+  list(APPEND _executorch_extensions extension_runner_util)
 endif()
 
 if(EXECUTORCH_BUILD_EXTENSION_TENSOR)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/tensor)
+  list(APPEND _executorch_extensions extension_tensor)
 endif()
 
 if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_CPUINFO)
@@ -749,6 +772,7 @@ endif()
 
 if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/training)
+  list(APPEND _executorch_extensions extension_training)
 endif()
 
 if(EXECUTORCH_BUILD_KERNELS_LLM)
@@ -761,10 +785,32 @@ if(EXECUTORCH_BUILD_KERNELS_QUANTIZED)
   executorch_target_link_options_shared_lib(quantized_ops_lib)
 endif()
 
+# Vulkan: build the backend and record both of its targets as configured
+# backends.
+if(EXECUTORCH_BUILD_VULKAN)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/vulkan)
+  list(APPEND _executorch_backends vulkan_backend)
+  list(APPEND _executorch_backends vulkan_schema)
+endif()
+
+# VGF: built from the backends/arm subdirectory.
+# NOTE(review): backends/arm is also added when EXECUTORCH_BUILD_ARM_BAREMETAL
+# is set; CMake rejects adding the same source directory twice with the same
+# binary directory, so confirm the two options are never enabled together.
+if(EXECUTORCH_BUILD_VGF)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm)
+  list(APPEND _executorch_backends vgf_backend)
+endif()
+
+# Top-level interface targets.
+
+# executorch_backends (alias executorch::backends): linking against it forwards
+# every target collected in _executorch_backends.
+add_library(executorch_backends INTERFACE)
+target_link_libraries(executorch_backends INTERFACE ${_executorch_backends})
+add_library(executorch::backends ALIAS executorch_backends)
+
+# executorch_extensions (alias executorch::extensions): linking against it
+# forwards every target collected in _executorch_extensions.
+add_library(executorch_extensions INTERFACE)
+target_link_libraries(executorch_extensions INTERFACE ${_executorch_extensions})
+add_library(executorch::extensions ALIAS executorch_extensions)
+
 if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
   # Baseline libraries that executor_runner will link against.
   set(_executor_runner_libs executorch extension_evalue_util
-                            extension_runner_util gflags
+                            extension_runner_util gflags executorch_backends
   )
 
   if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
@@ -784,18 +830,10 @@ if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
     list(APPEND _executor_runner_libs $<LINK_LIBRARY:WHOLE_ARCHIVE,custom_ops>)
   endif()
 
-  if(EXECUTORCH_BUILD_XNNPACK)
-    list(APPEND _executor_runner_libs xnnpack_backend)
-  endif()
-
   if(EXECUTORCH_ENABLE_EVENT_TRACER)
     list(APPEND _executor_runner_libs etdump flatccrt)
   endif()
 
-  if(EXECUTORCH_BUILD_COREML AND APPLE)
-    list(APPEND _executor_runner_libs coremldelegate)
-  endif()
-
   add_executable(executor_runner ${_executor_runner__srcs})
   if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
     target_link_options_gc_sections(executor_runner)
@@ -818,14 +856,6 @@ if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
   endif()
 endif()
 
-if(EXECUTORCH_BUILD_VULKAN)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/vulkan)
-endif()
-if(EXECUTORCH_BUILD_VGF)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm)
-endif()
-
-
 if(EXECUTORCH_BUILD_ANDROID_JNI)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/android)
 endif()
 
@@ -122,27 +122,6 @@ executorch_target_link_options_shared_lib(vulkan_backend)
 
 set_property(TARGET vulkan_backend PROPERTY CXX_STANDARD 17)
 
-# Executor Runner
-
-if(NOT CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$")
-  set(VULKAN_RUNNER_SRCS ${_executor_runner__srcs})
-  list(TRANSFORM VULKAN_RUNNER_SRCS PREPEND "${EXECUTORCH_ROOT}/")
-
-  set(VGF_BACKEND )
-  if(EXECUTORCH_BUILD_VGF)
-  set(VGF_BACKEND vgf_backend)
-  endif()
-
-  add_executable(vulkan_executor_runner ${VULKAN_RUNNER_SRCS})
-  target_link_libraries(
-    vulkan_executor_runner ${_executor_runner_libs} vulkan_schema
-    vulkan_backend
-    ${VGF_BACKEND}
-  )
-
-  target_compile_options(vulkan_executor_runner PUBLIC ${VULKAN_CXX_FLAGS})
-endif()
-
 # Test targets
 
 install(
 
@@ -622,9 +622,10 @@ def get_serialized_buffer_index(
         )
 
         external_tag = tensor.meta.get("delegate_constant_tag", None)
-        logging.info(
-            f"Adding constant data with name {tensor.name}, key {named_key} and external_tag {external_tag} to named_data_store"
-        )
+        if external_tag is not None:
+            logging.info(
+                f"Adding constant data with name {tensor.name}, key {named_key} and external_tag {external_tag} to named_data_store"
+            )
         self._named_data_store.add_named_data(
             named_key,
             bytes(array),
 
@@ -92,6 +92,17 @@ def check_graph_closeness(self, graph_a, graph_b):
                 self.assertEqual(
                     node_a.meta.get("debug_handle"), node_b.meta.get("debug_handle")
                 )
+                from_node_a = node_a.meta.get("from_node")
+                from_node_b = node_b.meta.get("from_node")
+
+                if from_node_a is None:
+                    self.assertIsNone(from_node_b)
+                else:
+                    self.assertIsNotNone(from_node_b)
+                    for node_source_a, node_source_b in zip(from_node_a, from_node_b):
+                        self.assertEqual(
+                            node_source_a.to_dict(), node_source_b.to_dict()
+                        )
 
     def test_etrecord_generation(self):
         captured_output, edge_output, et_output = self.get_test_model()
 
@@ -239,6 +239,18 @@ def build_args_parser() -> argparse.ArgumentParser:
         help="checkpoint directory. Use with a sharded checkpoint, not for the standard llama2 model. Note, checkpoint_dir takes precedence over checkpoint if both are set.",
     )
 
+    parser.add_argument(
+        "--adapter_checkpoint",
+        required=False,
+        help="Path to the adapter.pt file from torchtune. Used if the model has trained LoRA adapters. Must provide adapter_config.json",
+    )
+
+    parser.add_argument(
+        "--adapter_config",
+        required=False,
+        help="Path to the adapter_config.json file. Used if the model has trained LoRA adapters. Must provide adapter_checkpoint.",
+    )
+
     parser.add_argument(
         "--use_qnn_sha",
         action="store_true",
 
@@ -46,6 +46,13 @@ def __init__(self, llm_config: Optional[LlmConfig] = None):
         checkpoint_dir = self.llm_config.base.checkpoint_dir
         params_path = self.llm_config.base.params
 
+        # Adapter checkpoint and config.
+        adapter_checkpoint_path = self.llm_config.base.adapter_checkpoint
+        adapter_config_path = self.llm_config.base.adapter_config
+        assert (adapter_checkpoint_path is None and adapter_config_path is None) or (
+            adapter_checkpoint_path is not None and adapter_config_path is not None
+        ), "Both adapter_checkpoint_path and adapter_config_path must be specified or neither must be specified."
+
         self.use_kv_cache = self.llm_config.model.use_kv_cache
         self.use_sdpa_with_kv_cache_op = self.llm_config.model.use_sdpa_with_kv_cache
         self.generate_full_logits = self.llm_config.debug.generate_full_logits
@@ -129,6 +136,20 @@ def __init__(self, llm_config: Optional[LlmConfig] = None):
             with open(params_path, "r") as f:
                 params = json.loads(f.read())
 
+        # Get adapter checkpoint and config.
+        adapter_checkpoint = {}
+        adapter_config = {}
+        if adapter_checkpoint_path:
+            adapter_checkpoint = torch.load(
+                adapter_checkpoint_path, map_location=device, mmap=True
+            )
+            from torchtune.models import convert_weights
+
+            adapter_checkpoint = convert_weights.tune_to_meta(adapter_checkpoint)
+            with open(adapter_config_path, "r") as f:
+                adapter_config = json.loads(f.read())
+            checkpoint.update(adapter_checkpoint)
+
         output_prune_map = None
         if self.output_prune_map_path is not None:
             with open(self.output_prune_map_path, "r") as f:
@@ -153,6 +174,7 @@ def __init__(self, llm_config: Optional[LlmConfig] = None):
             output_prune_map=output_prune_map,
             enable_dynamic_shape=self.enable_dynamic_shape,
             **params,
+            **adapter_config,
         )
 
         if model_args.use_scaled_rope:
 
@@ -59,7 +59,7 @@ class ModelArgs:
     lora_args: Optional[dict] = None
 
     # LoRA arguments to set up a LoRA inference model.
-    # These arguments come directly from a torchtune LoRA config.
+    # These arguments come directly from a torchtune adapter_config.json file.
     r: Optional[int] = None  # Rank.
     lora_alpha: Optional[int] = None  # Alpha.
     # Eg. q_proj, k_proj, v_proj, output_proj
 
@@ -602,6 +602,39 @@ class StaticAttentionIOManager {
     }
   }
 
+  /**
+   * Prefill helper. Feeds `tokens` to `method` in chunks of at most
+   * `input_buffer.size()` tokens: each chunk is copied to the front of
+   * `input_buffer`, the method is prepared and executed, and the caches are
+   * updated with the number of tokens in that chunk.
+   *
+   * @param tokens Prompt tokens to prefill. Must not be empty.
+   * @param input_buffer Buffer that receives each chunk before the method is
+   *     run; its size is the per-inference input length. Must not be empty.
+   * @param method Method executed once per chunk; execution failure is fatal.
+   * @return The position in the output that corresponds to the end of the
+   *     prompt during the last inference (length of the last chunk minus one).
+   */
+  template <typename TokenT>
+  size_t prefill(
+      executorch::runtime::Span<TokenT> tokens,
+      executorch::runtime::Span<TokenT> input_buffer,
+      executorch::runtime::Method& method) {
+    const size_t input_len = input_buffer.size();
+    // A zero-length input buffer would never advance the chunk loop, and an
+    // empty prompt would make the final `batch_len - 1` wrap around to
+    // SIZE_MAX, so reject both up front.
+    ET_CHECK(input_len > 0);
+    ET_CHECK(tokens.size() > 0);
+    get_mask(input_len).set_causal_mask();
+
+    size_t batch_len = 0;
+    for (size_t i = 0; i < tokens.size(); i += input_len) {
+      // The last chunk may be shorter than the input length.
+      batch_len = std::min(input_len, tokens.size() - i);
+      // Use pointer arithmetic on begin() rather than `&tokens[i + batch_len]`
+      // so the span is never indexed one past its end.
+      std::copy(
+          tokens.begin() + i,
+          tokens.begin() + i + batch_len,
+          input_buffer.begin());
+      prepare(method);
+      ET_CHECK(method.execute() == executorch::runtime::Error::Ok);
+      update(
+          method,
+          config_.k_cache_output_indices,
+          config_.v_cache_output_indices,
+          batch_len);
+    }
+    return batch_len - 1;
+  }
+
+  /**
+   * Decode helper. The `sample` argument is called after each inference and
+   * should retrieve the logits from the `method` argument's output and return
+   * the sampled token.
+   */
   template <typename TokenT>
   std::vector<TokenT> decode(
       TokenT prev_tok,
@@ -632,6 +665,11 @@ class StaticAttentionIOManager {
     return generated_tokens;
   }
 
+  /**
+   * Lookahead decode helper. The `sample` argument is called after each
+   * inference and should retrieve the logits from the `method` argument's
+   * output and return the sampled token for all output positions.
+   */
   template <typename TokenT>
   std::vector<TokenT> lookahead_decode(
       TokenT prev_tok,
 
@@ -87,14 +87,14 @@
 
     model = model.eval()
     # pre-autograd export. eventually this will become torch.export
-    ep = torch.export.export_for_training(model, example_inputs, strict=True)
+    ep = torch.export.export_for_training(model, example_inputs, strict=False)
     model = ep.module()
 
     if args.quantize:
         logging.info("Quantizing Model...")
         # TODO(T165162973): This pass shall eventually be folded into quantizer
         model = quantize(model, example_inputs, quant_type)
-        ep = torch.export.export_for_training(model, example_inputs, strict=True)
+        ep = torch.export.export_for_training(model, example_inputs, strict=False)
 
     edge = to_edge_transform_and_lower(
         ep,