Commit 7ee03fc

Merge remote-tracking branch 'origin/main' into use-executorch-core
2 parents: cd26efa + e60958a

15 files changed: +267 additions, -28 deletions

examples/models/llama/llama_transformer.py

Lines changed: 60 additions & 13 deletions
@@ -13,6 +13,7 @@
 import torch.nn.functional as F
 
 from executorch.examples.models.llama.attention import (
+    Attention,
     ATTENTION_REGISTRY,
     ForwardOptions,
 )
@@ -83,26 +84,46 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 
 
 class TransformerBlock(nn.Module):
-    def __init__(self, layer_id: int, args: ModelArgs, rope: Rope):
+    def __init__(self, args: ModelArgs, attention: Attention):
+        """
+        Transformer block with support for pre-norm and post-norm.
+        Args:
+            args (ModelArgs): model configuration parameters.
+            attention (Attention): attention object to use in the transformer
+                block. See `attention.py` for types of attention. Make sure
+                the attention type is registered in the ATTENTION_REGISTRY.
+        """
         super().__init__()
         self.use_kv_cache = args.use_kv_cache
         self.n_heads = args.n_heads
         self.dim = args.dim
         self.head_dim = args.head_dim
-        if args.attention_type not in ATTENTION_REGISTRY:
-            raise ValueError(
-                f"Unknown attention type: {args.attention_type}. "
-                f"Available: {list(ATTENTION_REGISTRY.keys())}"
-            )
-        cls = ATTENTION_REGISTRY[args.attention_type]
-        self.attention = cls(args, layer_id, rope)
+        self.attention = attention
         if args.moe:
             self.block_sparse_moe = MOEFeedForward(args)
         else:
             self.feed_forward = FeedForward(args)
         self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
         self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)
 
+    @classmethod
+    def from_type(cls, layer_id, args, rope) -> "TransformerBlock":
+        """
+        Create a TransformerBlock with the legacy constructor.
+        Args:
+            layer_id (int): the index of the layer.
+            args (ModelArgs): model configuration parameters.
+            rope (Rope): the rope object to use for rotary embeddings.
+        """
+        if args.attention_type not in ATTENTION_REGISTRY:
+            raise ValueError(
+                f"Unknown attention type: {args.attention_type}. "
+                f"Available: {list(ATTENTION_REGISTRY.keys())}"
+            )
+        cls = ATTENTION_REGISTRY[args.attention_type]
+        attention = cls(args, layer_id, rope)
+        return TransformerBlock(args, attention)
+
     def forward(self, x, freqs_cos, freqs_sin, attn_options: ForwardOptions):  # x: 1xN
         h, attn_options_update = self.attention.forward(
             self.attention_norm(x), freqs_cos, freqs_sin, **attn_options
@@ -117,7 +138,15 @@ def forward(self, x, freqs_cos, freqs_sin, attn_options: ForwardOptions): # x:
 
 
 class Transformer(nn.Module):
-    def __init__(self, params: ModelArgs):
+    def __init__(self, params: ModelArgs, layers: nn.ModuleList, rope: Rope):
+        """
+        Transformer model.
+        Args:
+            params (ModelArgs): model configuration parameters.
+            layers (nn.ModuleList): list of transformer blocks - see the
+                `TransformerBlock` type above.
+            rope (Rope): the rope object to use for rotary embeddings.
+        """
         super().__init__()
         self.params = params
         self.vocab_size = params.vocab_size
@@ -130,10 +159,8 @@ def __init__(self, params: ModelArgs):
             if self.apply_embedding
             else None
         )
-        self.rope = Rope(params)
-        self.layers = torch.nn.ModuleList()
-        for layer_id in range(params.n_layers):
-            self.layers.append(TransformerBlock(layer_id, params, self.rope))
+        self.layers = layers
+        self.rope = rope
         self.norm = RMSNorm(params.dim, eps=params.norm_eps)
         self.output = (
             nn.Linear(params.dim, params.vocab_size, bias=False)
@@ -212,3 +239,23 @@ def forward(
             return logits, attn_options_update
 
         return logits
+
+
+def construct_transformer(model_args: ModelArgs) -> Transformer:
+    """
+    Construct a Transformer model from the given model arguments.
+    """
+    rope = Rope(model_args)
+    if model_args.attention_type not in ATTENTION_REGISTRY:
+        raise ValueError(
+            f"Unknown attention type: {model_args.attention_type}. "
+            f"Available: {list(ATTENTION_REGISTRY.keys())}"
+        )
+    layers = torch.nn.ModuleList()
+    cls = ATTENTION_REGISTRY[model_args.attention_type]
+    for layer_id in range(model_args.n_layers):
+        attention = cls(model_args, layer_id, rope)
+        transformer_block = TransformerBlock(model_args, attention)
+        layers.append(transformer_block)
+
+    return Transformer(model_args, layers, rope)
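
Net effect of this file's changes: attention construction moves out of TransformerBlock.__init__ and into the new construct_transformer factory, with TransformerBlock.from_type kept as the legacy-style path. A minimal usage sketch of the new entry points; the ModelArgs field values below are illustrative only, not prescribed by this commit:

from executorch.examples.models.llama.llama_transformer import (
    TransformerBlock,
    construct_transformer,
)
from executorch.examples.models.llama.model_args import ModelArgs
from executorch.examples.models.llama.rope import Rope

args = ModelArgs(n_layers=4, vocab_size=128)  # illustrative values

# Preferred path: build the whole model, including per-layer attention, in one call.
model = construct_transformer(args).eval()

# Legacy-style path: build a single block; from_type looks up args.attention_type
# in ATTENTION_REGISTRY and constructs the attention object internally.
rope = Rope(args)
block = TransformerBlock.from_type(layer_id=0, args=args, rope=rope)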

examples/models/llama/model.py

Lines changed: 3 additions & 2 deletions
@@ -15,9 +15,10 @@
     get_checkpoint_dtype,
     get_default_model_resource_dir,
 )
-from executorch.examples.models.llama.llama_transformer import Transformer
 
+from executorch.examples.models.llama.llama_transformer import construct_transformer
 from executorch.examples.models.llama.model_args import ModelArgs
+from executorch.examples.models.llama.rope import Rope
 from torchao.utils import TorchAOBaseTensor
 
 try:
@@ -174,7 +175,7 @@ def __init__(self, **kwargs):
         # They possess all other metadata a tensor carries such as size, stride, requires_grad.
         with torch.device("meta"):
             # Model itself is loaded in default dtype, fp32.
-            self.model_ = Transformer(model_args)
+            self.model_ = construct_transformer(model_args)
         # Get checkpoint dtype.
         if checkpoint:
             self.model_.checkpoint_dtype = get_checkpoint_dtype(checkpoint)
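
The call-site pattern above (constructing under torch.device("meta")) keeps parameters as metadata-only tensors until a real checkpoint is loaded. A hedged sketch of that pattern in isolation; ModelArgs() defaults stand in for the real configuration that model.py derives elsewhere:

import torch
from executorch.examples.models.llama.llama_transformer import construct_transformer
from executorch.examples.models.llama.model_args import ModelArgs

model_args = ModelArgs()  # placeholder config for illustration
with torch.device("meta"):
    # Parameters carry size/stride/dtype metadata but no real storage; the
    # actual weights are filled in later when the checkpoint state_dict loads.
    model = construct_transformer(model_args)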

examples/models/llama/tests/test_pre_quantization_transforms.py

Lines changed: 5 additions & 2 deletions
@@ -7,7 +7,10 @@
 import unittest
 
 import torch
-from executorch.examples.models.llama.llama_transformer import Transformer
+from executorch.examples.models.llama.llama_transformer import (
+    construct_transformer,
+    Transformer,
+)
 from executorch.examples.models.llama.model_args import ModelArgs
 from executorch.examples.models.llama.source_transformation.pre_quantization import (
     sanitize_checkpoint_from_pre_quantization,
@@ -39,7 +42,7 @@ def _prepare_dummy_model(self) -> Transformer:
             vocab_size=32000,
         )
 
-        model = Transformer(model_args)
+        model = construct_transformer(model_args)
 
         return model
 

examples/models/llama/tests/test_static_attention.py

Lines changed: 3 additions & 3 deletions
@@ -2,7 +2,7 @@
 
 import torch
 from executorch.examples.models.llama.attention import AttentionMHA, ForwardOptions
-from executorch.examples.models.llama.llama_transformer import Transformer
+from executorch.examples.models.llama.llama_transformer import construct_transformer
 from executorch.examples.models.llama.model_args import ModelArgs
 from executorch.examples.models.llama.rope import Rope
 from executorch.examples.models.llama.static_attention import (
@@ -160,10 +160,10 @@ def test_within_transformer(self):
             n_layers=4,
             vocab_size=128,
         )
-        mha_transformer = Transformer(config).eval()
+        mha_transformer = construct_transformer(config).eval()
 
         config.attention_type = "static"
-        static_transformer = Transformer(config).eval()
+        static_transformer = construct_transformer(config).eval()
         static_transformer.load_state_dict(mha_transformer.state_dict(), strict=False)
         for mha_layer, static_layer in zip(
             mha_transformer.layers, static_transformer.layers

examples/models/llava/model.py

Lines changed: 2 additions & 2 deletions
@@ -12,7 +12,7 @@
 
 import requests
 import torch
-from executorch.examples.models.llama.llama_transformer import Transformer
+from executorch.examples.models.llama.llama_transformer import construct_transformer
 from executorch.examples.models.llama.model_args import ModelArgs
 
 from executorch.examples.models.llama.source_transformation.custom_kv_cache import (
@@ -66,7 +66,7 @@ def __init__(
             use_hf_rope=True,
             max_seq_len=max_seq_len,
         )
-        self.text_model = Transformer(self.text_model_args)
+        self.text_model = construct_transformer(self.text_model_args)
         # use custom op for SDPA.
         if use_sdpa_with_kv_cache_op:
             self.text_model = replace_kv_cache_with_custom_kv_cache(self.text_model)

exir/backend/test/demos/rpc/ExecutorBackend.cpp

Lines changed: 8 additions & 2 deletions
@@ -18,6 +18,7 @@
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/evalue.h>
 #include <executorch/runtime/core/exec_aten/util/tensor_util.h>
+#include <executorch/runtime/core/named_data_map.h>
 #include <executorch/runtime/executor/method.h>
 #include <executorch/runtime/executor/program.h>
 
@@ -37,6 +38,7 @@ using ::executorch::runtime::MemoryAllocator;
 using ::executorch::runtime::MemoryManager;
 using ::executorch::runtime::Method;
 using ::executorch::runtime::MethodMeta;
+using ::executorch::runtime::NamedDataMap;
 using ::executorch::runtime::Program;
 using ::executorch::runtime::Result;
 using ::executorch::runtime::Span;
@@ -156,9 +158,13 @@ class ExecutorBackend final : public ::executorch::runtime::BackendInterface {
     new (client_memory_manager)
         MemoryManager(client_method_allocator, client_planned_memory);
 
+    const NamedDataMap* named_data_map = context.get_named_data_map();
     // Construct the client Method
-    Result<Method> method_res =
-        client_program->load_method("forward", client_memory_manager);
+    Result<Method> method_res = client_program->load_method(
+        "forward",
+        client_memory_manager,
+        /*event_tracer=*/nullptr,
+        named_data_map);
     if (!method_res.ok()) {
       ET_LOG(
           Error,

exir/backend/test/demos/rpc/TARGETS

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@ runtime.python_library(
     ],
     visibility = [
         "//executorch/exir/backend/test/...",
+        "//executorch/test/...",
     ],
     deps = [
         "//caffe2:torch",

exir/backend/test/demos/rpc/executor_backend_preprocess.py

Lines changed: 7 additions & 1 deletion
@@ -8,6 +8,8 @@
 
 from typing import final, List
 
+from executorch.exir import ExecutorchBackendConfig
+
 from executorch.exir.backend.backend_details import (
     BackendDetails,
     ExportedProgram,
@@ -24,10 +26,14 @@ def preprocess(
         edge_program: ExportedProgram,
         compile_specs: List[CompileSpec],
     ) -> PreprocessResult:
+        config = ExecutorchBackendConfig()
+        for spec in compile_specs:
+            if spec.key == "external_constants":
+                config.external_constants = True
         return PreprocessResult(
             processed_bytes=EdgeProgramManager(
                 edge_programs=edge_program,
             )
-            .to_executorch()
+            .to_executorch(config)
             .buffer,
         )
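
For context, the "external_constants" compile spec consumed above would be supplied by whoever lowers a program to this demo backend. A hedged sketch, assuming the usual CompileSpec(key, value) schema from executorch.exir.backend; the preprocess only checks the key, so the value bytes are not interpreted:

from executorch.exir.backend.compile_spec_schema import CompileSpec

# Ask the demo backend to emit constants externally; only the key matters here.
compile_specs = [CompileSpec("external_constants", b"")]

# Hypothetical call site (backend id and program variable are placeholders):
# lowered = to_backend("ExecutorBackend", edge_program, compile_specs)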

exir/backend/test/demos/rpc/targets.bzl

Lines changed: 1 addition & 0 deletions
@@ -40,6 +40,7 @@ def define_common_targets():
         ],
         visibility = [
             "//executorch/exir/backend/test/...",
+            "//executorch/runtime/executor/test/...",
         ],
         deps = [
             ":executor_backend",

runtime/executor/method.cpp

Lines changed: 3 additions & 0 deletions
@@ -329,6 +329,8 @@ Result<size_t> Method::get_num_external_constants() {
 }
 
 Error Method::parse_external_constants(const NamedDataMap* named_data_map) {
+  ET_CHECK_OR_RETURN_ERROR(
+      named_data_map != nullptr, InvalidState, "named_data_map is null");
   auto flatbuffer_values = serialization_plan_->values();
   size_t n_value = flatbuffer_values->size();
 
@@ -372,6 +374,7 @@ Error Method::parse_external_constants(const NamedDataMap* named_data_map) {
   Result<const TensorLayout> tensor_layout =
       named_data_map->get_metadata(key);
   if (!tensor_layout.ok()) {
+    ET_LOG(Info, "Failed to get metadata for key %s", key);
     return tensor_layout.error();
   }
   // Check external tensor compatibility.
