Commit 1b4968f

Extend PyBundledModule with extension.BundledModule
Differential Revision: D78938344
Pull Request resolved: #12839
Parent: ecb639a

File tree: 8 files changed (+158, −126 lines)

CMakeLists.txt

Lines changed: 40 additions & 4 deletions
@@ -423,6 +423,12 @@ if(MAX_KERNEL_NUM)
   )
 endif()
 
+# Build devtools first if needed - some backends depend on protobuf from
+# devtools
+if(EXECUTORCH_BUILD_DEVTOOLS)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/devtools)
+endif()
+
 if(EXECUTORCH_BUILD_PYBIND AND APPLE)
   # shared version
   add_library(executorch_core_shared SHARED ${_executorch_core__srcs})
@@ -588,10 +594,6 @@ if(EXECUTORCH_BUILD_CORTEX_M)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cortex_m)
 endif()
 
-if(EXECUTORCH_BUILD_DEVTOOLS)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/devtools)
-endif()
-
 if(EXECUTORCH_BUILD_EXTENSION_APPLE)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/apple)
   list(APPEND _executorch_extensions apple_extension)
@@ -756,6 +758,30 @@ if(EXECUTORCH_BUILD_PYBIND)
     add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/devtools)
   endif()
 
+  # Create bundled_module target only for pybindings when bundled_program exists
+  # This target has hard dependencies on devtools generated headers
+  if(TARGET bundled_program)
+    add_library(
+      bundled_module STATIC
+      ${CMAKE_CURRENT_SOURCE_DIR}/extension/module/bundled_module.cpp
+    )
+
+    # Ensure bundled_module waits for bundled_program's generated headers
+    add_dependencies(bundled_module bundled_program)
+
+    target_link_libraries(bundled_module PRIVATE extension_data_loader)
+    target_link_libraries(
+      bundled_module PUBLIC extension_module_static bundled_program
+    )
+
+    target_include_directories(
+      bundled_module PUBLIC ${_common_include_directories}
+    )
+    target_compile_options(
+      bundled_module PUBLIC -Wno-deprecated-declarations -fPIC
+    )
+  endif()
+
   # find pytorch lib, to allow pybind to take at::Tensor as input/output
   find_package_torch()
   find_library(
@@ -773,6 +799,16 @@ if(EXECUTORCH_BUILD_PYBIND)
     torch
   )
 
+  if(EXECUTORCH_BUILD_EXTENSION_MODULE)
+    # Always use static linking for pybindings to avoid runtime symbol
+    # resolution issues
+    list(APPEND _dep_libs extension_module_static)
+    # Add bundled_module if available
+    if(TARGET bundled_module)
+      list(APPEND _dep_libs bundled_module)
+    endif()
+  endif()
+
   if(EXECUTORCH_BUILD_TESTS)
     list(APPEND _dep_libs test_backend_compiler_lib)
   endif()

devtools/bundled_program/test/test_end2end.py

Lines changed: 1 addition & 30 deletions
@@ -5,21 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 
 # flake8: noqa: F401
-import functools
-import inspect
-import os
-import random
 import unittest
-from typing import Callable, Dict, Optional, Tuple, Type
-
-import executorch.exir as exir
-
-import executorch.exir.control_flow as control_flow
-
-# @manual=//executorch/extension/pytree:pybindings
-import executorch.extension.pytree as pytree
-
-import torch
 
 from executorch.devtools.bundled_program.core import BundledProgram
 from executorch.devtools.bundled_program.serialize import (
@@ -35,8 +21,6 @@
 try:
     from executorch.extension.pybindings.portable_lib import (
         _load_bundled_program_from_buffer,
-        _load_for_executorch_from_buffer,
-        _load_for_executorch_from_bundled_program,
     )
 
     kernel_mode = "lean"
@@ -47,8 +31,6 @@
 try:
     from executorch.extension.pybindings.aten_lib import (  # @manual=//executorch/extension/pybindings:aten_lib
         _load_bundled_program_from_buffer,
-        _load_for_executorch_from_buffer,
-        _load_for_executorch_from_bundled_program,
     )
 
     assert kernel_mode is None
@@ -75,19 +57,8 @@ def test_sample_model_e2e(self):
             bundled_program_buffer
         )
 
-        executorch_module = _load_for_executorch_from_bundled_program(
-            executorch_bundled_program
-        )
-
         for method_name in eager_model.method_names:
-            executorch_module.load_bundled_input(
-                executorch_bundled_program,
-                method_name,
-                0,
-            )
-            executorch_module.plan_execute(method_name)
-            executorch_module.verify_result_with_bundled_expected_output(
-                executorch_bundled_program,
+            executorch_bundled_program.verify_result_with_bundled_expected_output(
                 method_name,
                 0,
             )
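To make the behavior change concrete, here is a minimal sketch (not part of the commit) of the flow the updated test now exercises. It assumes `bundled_program_buffer` holds a serialized bundled program and that the method names passed in exist in the bundle; the calls removed above appear only as comments for contrast.

from executorch.extension.pybindings.portable_lib import (
    _load_bundled_program_from_buffer,
)


def verify_all_methods(bundled_program_buffer: bytes, method_names) -> None:
    # New flow: loading returns a BundledModule that executes and verifies
    # itself; no separate ExecuTorchModule, load_bundled_input(), or
    # plan_execute() calls are needed.
    bundled_program = _load_bundled_program_from_buffer(bundled_program_buffer)
    for method_name in method_names:
        # Runs the method against bundled test set 0 and checks the outputs
        # against the bundled expected outputs, raising on mismatch.
        bundled_program.verify_result_with_bundled_expected_output(method_name, 0)

    # Old flow (removed by this commit), for contrast:
    #   module = _load_for_executorch_from_bundled_program(bundled_program)
    #   module.load_bundled_input(bundled_program, method_name, 0)
    #   module.plan_execute(method_name)
    #   module.verify_result_with_bundled_expected_output(
    #       bundled_program, method_name, 0)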

extension/pybindings/README.md

Lines changed: 1 addition & 2 deletions
@@ -27,8 +27,6 @@ CMAKE_ARGS="-DEXECUTORCH_BUILD_MPS=ON" ./install_executorch.sh
 - `_reset_profile_results()`: Reset profile results.
 ## Classes
 ### ExecuTorchModule
-- `load_bundled_input()`: Load bundled input.
-- `verify_result_with_bundled_expected_output(bundle: str, method_name: str, testset_idx: int, rtol: float = 1e-5, atol: float = 1e-8)`: Verify result with bundled expected output.
 - `plan_execute()`: Plan and execute.
 - `run_method()`: Run method.
 - `forward()`: Forward. This takes a pytree-flattend PyTorch-tensor-based input.
@@ -37,5 +35,6 @@ CMAKE_ARGS="-DEXECUTORCH_BUILD_MPS=ON" ./install_executorch.sh
 - `__call__()`: Call method.
 ### BundledModule
 This class is currently empty and serves as a placeholder for future methods and attributes.
+- `verify_result_with_bundled_expected_output(method_name: str, testset_idx: int, rtol: float = 1e-5, atol: float = 1e-8)`: Verify result with bundled expected output.
 ## Note
 All functions and methods are guarded by a call guard that redirects `cout` and `cerr` to the Python environment.
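A hedged usage sketch for the `BundledModule` method documented above; `bundled_module` is assumed to come from `_load_bundled_program_from_buffer()` and "forward" is only an example method name. Keyword arguments are accepted because the pybind registration (see the pybindings.cpp diff below) declares `py::arg(...)` for each parameter.

# Defaults (rtol=1e-5, atol=1e-8) match the documented signature; override
# them explicitly if the bundled expected outputs need looser tolerances.
outputs = bundled_module.verify_result_with_bundled_expected_output(
    method_name="forward",  # assumed example method name
    testset_idx=0,
    rtol=1e-3,
    atol=1e-6,
)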

extension/pybindings/pybindings.cpp

Lines changed: 82 additions & 90 deletions
@@ -23,6 +23,7 @@
 #include <executorch/extension/data_loader/buffer_data_loader.h>
 #include <executorch/extension/data_loader/mmap_data_loader.h>
 #include <executorch/extension/memory_allocator/malloc_memory_allocator.h>
+#include <executorch/extension/module/bundled_module.h>
 #include <executorch/extension/threadpool/threadpool.h>
 #include <executorch/runtime/backend/interface.h>
 #include <executorch/runtime/core/data_loader.h>
@@ -81,6 +82,7 @@ using ::executorch::ET_RUNTIME_NAMESPACE::Program;
 using ::executorch::extension::BufferDataLoader;
 using ::executorch::extension::MallocMemoryAllocator;
 using ::executorch::extension::MmapDataLoader;
+using ::executorch::extension::ET_BUNDLED_MODULE_NAMESPACE::BundledModule;
 using ::executorch::runtime::ArrayRef;
 using ::executorch::runtime::DataLoader;
 using ::executorch::runtime::Error;
@@ -425,13 +427,54 @@ inline std::unique_ptr<Module> load_module_from_file(
       program_verification);
 }
 
+inline py::list get_outputs_as_py_list(
+    const std::vector<EValue>& outputs,
+    bool clone_outputs = true) {
+  const auto outputs_size = outputs.size();
+  py::list list(outputs_size);
+  for (size_t i = 0; i < outputs_size; ++i) {
+    auto& v = outputs[i];
+    if (Tag::None == v.tag) {
+      list[i] = py::none();
+    } else if (Tag::Int == v.tag) {
+      list[i] = py::cast(v.toInt());
+    } else if (Tag::Double == v.tag) {
+      list[i] = py::cast(v.toDouble());
+    } else if (Tag::Bool == v.tag) {
+      list[i] = py::cast(v.toBool());
+    } else if (Tag::String == v.tag) {
+      list[i] = py::cast(std::string(v.toString().data()));
+    } else if (Tag::Tensor == v.tag) {
+#ifdef USE_ATEN_LIB
+      // Clone so the outputs in python do not share a lifetime with the
+      // module object
+      if (clone_outputs) {
+        list[i] = py::cast(v.toTensor().clone());
+      } else {
+        list[i] = py::cast(v.toTensor());
+      }
+#else
+      if (clone_outputs) {
+        list[i] = py::cast(alias_attensor_to_etensor(v.toTensor()).clone());
+      } else {
+        list[i] = py::cast(alias_attensor_to_etensor(v.toTensor()));
+      }
+#endif
+    } else {
+      ET_ASSERT_UNREACHABLE_MSG("Invalid model output type");
+    }
+  }
+  return list;
+}
+
 static constexpr size_t kDEFAULT_BUNDLED_INPUT_POOL_SIZE = 16 * 1024U;
 
-struct PyBundledModule final {
+struct PyBundledModule : public BundledModule {
   explicit PyBundledModule(
       const py::bytes& buffer,
       uint32_t bundled_input_pool_size)
-      : bundled_program_ptr_(buffer),
+      : BundledModule(buffer.cast<std::string_view>().data()),
+        bundled_program_ptr_(buffer),
         program_ptr_(static_cast<const void*>(
             bundled_program_flatbuffer::GetBundledProgram(
                 get_bundled_program_ptr())
@@ -460,6 +503,33 @@ struct PyBundledModule final {
     return program_len_;
   }
 
+  py::list verify_result_with_bundled_expected_output(
+      const std::string& method_name,
+      size_t testset_idx,
+      double rtol = 1e-5,
+      double atol = 1e-8) {
+    // Execute the method
+    auto result = BundledModule::execute(method_name, testset_idx);
+    if (!result.ok()) {
+      THROW_IF_ERROR(
+          result.error(),
+          "Method execution failed with status 0x%" PRIx32,
+          static_cast<uint32_t>(result.error()));
+    }
+
+    // Convert outputs to py::list
+    const auto& outputs = result.get();
+    py::list py_outputs = get_outputs_as_py_list(outputs);
+
+    Error status = BundledModule::verify_method_outputs(
+        method_name, testset_idx, rtol, atol);
+    THROW_IF_ERROR(
+        status,
+        "Result verification failed with status %" PRIu32,
+        static_cast<uint32_t>(status));
+    return py_outputs;
+  }
+
  private:
   // Store the bytes object instead of a raw pointer so that this module will
   // keep the bytes alive.
@@ -853,43 +923,6 @@ struct PyModule final {
     }
   }
 
-  void load_bundled_input(
-      PyBundledModule& m,
-      const std::string method_name,
-      size_t testset_idx) {
-    const void* bundled_program_ptr = m.get_bundled_program_ptr();
-    Error status = executorch::BUNDLED_PROGRAM_NAMESPACE::load_bundled_input(
-        module_->get_method(method_name), bundled_program_ptr, testset_idx);
-    THROW_IF_ERROR(
-        status,
-        "load_bundled_input failed with status 0x%" PRIx32,
-        static_cast<uint32_t>(status));
-  }
-
-  py::list verify_result_with_bundled_expected_output(
-      PyBundledModule& m,
-      const std::string method_name,
-      size_t testset_idx,
-      double rtol = 1e-5,
-      double atol = 1e-8) {
-    const void* bundled_program_ptr = m.get_bundled_program_ptr();
-    auto& method = module_->get_method(method_name);
-    Error status = executorch::BUNDLED_PROGRAM_NAMESPACE::load_bundled_input(
-        method, bundled_program_ptr, testset_idx);
-    THROW_IF_ERROR(
-        status,
-        "load_bundled_input failed with status 0x%" PRIx32,
-        static_cast<uint32_t>(status));
-    py::list outputs = plan_execute(method_name);
-    status = executorch::BUNDLED_PROGRAM_NAMESPACE::verify_method_outputs(
-        method, bundled_program_ptr, testset_idx, rtol, atol);
-    THROW_IF_ERROR(
-        status,
-        "Result verification failed with status %" PRIu32,
-        static_cast<uint32_t>(status));
-    return outputs;
-  }
-
   py::list plan_execute(
       const std::string method_name,
       bool clone_outputs = true) {
@@ -912,46 +945,6 @@ struct PyModule final {
     return get_outputs_as_py_list(outputs, clone_outputs);
   }
 
-  py::list get_outputs_as_py_list(
-      const std::vector<EValue>& outputs,
-      bool clone_outputs = true) {
-    const auto outputs_size = outputs.size();
-    py::list list(outputs_size);
-    for (size_t i = 0; i < outputs_size; ++i) {
-      auto& v = outputs[i];
-      if (Tag::None == v.tag) {
-        list[i] = py::none();
-      } else if (Tag::Int == v.tag) {
-        list[i] = py::cast(v.toInt());
-      } else if (Tag::Double == v.tag) {
-        list[i] = py::cast(v.toDouble());
-      } else if (Tag::Bool == v.tag) {
-        list[i] = py::cast(v.toBool());
-      } else if (Tag::String == v.tag) {
-        list[i] = py::cast(std::string(v.toString().data()));
-      } else if (Tag::Tensor == v.tag) {
-#ifdef USE_ATEN_LIB
-        // Clone so the outputs in python do not share a lifetime with the
-        // module object
-        if (clone_outputs) {
-          list[i] = py::cast(v.toTensor().clone());
-        } else {
-          list[i] = py::cast(v.toTensor());
-        }
-#else
-        if (clone_outputs) {
-          list[i] = py::cast(alias_attensor_to_etensor(v.toTensor()).clone());
-        } else {
-          list[i] = py::cast(alias_attensor_to_etensor(v.toTensor()));
-        }
-#endif
-      } else {
-        ET_ASSERT_UNREACHABLE_MSG("Invalid model output type");
-      }
-    }
-    return list;
-  }
-
   std::unique_ptr<PyMethodMeta> method_meta(const std::string method_name) {
     auto& method = module_->get_method(method_name);
     return std::make_unique<PyMethodMeta>(module_, method.method_meta());
@@ -1583,16 +1576,6 @@ PYBIND11_MODULE(EXECUTORCH_PYTHON_MODULE_NAME, m) {
       call_guard);
 
   py::class_<PyModule>(m, "ExecuTorchModule")
-      .def("load_bundled_input", &PyModule::load_bundled_input, call_guard)
-      .def(
-          "verify_result_with_bundled_expected_output",
-          &PyModule::verify_result_with_bundled_expected_output,
-          py::arg("bundle"),
-          py::arg("method_name"),
-          py::arg("testset_idx"),
-          py::arg("rtol") = 1e-5,
-          py::arg("atol") = 1e-8,
-          call_guard)
       .def(
           "plan_execute",
           &PyModule::plan_execute,
@@ -1638,7 +1621,16 @@ PYBIND11_MODULE(EXECUTORCH_PYTHON_MODULE_NAME, m) {
       py::arg("clone_outputs") = true,
       call_guard);
 
-  py::class_<PyBundledModule>(m, "BundledModule");
+  py::class_<PyBundledModule>(m, "BundledModule")
+      .def(
+          "verify_result_with_bundled_expected_output",
+          &PyBundledModule::verify_result_with_bundled_expected_output,
+          py::arg("method_name"),
+          py::arg("testset_idx"),
+          py::arg("rtol") = 1e-5,
+          py::arg("atol") = 1e-8,
+          call_guard);
+
   py::class_<PyTensorInfo>(m, "TensorInfo")
       .def("sizes", &PyTensorInfo::sizes, call_guard)
       .def("dtype", &PyTensorInfo::dtype, call_guard)

setup.py

Lines changed: 3 additions & 0 deletions
@@ -731,6 +731,9 @@ def run(self): # noqa C901
             cmake_build_args += ["--target", "portable_lib"]
             cmake_build_args += ["--target", "selective_build"]
 
+            if cmake_cache.is_enabled("EXECUTORCH_BUILD_EXTENSION_MODULE"):
+                cmake_build_args += ["--target", "extension_module"]
+
             if cmake_cache.is_enabled("EXECUTORCH_BUILD_EXTENSION_TRAINING"):
                 cmake_build_args += ["--target", "_training_lib"]
 
