Skip to content

Commit f0baf1d

Browse files
JacobSzwejbka authored and facebook-github-bot committed
training module takes .ptd
Summary: Allow TrainingModule to take in a .ptd. Also realized I was only caching the gradient tensors, not the params, so went ahead and fixed that. Updated the export script to generate training modules with separated weights. Fixed a bug in tensor parsing for external mutable tensors. Differential Revision: D69547105
1 parent 84273f4 commit f0baf1d

File tree

7 files changed

+129
-60
lines changed

7 files changed

+129
-60
lines changed

extension/training/module/test/targets.bzl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ def define_common_targets(is_fbcode = False):
1717
# intentionally don't work in xplat (since they're host-only tools).
1818
"ET_MODULE_ADD_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleAdd.pte])",
1919
"ET_MODULE_SIMPLE_TRAIN_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleSimpleTrain.pte])",
20+
"ET_MODULE_TRAIN_PROGRAM_PATH": "$(location fbcode//executorch/test/models:exported_program_and_data[ModuleSimpleTrainProgram.pte])",
21+
"ET_MODULE_TRAIN_DATA_PATH": "$(location fbcode//executorch/test/models:exported_program_and_data[ModuleSimpleTrain.ptd])",
2022
}
2123

2224
runtime.cxx_test(
@@ -28,6 +30,7 @@ def define_common_targets(is_fbcode = False):
2830
"//executorch/extension/training/module:training_module",
2931
"//executorch/extension/data_loader:file_data_loader",
3032
"//executorch/runtime/core/exec_aten/testing_util:tensor_util",
33+
"//executorch/extension/flat_tensor:flat_tensor_data_map",
3134
"//executorch/kernels/portable:generated_lib",
3235
],
3336
env = modules_env,

extension/training/module/test/training_module_test.cpp

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#include <executorch/extension/data_loader/file_data_loader.h>
1010
#include <executorch/extension/training/module/training_module.h>
11+
#include <executorch/extension/flat_tensor/flat_tensor_data_map.h>
1112

1213
#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
1314
#include <executorch/runtime/platform/runtime.h>
@@ -21,6 +22,14 @@ using executorch::aten::Tensor;
2122
using torch::executor::Error;
2223
using torch::executor::Span;
2324
using torch::executor::testing::TensorFactory;
25+
using executorch::extension::FlatTensorDataMap;
26+
using executorch::extension::FlatTensorHeader;
27+
using executorch::runtime::DataLoader;
28+
using executorch::runtime::Error;
29+
using executorch::runtime::FreeableBuffer;
30+
using executorch::runtime::Result;
31+
using executorch::runtime::TensorLayout;
32+
using torch::executor::util::FileDataLoader;
2433

2534
class TrainingModuleTest : public ::testing::Test {
2635
protected:
@@ -105,3 +114,36 @@ TEST_F(TrainingModuleTest, NonTrainingModuleTest) {
105114
auto res = mod.execute_forward_backward("forward", inputs);
106115
ASSERT_EQ(res.error(), Error::InvalidArgument);
107116
}
117+
118+
TEST_F(TrainingModuleTest, SeperateDataTest) {
119+
// Load data map.
120+
// The eager linear model is defined at:
121+
// //executorch/test/models/linear_model.py
122+
const char* ptd_path = std::getenv("ET_MODULE_TRAIN_DATA_PATH");
123+
Result<FileDataLoader> data_map_loader_res = FileDataLoader::from(ptd_path);
124+
ASSERT_EQ(data_map_loader_res.error(), Error::Ok);
125+
126+
auto data_map_loader = std::make_unique<torch::executor::util::FileDataLoader>(
127+
std::move(data_map_loader_res.get()));
128+
129+
const char* pte_path = std::getenv("ET_MODULE_TRAIN_PROGRAM_PATH");
130+
Result<FileDataLoader> pte_loader_res = FileDataLoader::from(pte_path);
131+
ASSERT_EQ(pte_loader_res.error(), Error::Ok);
132+
133+
auto pte_loader = std::make_unique<torch::executor::util::FileDataLoader>(
134+
std::move(pte_loader_res.get()));
135+
136+
auto mod = executorch::extension::training::TrainingModule(std::move(pte_loader), nullptr, nullptr, nullptr, std::move(data_map_loader));
137+
138+
TensorFactory<ScalarType::Float> tf;
139+
Tensor input = tf.make({3}, {1.0, 1.0, 1.0});
140+
Tensor label = tf.make({3}, {1.0, 0.0, 0.0});
141+
142+
std::vector<executorch::runtime::EValue> inputs;
143+
inputs.push_back(input);
144+
inputs.push_back(label);
145+
146+
auto res = mod.execute_forward_backward("forward", inputs);
147+
ASSERT_EQ(res.error(), Error::Ok);
148+
ASSERT_EQ(res.get().size(), 1);
149+
}

extension/training/module/training_module.cpp

Lines changed: 43 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ TrainingModule::execute_forward_backward(
4343
uint64_t param_start = param_res.get()[0].toInt();
4444

4545
// Execute the forward and backward pass.
46-
4746
auto outputs = torch::executor::Module::execute(method_name, input);
4847
if (!outputs.ok()) {
4948
return outputs.error();
@@ -56,19 +55,22 @@ TrainingModule::execute_forward_backward(
5655
user_outputs.push_back(outputs.get().at(i));
5756
}
5857

59-
// Extract and store the gradients.
58+
// Extract and store the gradients and params if this is the first time seeing this method.
6059
if (method_named_gradients_.find(method_name) ==
6160
method_named_gradients_.end()) {
61+
// Fully qualified names
62+
std::vector<runtime::EValue> fqn_list;
6263
method_named_gradients_.insert({method_name, {}});
6364

6465
auto& gradients_map = method_named_gradients_.at(method_name);
65-
// Get names.
66+
67+
// Get names if we haven't seen this method before.
6668
const std::string fqn_method_name = fqn_method_prefix + method_name;
6769
auto fqn_res = executorch::extension::Module::execute(fqn_method_name);
6870
if (!fqn_res.ok()) {
6971
return fqn_res.error();
7072
}
71-
const auto& fqn_list = fqn_res.get();
73+
fqn_list = fqn_res.get();
7274

7375
// Only have to initialize the dict once because the tensors in the dict and
7476
// the tensors in the method alias the same TensorImpl, so updating one will
@@ -87,43 +89,48 @@ TrainingModule::execute_forward_backward(
8789
runtime::Result<
8890
const std::map<executorch::aten::string_view, executorch::aten::Tensor>>
8991
TrainingModule::named_parameters(const std::string& method_name) {
90-
std::map<executorch::aten::string_view, executorch::aten::Tensor>
91-
named_parameters;
92-
const std::string fqn_method_name = fqn_method_prefix + method_name;
93-
const std::string parameters_method_name =
94-
parameters_method_prefix + method_name;
92+
// If we haven't seen this method before, populate the dict.
93+
if (method_named_parameters_.find(method_name) ==
94+
method_named_parameters_.end()) {
95+
const std::string fqn_method_name = fqn_method_prefix + method_name;
96+
const std::string parameters_method_name =
97+
parameters_method_prefix + method_name;
9598

96-
// get names.
97-
auto fqn_res = executorch::extension::Module::execute(fqn_method_name);
98-
if (!fqn_res.ok()) {
99-
return fqn_res.error();
100-
}
101-
const auto& fqn_list = fqn_res.get();
99+
method_named_parameters_.insert({method_name, {}});
102100

103-
// get params start.
104-
auto param_res =
105-
executorch::extension::Module::execute(parameters_method_name);
106-
if (!param_res.ok()) {
107-
return param_res.error();
108-
}
101+
// get names.
102+
auto fqn_res = executorch::extension::Module::execute(fqn_method_name);
103+
if (!fqn_res.ok()) {
104+
return fqn_res.error();
105+
}
106+
const auto& fqn_list = fqn_res.get();
109107

110-
uint64_t param_start = param_res.get()[0].toInt();
108+
// get params start.
109+
auto param_res =
110+
executorch::extension::Module::execute(parameters_method_name);
111+
if (!param_res.ok()) {
112+
return param_res.error();
113+
}
111114

112-
auto e = executorch::extension::Module::load_method(method_name);
113-
if (e != runtime::Error::Ok) {
114-
return e;
115-
}
116-
auto& method = methods_.at(method_name).method;
117-
118-
// create dict
119-
size_t name_index = 0;
120-
for (size_t param_index = param_start; param_index < method->outputs_size();
121-
++param_index, ++name_index) {
122-
executorch::aten::string_view fqn = fqn_list.at(name_index).toString();
123-
executorch::aten::Tensor param = method->get_output(param_index).toTensor();
124-
named_parameters.insert({fqn, param});
115+
uint64_t param_start = param_res.get()[0].toInt();
116+
117+
// Load the method if it is not already loaded.
118+
auto e = executorch::extension::Module::load_method(method_name);
119+
if (e != runtime::Error::Ok) {
120+
return e;
121+
}
122+
auto& method = methods_.at(method_name).method;
123+
124+
// populate dict
125+
size_t name_index = 0;
126+
for (size_t param_index = param_start; param_index < method->outputs_size();
127+
++param_index, ++name_index) {
128+
executorch::aten::string_view fqn = fqn_list.at(name_index).toString();
129+
executorch::aten::Tensor param = method->get_output(param_index).toTensor();
130+
method_named_parameters_.at(method_name).insert({fqn, param});
131+
}
125132
}
126-
return named_parameters;
133+
return method_named_parameters_.at(method_name);
127134
}
128135

129136
runtime::Result<

extension/training/module/training_module.h

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,16 @@ class ET_EXPERIMENTAL TrainingModule final
3333
std::unique_ptr<runtime::DataLoader> data_loader,
3434
std::unique_ptr<runtime::MemoryAllocator> memory_allocator = nullptr,
3535
std::unique_ptr<runtime::MemoryAllocator> temp_allocator = nullptr,
36-
std::unique_ptr<runtime::EventTracer> event_tracer = nullptr)
36+
std::unique_ptr<runtime::EventTracer> event_tracer = nullptr,
37+
std::unique_ptr<runtime::DataLoader> data_map_data_loader = nullptr)
3738
: executorch::extension::Module(
3839
std::move(data_loader),
3940
std::move(memory_allocator),
4041
std::move(temp_allocator),
41-
std::move(event_tracer)),
42-
method_named_gradients_({}) {}
42+
std::move(event_tracer),
43+
std::move(data_map_data_loader)),
44+
method_named_gradients_({}),
45+
method_named_parameters_({}) {}
4346

4447
explicit TrainingModule(const Module&) = delete;
4548
TrainingModule& operator=(const Module&) = delete;
@@ -97,6 +100,11 @@ class ET_EXPERIMENTAL TrainingModule final
97100
std::string,
98101
std::map<executorch::aten::string_view, executorch::aten::Tensor>>
99102
method_named_gradients_;
103+
104+
std::unordered_map<
105+
std::string,
106+
std::map<executorch::aten::string_view, executorch::aten::Tensor>>
107+
method_named_parameters_;
100108
};
101109

102110
} // namespace training

runtime/executor/tensor_parser_exec_aten.cpp

Lines changed: 17 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -169,23 +169,8 @@ ET_NODISCARD Result<void*> getTensorDataPtr(
169169
const executorch_flatbuffer::AllocationDetails* allocation_info =
170170
s_tensor->allocation_info();
171171

172-
// Memory Planned, with initial state
173-
if (data_buffer_idx > 0 && allocation_info != nullptr) {
174-
auto planned_ptr = getMemPlannedPtr(allocation_info, nbytes, allocator);
175-
if (!planned_ptr.ok()) {
176-
return planned_ptr.error();
177-
}
178-
auto err = TensorParser::load_mutable_subsegment_into(
179-
program, 0, s_tensor->data_buffer_idx(), nbytes, planned_ptr.get());
180-
181-
if (err != Error::Ok) {
182-
return err;
183-
}
184-
return planned_ptr;
185-
}
186-
187172
// External tensors.
188-
else if (
173+
if (
189174
s_tensor->extra_tensor_info() != nullptr &&
190175
s_tensor->extra_tensor_info()->location() ==
191176
executorch_flatbuffer::TensorDataLocation::EXTERNAL) {
@@ -232,10 +217,9 @@ ET_NODISCARD Result<void*> getTensorDataPtr(
232217

233218
return planned_ptr;
234219
}
235-
}
236220

237221
// Constant, stored in PTE file.
238-
else if (data_buffer_idx > 0 && allocation_info == nullptr) {
222+
} else if (data_buffer_idx > 0 && allocation_info == nullptr) {
239223
auto const_data =
240224
program->get_constant_buffer_data(data_buffer_idx, nbytes);
241225
if (!const_data.ok()) {
@@ -246,7 +230,21 @@ ET_NODISCARD Result<void*> getTensorDataPtr(
246230
// guarantee that this data is never modified.
247231
return const_cast<void*>(const_data.get());
248232

249-
// Memory planned, no initial state
233+
// Memory Planned, with initial state
234+
} else if (data_buffer_idx > 0 && allocation_info != nullptr) {
235+
auto planned_ptr = getMemPlannedPtr(allocation_info, nbytes, allocator);
236+
if (!planned_ptr.ok()) {
237+
return planned_ptr.error();
238+
}
239+
auto err = TensorParser::load_mutable_subsegment_into(
240+
program, 0, s_tensor->data_buffer_idx(), nbytes, planned_ptr.get());
241+
242+
if (err != Error::Ok) {
243+
return err;
244+
}
245+
return planned_ptr;
246+
247+
// Memory planned, no initial state
250248
} else if (data_buffer_idx == 0 && allocation_info != nullptr) {
251249
return getMemPlannedPtr(allocation_info, nbytes, allocator);
252250

test/models/export_program.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,9 @@ def main() -> None:
276276
prog.write_to_file(fp)
277277
print(f"Exported {module_name} and wrote program data to {outfile}")
278278

279+
if args.external_constants:
280+
# current infra doesn't easily allow renaming this file, so just hackily do it here.
281+
prog._tensor_data[f"{module_name}"] = prog._tensor_data.pop("_default_external_constant")
279282
prog.write_tensor_data_to_file(args.outdir)
280283

281284

test/models/targets.bzl

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,13 +90,21 @@ def define_common_targets():
9090
# case, and typically shouldn't be done.
9191
_is_external_target = True,
9292
)
93+
94+
# Class names of nn.Modules for :exported_program_and_data to export.
95+
MODULES_AND_DATA_TO_EXPORT = [
96+
"ModuleLinear",
97+
"ModuleSimpleTrain",
98+
]
9399

94100
runtime.genrule(
95101
name = "exported_program_and_data",
96-
cmd = "$(exe :export_program) --modules ModuleLinear --external-constants --outdir $OUT",
102+
cmd = "$(exe :export_program) --modules " + ",".join(MODULES_AND_DATA_TO_EXPORT) + " --external-constants --outdir $OUT",
97103
outs = {
98104
"ModuleLinear.pte": ["ModuleLinearProgram.pte"],
99-
"ModuleLinear.ptd": ["_default_external_constant.ptd"],
105+
"ModuleLinear.ptd": ["ModuleLinearProgram.ptd"],
106+
"ModuleSimpleTrainProgram.pte": ["ModuleSimpleTrainProgram.pte"],
107+
"ModuleSimpleTrain.ptd": ["ModuleSimpleTrainProgram.ptd"],
100108
},
101109
default_outs = ["."],
102110
visibility = [

0 commit comments

Comments
 (0)