Skip to content

Commit 094d00e

Browse files
JacobSzwejbkafacebook-github-bot
authored and committed
set_output_data_ptr api (#223)
Summary: Pull Request resolved: #223 People really shouldn't be using get_output and mutating the structure; this provides a way to set the output data ptr in a more controlled manner Reviewed By: iseeyuan Differential Revision: D49029435 fbshipit-source-id: 44f527d99a0d2c50bbe5a022757adcbd4f7ae20f
1 parent 64d451f commit 094d00e

File tree

10 files changed

+256
-28
lines changed

10 files changed

+256
-28
lines changed

runtime/core/exec_aten/util/tensor_util.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -910,6 +910,12 @@ __ET_NODISCARD Error copy_tensor_data(
910910
const exec_aten::Tensor& t_dst,
911911
const exec_aten::Tensor& t_src);
912912

913+
/**
914+
* Set the data_ptr of t to buffer.
915+
*/
916+
__ET_NODISCARD Error
917+
set_tensor_data(const exec_aten::Tensor& t, void* buffer, size_t buffer_size);
918+
913919
/**
914920
* Reset tensor's data_ptr, clear all the storage for at::Tensor.
915921
*/

runtime/core/exec_aten/util/tensor_util_aten.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,19 @@ Error copy_tensor_data(const at::Tensor& t_dst, const at::Tensor& t_src) {
133133
return Error::Ok;
134134
}
135135

136+
__ET_NODISCARD Error
137+
set_tensor_data(const at::Tensor& t, void* buffer, size_t buffer_size) {
138+
ET_CHECK_OR_RETURN_ERROR(
139+
buffer_size >= t.nbytes(),
140+
InvalidArgument,
141+
"buffer_size %zu is smaller than tensor nbytes %zu",
142+
buffer_size,
143+
t.nbytes());
144+
t.unsafeGetTensorImpl()->unsafe_storage().set_data_ptr(
145+
at::DataPtr(buffer, DeviceType::CPU));
146+
return Error::Ok;
147+
}
148+
136149
void reset_data_ptr(const at::Tensor& tensor) {
137150
auto impl = tensor.unsafeGetTensorImpl();
138151
impl->set_sizes_contiguous(0);

runtime/core/exec_aten/util/tensor_util_portable.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,20 @@ Error copy_tensor_data(
116116
return Error::Ok;
117117
}
118118

119+
__ET_NODISCARD Error set_tensor_data(
120+
const torch::executor::Tensor& t,
121+
void* buffer,
122+
size_t buffer_size) {
123+
ET_CHECK_OR_RETURN_ERROR(
124+
buffer_size >= t.nbytes(),
125+
InvalidArgument,
126+
"buffer_size %zu is smaller than tensor nbytes %zu",
127+
buffer_size,
128+
t.nbytes());
129+
t.unsafeGetTensorImpl()->set_data(buffer);
130+
return Error::Ok;
131+
}
132+
119133
void reset_data_ptr(const torch::executor::Tensor& tensor) {
120134
// Lean mode doesn't deallocate the tensor data_ptr in the allocator
121135
tensor.set_data(nullptr);

runtime/executor/method.cpp

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -632,6 +632,17 @@ Error Method::init(executorch_flatbuffer::ExecutionPlan* s_plan) {
632632
}
633633
}
634634

635+
pre_allocated_output_ = false;
636+
637+
// Get pre_allocation info for output tensors
638+
for (int i = 0; i < outputs_size(); i++) {
639+
if (get_output(i).isTensor()) {
640+
pre_allocated_output_ =
641+
get_output(i).toTensor().const_data_ptr() != nullptr;
642+
break;
643+
}
644+
}
645+
635646
ET_CHECK_OR_RETURN_ERROR(
636647
n_chains_ > 0,
637648
Internal,
@@ -799,6 +810,51 @@ Method::set_inputs(const exec_aten::ArrayRef<EValue>& input_evalues) {
799810
return Error::Ok;
800811
}
801812

813+
__ET_NODISCARD Error
814+
Method::set_output_data_ptr(void* buffer, size_t size, size_t output_idx) {
815+
// Check method state
816+
ET_CHECK_OR_RETURN_ERROR(
817+
initialized(),
818+
InvalidState,
819+
"Outputs can not be retrieved until method has been initialized.");
820+
821+
ET_CHECK_OR_RETURN_ERROR(
822+
!pre_allocated_output_,
823+
InvalidState,
824+
"Overriding output data pointer allocated by memory plan is not allowed.");
825+
826+
// Check the args
827+
ET_CHECK_OR_RETURN_ERROR(
828+
output_idx < outputs_size(),
829+
InvalidArgument,
830+
"output_idx: %zu num_outputs: %zu",
831+
output_idx,
832+
outputs_size());
833+
834+
auto& output = mutable_output(output_idx);
835+
ET_CHECK_OR_RETURN_ERROR(
836+
output.isTensor(),
837+
InvalidArgument,
838+
"output type: %zu is not tensor",
839+
(size_t)output.tag);
840+
841+
auto& t = output.toTensor();
842+
ET_CHECK_OR_RETURN_ERROR(
843+
output.isTensor(),
844+
InvalidArgument,
845+
"output type: %zu is not tensor",
846+
(size_t)output.tag);
847+
ET_CHECK_OR_RETURN_ERROR(
848+
t.nbytes() <= size,
849+
InvalidArgument,
850+
"buffer size: %zu is smaller than expected tensor size: %zu",
851+
size,
852+
t.nbytes());
853+
854+
// Set data
855+
return internal::set_tensor_data(t, buffer, size);
856+
}
857+
802858
__ET_NODISCARD Error
803859
Method::get_outputs(EValue* output_evalues, size_t length) {
804860
ET_CHECK_OR_RETURN_ERROR(

runtime/executor/method.h

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,8 @@ class Method final {
6363
n_chains_(0),
6464
chains_(nullptr),
6565
init_state_(InitializationState::Uninitialized),
66-
pre_allocated_input_(false) {}
66+
pre_allocated_input_(false),
67+
pre_allocated_output_(false) {}
6768

6869
/**
6970
* Move ctor. Takes ownership of resources previously owned by `rhs`,
@@ -82,7 +83,8 @@ class Method final {
8283
n_chains_(rhs.n_chains_),
8384
chains_(rhs.chains_),
8485
init_state_(rhs.init_state_),
85-
pre_allocated_input_(rhs.pre_allocated_input_) {
86+
pre_allocated_input_(rhs.pre_allocated_input_),
87+
pre_allocated_output_(rhs.pre_allocated_output_) {
8688
// Required: clear out fields that the dtor looks at, so that we don't free
8789
// anything twice.
8890
rhs.n_value_ = 0;
@@ -97,10 +99,11 @@ class Method final {
9799
rhs.program_ = nullptr;
98100
rhs.memory_manager_ = nullptr;
99101
rhs.serialization_plan_ = nullptr;
102+
rhs.event_tracer_ = nullptr;
100103
rhs.n_chains_ = 0;
101104
rhs.chains_ = nullptr;
102105
rhs.pre_allocated_input_ = false;
103-
rhs.event_tracer_ = nullptr;
106+
rhs.pre_allocated_output_ = false;
104107
}
105108

106109
/**
@@ -144,6 +147,28 @@ class Method final {
144147
__ET_NODISCARD Error
145148
set_inputs(const exec_aten::ArrayRef<EValue>& input_evalues);
146149

150+
/**
151+
* Sets the data buffer of the specified method output to the provided value.
152+
*
153+
* NOTE: Based on the memory plan of the method, the output tensors may not
154+
* have buffer space pre-allocated for them, in this case the executor will
155+
* point those tensors to the buffer provided here, so the user should take
156+
* care that the life span of this memory outlasts the executor forward.
157+
*
158+
* @param[in] buffer The block of memory to point the specified tensor at.
159+
*
160+
* @param[in] size the length of buffer in bytes, must be >= the nbytes of the
161+
* specified tensor.
162+
*
163+
* @param[in] output_idx The index of the output to set the data_ptr for. Must
164+
* correspond to a tensor, and that tensor must not have had a buffer
165+
* allocated by the memory plan.
166+
*
167+
* @returns Error::Ok on success, non-Ok on failure.
168+
*/
169+
__ET_NODISCARD Error
170+
set_output_data_ptr(void* buffer, size_t size, size_t output_idx);
171+
147172
/**
148173
* Copies the method's outputs into the provided array.
149174
*
@@ -263,6 +288,7 @@ class Method final {
263288

264289
InitializationState init_state_;
265290
bool pre_allocated_input_;
291+
bool pre_allocated_output_;
266292

267293
/**
268294
* Parses the elements of the values_ array. On error, n_value_ will be set to

runtime/executor/test/method_test.cpp

Lines changed: 88 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -33,46 +33,41 @@ constexpr size_t kDefaultRuntimeMemBytes = 32 * 1024U;
3333

3434
class MethodTest : public ::testing::Test {
3535
protected:
36-
void SetUp() override {
37-
// Create a loader for the serialized ModuleAdd program.
38-
const char* path = std::getenv("ET_MODULE_ADD_PATH");
36+
void load_program(const char* path, const char* module_name) {
37+
// Create a loader for the serialized program.
3938
Result<FileDataLoader> loader = FileDataLoader::From(path);
4039
ASSERT_EQ(loader.error(), Error::Ok);
41-
add_loader_ = std::make_unique<FileDataLoader>(std::move(loader.get()));
40+
loaders_.insert(
41+
{module_name,
42+
std::make_unique<FileDataLoader>(std::move(loader.get()))});
4243

4344
// Use it to load the program.
4445
Result<Program> program = Program::Load(
45-
add_loader_.get(), Program::Verification::InternalConsistency);
46+
loaders_[module_name].get(),
47+
Program::Verification::InternalConsistency);
4648
ASSERT_EQ(program.error(), Error::Ok);
47-
add_program_ = std::make_unique<Program>(std::move(program.get()));
48-
49-
// Create a loader for the serialized ModuleIndex program.
50-
const char* index_path = std::getenv("ET_MODULE_INDEX_PATH");
51-
Result<FileDataLoader> index_loader = FileDataLoader::From(index_path);
52-
ASSERT_EQ(index_loader.error(), Error::Ok);
53-
index_loader_ =
54-
std::make_unique<FileDataLoader>(std::move(index_loader.get()));
49+
programs_.insert(
50+
{module_name, std::make_unique<Program>(std::move(program.get()))});
51+
}
5552

56-
// Use it to load the program.
57-
Result<Program> index_program = Program::Load(
58-
index_loader_.get(), Program::Verification::InternalConsistency);
59-
ASSERT_EQ(index_program.error(), Error::Ok);
60-
index_program_ = std::make_unique<Program>(std::move(index_program.get()));
53+
void SetUp() override {
54+
load_program(std::getenv("ET_MODULE_ADD_PATH"), "add");
55+
load_program(std::getenv("ET_MODULE_INDEX_PATH"), "index");
56+
load_program(
57+
std::getenv("ET_MODULE_DYNAMIC_CAT_UNALLOCATED_IO_PATH"), "cat");
6158
}
6259

6360
private:
6461
// Must outlive program_, but tests shouldn't need to touch it.
65-
std::unique_ptr<FileDataLoader> add_loader_;
66-
std::unique_ptr<FileDataLoader> index_loader_;
62+
std::unordered_map<std::string, std::unique_ptr<FileDataLoader>> loaders_;
6763

6864
protected:
69-
std::unique_ptr<Program> add_program_;
70-
std::unique_ptr<Program> index_program_;
65+
std::unordered_map<std::string, std::unique_ptr<Program>> programs_;
7166
};
7267

7368
TEST_F(MethodTest, MoveTest) {
7469
ManagedMemoryManager mmm(kDefaultNonConstMemBytes, kDefaultRuntimeMemBytes);
75-
Result<Method> method = add_program_->load_method("forward", &mmm.get());
70+
Result<Method> method = programs_["add"]->load_method("forward", &mmm.get());
7671
ASSERT_EQ(method.error(), Error::Ok);
7772

7873
// Can execute the method.
@@ -97,7 +92,7 @@ TEST_F(MethodTest, MoveTest) {
9792

9893
TEST_F(MethodTest, SetPrimInputTest) {
9994
ManagedMemoryManager mmm(kDefaultNonConstMemBytes, kDefaultRuntimeMemBytes);
100-
Result<Method> method = add_program_->load_method("forward", &mmm.get());
95+
Result<Method> method = programs_["add"]->load_method("forward", &mmm.get());
10196
ASSERT_EQ(method.error(), Error::Ok);
10297

10398
// Can execute the method.
@@ -121,6 +116,75 @@ TEST_F(MethodTest, SetPrimInputTest) {
121116
torch::executor::util::FreeInputs(inputs);
122117
}
123118

119+
TEST_F(MethodTest, AliasedIOTest) {
120+
// TODO(T163238401)
121+
ManagedMemoryManager mmm(kDefaultNonConstMemBytes, kDefaultRuntimeMemBytes);
122+
Result<Method> method = programs_["cat"]->load_method("forward", &mmm.get());
123+
ASSERT_EQ(method.error(), Error::Ok);
124+
125+
// Set up io. Input and Output should share the same memory.
126+
constexpr int buffer_size = 16;
127+
float buffer[buffer_size]; // Initial input is (2,4) we then cat a (1,4) to it
128+
// twice for a final shape of (4,4)
129+
for (int i = 0; i < buffer_size; ++i) {
130+
buffer[i] = 0.f;
131+
}
132+
int32_t sizes[2] = {2, 4};
133+
uint8_t dim_order[2] = {0, 1};
134+
int32_t strides[2] = {4, 1};
135+
torch::executor::TensorImpl impl(
136+
torch::executor::ScalarType::Float, 2, sizes, buffer, dim_order, strides);
137+
138+
auto input_err = method->set_input(EValue(torch::executor::Tensor(&impl)), 0);
139+
ASSERT_EQ(input_err, Error::Ok);
140+
141+
auto output_err = method->set_output_data_ptr(buffer, sizeof(buffer), 0);
142+
ASSERT_EQ(output_err, Error::Ok);
143+
ASSERT_EQ(method->get_output(0).toTensor().const_data_ptr(), buffer);
144+
145+
// Execute the method once. Cat a 1x4 to a 2x4.
146+
auto execute_error = method->execute();
147+
ASSERT_EQ(execute_error, Error::Ok);
148+
149+
auto output = method->get_output(0);
150+
ASSERT_TRUE(output.isTensor());
151+
EXPECT_EQ(output.toTensor().sizes()[0], 3);
152+
EXPECT_EQ(output.toTensor().sizes()[1], 4);
153+
// Original input should be 0.
154+
for (size_t i = 0; i < 2 * 4; i++) {
155+
EXPECT_FLOAT_EQ(output.toTensor().const_data_ptr<float>()[i], 0.f);
156+
}
157+
// Section that was cat on should be 1.
158+
for (size_t i = 0; i < 1 * 4; i++) {
159+
EXPECT_FLOAT_EQ(
160+
output.toTensor().const_data_ptr<float>()[(2 * 4) + i], 1.f);
161+
}
162+
163+
// Set the input again to update the size.
164+
sizes[0] = output.toTensor().sizes()[0];
165+
torch::executor::TensorImpl impl_2(
166+
torch::executor::ScalarType::Float, 2, sizes, buffer, dim_order, strides);
167+
input_err = method->set_input(EValue(torch::executor::Tensor(&impl_2)), 0);
168+
ASSERT_EQ(input_err, Error::Ok);
169+
170+
// Execute the method again. Cat a 1x4 to a 3x4.
171+
execute_error = method->execute();
172+
ASSERT_EQ(execute_error, Error::Ok);
173+
174+
output = method->get_output(0);
175+
EXPECT_EQ(output.toTensor().sizes()[0], 4);
176+
EXPECT_EQ(output.toTensor().sizes()[1], 4);
177+
// Original input should be 0.
178+
for (size_t i = 0; i < 2 * 4; i++) {
179+
EXPECT_FLOAT_EQ(output.toTensor().const_data_ptr<float>()[i], 0.f);
180+
}
181+
// Previous section and the new one that were cat on should be 1.
182+
for (size_t i = 0; i < 2 * 4; i++) {
183+
EXPECT_FLOAT_EQ(
184+
output.toTensor().const_data_ptr<float>()[(2 * 4) + i], 1.f);
185+
}
186+
}
187+
124188
// TODO(T161163608): Test is disabled due to a resize bug in tensor_index_out of
125189
// the portable op lib
126190

runtime/executor/test/targets.bzl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ def define_common_targets(is_fbcode = False):
103103
# an fbcode target path because the authoring/export tools
104104
# intentionally don't work in xplat (since they're host-only tools).
105105
"ET_MODULE_ADD_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleAdd.pte])",
106+
"ET_MODULE_DYNAMIC_CAT_UNALLOCATED_IO_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleDynamicCatUnallocatedIO.pte])",
106107
"ET_MODULE_INDEX_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleIndex.pte])",
107108
"ET_MODULE_MULTI_ENTRY_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleMultipleEntry.pte])",
108109
}

test/end2end/exported_module.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,12 +130,26 @@ def return_wrapper():
130130
for method in methods:
131131
method_name_to_args[method] = trace_inputs
132132

133+
method_name_to_constraints = None
134+
if hasattr(eager_module, "get_constraints"):
135+
assert capture_config is not None
136+
assert capture_config.enable_aot is True
137+
trace_constraints = eager_module.get_constraints()
138+
method_name_to_constraints = {}
139+
for method in methods:
140+
method_name_to_constraints[method] = trace_constraints
141+
142+
memory_planning_pass = MemoryPlanningPass("greedy")
143+
if hasattr(eager_module, "get_memory_planning_pass"):
144+
memory_planning_pass = eager_module.get_memory_planning_pass()
145+
133146
# Capture an executorch program.
134147
executorch_program = (
135148
exir.capture_multiple(
136149
eager_module,
137150
method_name_to_args,
138151
capture_config,
152+
constraints=method_name_to_constraints,
139153
)
140154
.to_edge(exir.EdgeCompileConfig(_check_ir_validity=False))
141155
.to_executorch(
@@ -150,7 +164,7 @@ def return_wrapper():
150164
to_scratch_op_pass,
151165
],
152166
dynamic_memory_planning_mode=dynamic_memory_planning_mode,
153-
memory_planning_pass=MemoryPlanningPass("greedy"),
167+
memory_planning_pass=memory_planning_pass,
154168
to_out_var_pass=ToOutVarPass(ignore_to_out_var_failure),
155169
)
156170
)

0 commit comments

Comments
 (0)