
Commit 7d8cb1b

JacobSzwejbka authored and facebook-github-bot committed
SGD and TrainingModule in Python (pytorch#5847)
Summary:
Add short-term pybindings and a training module API. The optimizer bindings are probably fairly stable in practice, but the training module is not. TrainingModule is a wrapper around the existing ET pybindings, which are under active development. A future PR will update the training bindings to match the long-term inference bindings.

Reviewed By: dvorjackz

Differential Revision: D63650449
1 parent: cbfdf78 · commit: 7d8cb1b

File tree: 12 files changed, +454 −4 lines changed

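A minimal sketch of how the pieces added in this commit fit together, assuming a training-enabled .pte program; "model.pte", the "forward" method name, and the input tensors are placeholders:

# Minimal sketch: "model.pte", the "forward" method name, and the inputs are
# placeholders for a real training-enabled .pte program.
import torch
from executorch.extension.training import (
    _load_for_executorch_for_training,
    get_sgd_optimizer,
)

module = _load_for_executorch_for_training("model.pte")

# Runs the joint forward/backward method; returns the user outputs and caches
# named gradients/parameters on the TrainingModule.
outputs = module.forward_backward("forward", (torch.randn(1, 8), torch.tensor([0])))

# Build the optimizer over the module's parameters, then apply this step's gradients.
sgd = get_sgd_optimizer(module.named_parameters(), lr=0.1, momentum=0.9)
sgd.step(module.named_gradients())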

examples/llm_pte_finetuning/runner.py

Lines changed: 1 addition & 1 deletion
@@ -98,7 +98,7 @@ def main() -> None:
         # for us to update with the gradients in-place.
         # See https://github.com/pytorch/executorch/blob/main/extension/pybindings/pybindings.cpp#L736
         # for more info.
-        out = et_mod.forward((tokens, labels), clone_outputs=False)  # pyre-ignore
+        out = et_mod.forward((tokens, labels), clone_outputs=False)

         loss = out[0]
         losses.append(loss.item())

extension/pybindings/pybindings.pyi

Lines changed: 12 additions & 3 deletions
@@ -33,11 +33,20 @@ class ExecuTorchModule:
     """

     # pyre-ignore[2, 3]: "Any" in parameter and return type annotations.
-    def __call__(self, inputs: Any) -> List[Any]: ...
+    def __call__(self, inputs: Any, clone_outputs: bool = True) -> List[Any]: ...
     # pyre-ignore[2, 3]: "Any" in parameter and return type annotations.
-    def run_method(self, method_name: str, inputs: Sequence[Any]) -> List[Any]: ...
+    def run_method(
+        self,
+        method_name: str,
+        inputs: Sequence[Any],  # pyre-ignore[2]: "Any" in parameter type annotations.
+        clone_outputs: bool = True,
+    ) -> List[Any]: ...
     # pyre-ignore[2, 3]: "Any" in parameter and return type annotations.
-    def forward(self, inputs: Sequence[Any]) -> List[Any]: ...
+    def forward(
+        self,
+        inputs: Sequence[Any],  # pyre-ignore[2]: "Any" in parameter type annotations.
+        clone_outputs: bool = True,
+    ) -> List[Any]: ...
     # pyre-ignore[3]: "Any" in return type annotations.
     def plan_execute(self) -> List[Any]: ...
     # Bundled program methods.
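The new clone_outputs flag is what the runner.py change above relies on: with clone_outputs=False the bindings hand back the runtime's own tensors rather than copies, so an optimizer can mutate weights in place. A minimal sketch, with a placeholder model path and input:

import torch
from executorch.extension.pybindings.portable_lib import _load_for_executorch

et_mod = _load_for_executorch("model.pte")  # placeholder path

# Default: outputs are cloned, so callers can freely mutate them without
# touching memory owned by the ExecuTorch method.
cloned = et_mod.forward((torch.randn(1, 8),))

# clone_outputs=False aliases the runtime-owned buffers instead, which is what
# training needs so gradient/parameter updates are visible on the next step.
aliased = et_mod.forward((torch.randn(1, 8),), clone_outputs=False)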

extension/training/TARGETS

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
+# Any targets that should be shared between fbcode and xplat must be defined in
+# targets.bzl. This file can contain fbcode-only targets.
+
+load("@fbcode_macros//build_defs:python_library.bzl", "python_library")
+load(":targets.bzl", "define_common_targets")
+
+oncall("executorch")
+
+define_common_targets()
+
+python_library(
+    name = "lib",
+    srcs = [
+        "__init__.py",
+    ],
+    deps = [
+        "//executorch/extension/training/pybindings:_training_lib",
+        "//executorch/extension/training/pybindings:_training_module",
+    ],
+)

extension/training/__init__.py

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.extension.training.pybindings._training_lib import get_sgd_optimizer
+
+from executorch.extension.training.pybindings._training_module import (
+    _load_for_executorch_for_training,
+    _load_for_executorch_for_training_from_buffer,
+    TrainingModule,
+)
+
+__all__ = [
+    "get_sgd_optimizer",
+    "TrainingModule",
+    "_load_for_executorch_for_training_from_buffer",
+    "_load_for_executorch_for_training",
+]

extension/training/pybindings/TARGETS

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+# Any targets that should be shared between fbcode and xplat must be defined in
+# targets.bzl. This file can contain fbcode-only targets.
+
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+load(":targets.bzl", "define_common_targets")
+
+oncall("executorch")
+
+define_common_targets()
+
+runtime.cxx_python_extension(
+    name = "_training_lib",
+    srcs = [
+        "_training_lib.cpp",
+    ],
+    base_module = "executorch.extension.training.pybindings",
+    types = ["_training_lib.pyi"],
+    visibility = ["//executorch/extension/training/..."],
+    deps = [
+        "//executorch/extension/aten_util:aten_bridge",
+        "//executorch/extension/training/optimizer:sgd",
+    ],
+    external_deps = [
+        "pybind11",
+        "libtorch_python",
+    ],
+)
+
+runtime.python_library(
+    name = "_training_module",
+    srcs = [
+        "_training_module.py",
+    ],
+    base_module = "executorch.extension.training.pybindings",
+    visibility = ["//executorch/extension/training/..."],
+    deps = [
+        "//caffe2:torch",
+        "//executorch/extension/pybindings:portable_lib",
+    ],
+)

extension/training/pybindings/_training_lib.cpp

Lines changed: 152 additions & 0 deletions
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <memory>
+
+#include <ATen/Tensor.h>
+#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
+#include <torch/csrc/utils/pybind.h>
+#include "executorch/extension/tensor/tensor.h"
+#include "executorch/extension/training/optimizer/sgd.h"
+#ifndef USE_ATEN_LIB
+#include <executorch/extension/aten_util/aten_bridge.h>
+#endif
+
+namespace py = pybind11;
+
+namespace executorch {
+namespace extension {
+namespace training {
+
+namespace {
+
+struct PySGD final {
+  explicit PySGD(
+      const py::dict& named_params,
+      double lr,
+      double momentum,
+      double dampening,
+      double weight_decay,
+      bool nesterov)
+      : sgd_(nullptr),
+        fqns_()
+#ifndef USE_ATEN_LIB
+        ,
+        params_()
+#endif
+  {
+    std::map<exec_aten::string_view, exec_aten::Tensor> cpp_inputs;
+    auto py_named_params =
+        py::cast<std::unordered_map<std::string, at::Tensor>>(named_params);
+    const auto params_size = py::len(named_params);
+    fqns_ = std::vector<std::string>();
+    fqns_.reserve(params_size);
+
+    for (auto pair : py_named_params) {
+      fqns_.push_back(pair.first);
+      exec_aten::string_view v{fqns_.back().c_str(), pair.first.size()};
+#ifndef USE_ATEN_LIB
+      // convert at::Tensor to torch::executor::Tensor
+      params_.emplace_back(alias_tensor_ptr_to_attensor(pair.second));
+      cpp_inputs.insert({v, *params_.back()});
+#else
+      cpp_inputs.insert({v, pair.second});
+#endif
+    }
+    sgd_ = std::make_unique<optimizer::SGD>(
+        cpp_inputs,
+        extension::training::optimizer::SGDOptions(
+            lr, momentum, dampening, weight_decay, nesterov));
+  }
+
+  // Not needed for now, so just delete.
+  PySGD(const PySGD&) = delete;
+  PySGD& operator=(const PySGD&) = delete;
+  PySGD(PySGD&&) = delete;
+  PySGD& operator=(PySGD&&) = delete;
+
+  void step(const py::dict& py_dict) {
+    auto py_named_gradients =
+        py::cast<std::unordered_map<std::string, at::Tensor>>(py_dict);
+    const auto inputs_size = py::len(py_dict);
+    std::map<exec_aten::string_view, exec_aten::Tensor> cpp_inputs;
+
+    std::vector<std::string> fqn;
+#ifndef USE_ATEN_LIB
+    std::vector<TensorPtr> et_tensors;
+#endif
+
+    // Convert python objects into cpp.
+    for (const auto& pair : py_named_gradients) {
+      fqn.push_back(pair.first);
+      auto at_tensor = pair.second;
+      // alias_etensor_to_attensor will assert on this later, so to better
+      // propagate up to python we check early and throw an exception.
+      if (!at_tensor.is_contiguous()) {
+        auto error_msg = "Gradient is not contiguous.";
+        throw std::runtime_error(error_msg);
+      }
+#ifndef USE_ATEN_LIB
+      // convert at::Tensor to torch::executor::Tensor
+      auto temp = alias_tensor_ptr_to_attensor(at_tensor);
+      et_tensors.push_back(temp);
+      cpp_inputs.insert({pair.first.c_str(), *et_tensors.back()});
+#else
+      cpp_inputs.insert({pair.first.c_str(), at_tensor});
+#endif
+    }
+
+    auto err = sgd_->step(cpp_inputs);
+    if (err != runtime::Error::Ok) {
+      throw std::runtime_error("SGD step failed");
+    }
+  }
+
+ private:
+  // TODO(jakeszwe): Write an optimizer interface and use it here instead of SGD
+  // specifically.
+  std::unique_ptr<optimizer::SGD> sgd_ = nullptr;
+  std::vector<std::string> fqns_;
+
+#ifndef USE_ATEN_LIB // Portable mode
+  std::vector<TensorPtr> params_;
+#endif
+  ;
+};
+
+static std::unique_ptr<PySGD> get_sgd_optimizer(
+    const py::dict& named_params,
+    double lr,
+    double momentum = 0,
+    double dampening = 0,
+    double weight_decay = 0,
+    bool nesterov = false) {
+  return std::make_unique<PySGD>(
+      named_params, lr, momentum, dampening, weight_decay, nesterov);
+}
+
+} // namespace
+
+PYBIND11_MODULE(_training_lib, m) {
+  m.def(
+      "get_sgd_optimizer",
+      &get_sgd_optimizer,
+      py::arg("named_params"),
+      py::arg("lr") = 0.1,
+      py::arg("momentum") = 0.0,
+      py::arg("dampening") = 0.0,
+      py::arg("weight_decay") = 0.0,
+      py::arg("nesterov") = false);
+  py::class_<PySGD>(m, "ExecuTorchSGD").def("step", &PySGD::step);
+}
+
+} // namespace training
+} // namespace extension
+} // namespace executorch

extension/training/pybindings/_training_lib.pyi

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from __future__ import annotations
+
+from typing import Any, Dict, List, Optional, Sequence, Tuple
+
+from executorch.exir._warnings import experimental
+from torch import Tensor
+
+@experimental("This API is experimental and subject to change without notice.")
+class ExecuTorchSGD:
+    """SGD Optimizer.
+
+    .. warning::
+
+        This API is experimental and subject to change without notice.
+    """
+
+    def step(self, named_gradients: Dict[str, Tensor]) -> None:
+        """Take a step in the direction of the gradients."""
+        ...
+
+@experimental("This API is experimental and subject to change without notice.")
+def get_sgd_optimizer(
+    named_parameters: Dict[str, Tensor],
+    lr: float,
+    momentum: float = 0,
+    dampening: float = 0,
+    weight_decay: float = 0,
+    nesterov: bool = False,
+) -> ExecuTorchSGD:
+    """Creates an SGD optimizer that operates on the passed-in named_parameters according to the specified hyperparameters.
+
+    .. warning::
+
+        This API is experimental and subject to change without notice.
+    ...
+    """
+    ...
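A small sketch of how these stubs are used; the parameter and gradient dicts below are stand-ins, since in practice they come from TrainingModule.named_parameters() and named_gradients():

import torch
from executorch.extension.training import get_sgd_optimizer

# Stand-in parameter and gradient dicts keyed by fully qualified name.
params = {"linear.weight": torch.randn(4, 4), "linear.bias": torch.randn(4)}
grads = {"linear.weight": torch.ones(4, 4), "linear.bias": torch.ones(4)}

sgd = get_sgd_optimizer(params, lr=0.01, momentum=0.9)
sgd.step(grads)  # applies the SGD update to the tensors in `params` in place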

extension/training/pybindings/_training_module.py

Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Any, Dict, List, Sequence
+
+from executorch.exir._warnings import experimental
+
+from executorch.extension.pybindings.portable_lib import (
+    _load_for_executorch,
+    _load_for_executorch_from_buffer,
+    ExecuTorchModule,
+)
+from torch import Tensor
+
+
+@experimental("This API is experimental and subject to change without notice.")
+class TrainingModule:
+    def __init__(self, module: ExecuTorchModule):
+        self.model = module
+
+        self.gradients_method_prefix = "__et_training_gradients_index_"
+        self.parameters_method_prefix = "__et_training_parameters_index_"
+        self.fqn_method_prefix = "__et_training_fqn_"
+
+        self.named_grads = None
+        self.named_params = None
+
+    def forward_backward(self, method_name: str, inputs: Sequence[Any]) -> List[Any]:
+        # The default ET model returns a large list of outputs that can logically be
+        # separated into [user outputs, gradients, parameters]. We can use these
+        # metadata methods to slice the list into the correct parts.
+        grad_start_idx = self.model.run_method(
+            self.gradients_method_prefix + method_name, ()
+        )[0]
+        params_start_idx = self.model.run_method(
+            self.parameters_method_prefix + method_name, ()
+        )[0]
+
+        full_outputs = self.model.run_method(method_name, inputs)
+
+        user_outs = full_outputs[:grad_start_idx]
+        grads = full_outputs[grad_start_idx:params_start_idx]
+        params = full_outputs[params_start_idx:]
+
+        # Important that the outputs are not cloned because we need the optimizer to
+        # be able to mutate the actual weights and not clones of them.
+        fqn = self.model.run_method(
+            self.fqn_method_prefix + method_name, (), clone_outputs=False
+        )
+
+        self.named_grads = dict(zip(fqn, grads))
+        if self.named_params is None:
+            self.named_params = dict(zip(fqn, params))
+
+        return user_outs
+
+    def named_gradients(self) -> Dict[str, Tensor]:
+        if self.named_grads is None:
+            raise RuntimeError("Must call forward_backward before named_grads")
+        return self.named_grads
+
+    def named_parameters(self) -> Dict[str, Tensor]:
+        if self.named_grads is None:
+            raise RuntimeError(
+                "Must call forward_backward before named_params. This will be fixed in a later version"
+            )
+        return self.named_params
+
+
+@experimental("This API is experimental and subject to change without notice.")
+def _load_for_executorch_for_training(path: str) -> TrainingModule:
+    et_module = _load_for_executorch(path)
+    return TrainingModule(et_module)
+
+
+@experimental("This API is experimental and subject to change without notice.")
+def _load_for_executorch_for_training_from_buffer(
+    buffer: bytes,
+) -> TrainingModule:
+    et_module = _load_for_executorch_from_buffer(buffer)
+    return TrainingModule(et_module)
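For reference, forward_backward partitions the flat output list of run_method using the exported metadata methods; a toy illustration with made-up indices and sizes:

# Toy illustration of how forward_backward splits run_method's output list.
# Suppose the method returns 7 values and the metadata methods report:
#   __et_training_gradients_index_forward  -> [3]
#   __et_training_parameters_index_forward -> [5]
full_outputs = ["loss", "logits", "acc", "g0", "g1", "p0", "p1"]
grad_start_idx, params_start_idx = 3, 5

user_outs = full_outputs[:grad_start_idx]               # ["loss", "logits", "acc"]
grads = full_outputs[grad_start_idx:params_start_idx]   # ["g0", "g1"]
params = full_outputs[params_start_idx:]                # ["p0", "p1"]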
