sandeepgupta12
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
Lines changed: 7 additions & 0 deletions
diff --git a/torch/_C/_aoti.pyi b/torch/_C/_aoti.pyi
Lines changed: 1 addition & 0 deletions
diff --git a/torch/csrc/inductor/aoti_include/mps.h b/torch/csrc/inductor/aoti_include/mps.h
Lines changed: 4 additions & 0 deletions
diff --git a/torch/csrc/inductor/aoti_runner/model_container_runner_mps.cpp b/torch/csrc/inductor/aoti_runner/model_container_runner_mps.cpp
Lines changed: 39 additions & 0 deletions
diff --git a/torch/csrc/inductor/aoti_runner/model_container_runner_mps.h b/torch/csrc/inductor/aoti_runner/model_container_runner_mps.h
Lines changed: 18 additions & 0 deletions
diff --git a/torch/csrc/inductor/aoti_runner/pybind.cpp b/torch/csrc/inductor/aoti_runner/pybind.cpp
Lines changed: 38 additions & 0 deletions
diff --git a/torch/csrc/inductor/aoti_runtime/model.h b/torch/csrc/inductor/aoti_runtime/model.h
Lines changed: 5 additions & 1 deletion
diff --git a/torch/csrc/inductor/aoti_torch/c/shim.h b/torch/csrc/inductor/aoti_torch/c/shim.h
Lines changed: 1 addition & 0 deletions
diff --git a/torch/csrc/inductor/aoti_torch/c/shim_mps.h b/torch/csrc/inductor/aoti_torch/c/shim_mps.h
Lines changed: 22 additions & 0 deletions
@@ -275,6 +275,11 @@ if(NOT INTERN_DISABLE_AUTOGRAD AND NOT BUILD_LITE_INTERPRETER)
       "${TORCH_SRC_DIR}/csrc/lazy/generated/RegisterLazy.cpp"
     )
   endif()
+  # AOTInductor MPS C shim: this generated source is appended to
+  # GENERATED_CXX_TORCH only when the build has USE_MPS enabled.
+  if(USE_MPS)
+    list(APPEND GENERATED_CXX_TORCH
+      "${TORCH_SRC_DIR}/csrc/inductor/aoti_torch/generated/c_shim_mps.cpp"
+    )
+  endif()
 endif()
 
 set(GENERATED_H_TORCH
@@ -703,6 +708,8 @@ list(APPEND Caffe2_CPU_SRCS ${TORCH_SRCS})
 
 if(USE_MPS)
   list(APPEND Caffe2_CPU_SRCS ${Caffe2_MPS_SRCS})
+  # AOTInductor MPS support: the hand-written shim and the model-container
+  # runner go into the same source list as the other MPS sources.
+  list(APPEND Caffe2_CPU_SRCS ${TORCH_SRC_DIR}/csrc/inductor/aoti_torch/shim_mps.cpp)
+  list(APPEND Caffe2_CPU_SRCS ${TORCH_SRC_DIR}/csrc/inductor/aoti_runner/model_container_runner_mps.cpp)
   if(CAN_COMPILE_METAL)
       file(TOUCH ${CMAKE_BINARY_DIR}/aten/src/ATen/metallib_dummy.cpp)
       list(APPEND Caffe2_CPU_SRCS ${CMAKE_BINARY_DIR}/aten/src/ATen/metallib_dummy.cpp)
 
@@ -19,6 +19,7 @@ def alloc_tensor_by_stealing_from_void_ptr(
 class AOTIModelContainerRunnerCpu: ...
 class AOTIModelContainerRunnerCuda: ...
 class AOTIModelContainerRunnerXpu: ...
+class AOTIModelContainerRunnerMps: ...  # bound only inside the __APPLE__ guard of aoti_runner/pybind.cpp
 
 # Defined in torch/csrc/inductor/aoti_package/pybind.cpp
 class AOTIModelPackageLoader: ...
@@ -0,0 +1,4 @@
+#pragma once
+
+#include <torch/csrc/inductor/aoti_include/common.h>
+#include <torch/csrc/inductor/cpp_wrapper/device_internal/mps.h>
@@ -0,0 +1,39 @@
+#if defined(__APPLE__)
+#include <torch/csrc/inductor/aoti_runner/model_container_runner_mps.h>
+
+namespace torch::inductor {
+
+// Builds an MPS runner by delegating to the AOTIModelContainerRunner base
+// constructor with the device string fixed to "mps".
+//
+// model_so_path, num_models and run_single_threaded are forwarded unchanged.
+AOTIModelContainerRunnerMps::AOTIModelContainerRunnerMps(
+    const std::string& model_so_path,
+    size_t num_models,
+    bool run_single_threaded)
+    : AOTIModelContainerRunner(
+          model_so_path,
+          num_models,
+          "mps",
+          // Presumably the base class's cubin_dir slot (compare the factory
+          // signature further down this file); always empty here -- TODO
+          // confirm against AOTIModelContainerRunner.
+          "",
+          run_single_threaded) {}
+
+// Defaulted destructor, defined out of line in this translation unit.
+AOTIModelContainerRunnerMps::~AOTIModelContainerRunnerMps() = default;
+
+namespace {
+// Factory for the "mps" device: returns a freshly constructed
+// AOTIModelContainerRunnerMps owned by the caller.
+//
+// Throws std::runtime_error when `device_str` is anything other than "mps".
+// `cubin_dir` is not used by this factory.
+std::unique_ptr<AOTIModelContainerRunner> create_aoti_runner_mps(
+    const std::string& model_so_path,
+    size_t num_models,
+    const std::string& device_str,
+    const std::string& cubin_dir,
+    const bool run_single_threaded) {
+  const bool targets_mps = (device_str == "mps");
+  if (targets_mps) {
+    return std::make_unique<AOTIModelContainerRunnerMps>(
+        model_so_path, num_models, run_single_threaded);
+  }
+  // Any other device string is a caller error.
+  throw std::runtime_error("Incorrect device passed to aoti_runner_mps");
+}
+} // namespace
+
+// File-scope registrar object constructed with the "mps" key and the factory
+// defined above.
+// NOTE(review): presumably its constructor records the factory in a
+// device-name -> factory registry during static initialization -- confirm
+// against RegisterAOTIModelRunner.
+static RegisterAOTIModelRunner register_mps_runner(
+    "mps",
+    &create_aoti_runner_mps);
+
+} // namespace torch::inductor
+#endif
@@ -0,0 +1,18 @@
+#if !defined(C10_MOBILE) && !defined(ANDROID)
+#pragma once
+
+#include <torch/csrc/inductor/aoti_runner/model_container_runner.h>
+
+namespace torch::inductor {
+// AOTIModelContainerRunner subclass for the "mps" device. It adds no members
+// of its own; it only supplies a constructor and a destructor.
+class TORCH_API AOTIModelContainerRunnerMps : public AOTIModelContainerRunner {
+ public:
+  // model_so_path: forwarded to the base runner (by its name, the path of the
+  //   compiled model shared library -- confirm against the base class).
+  // num_models: forwarded to the base runner; defaults to 1.
+  // run_single_threaded: forwarded to the base runner; defaults to false.
+  AOTIModelContainerRunnerMps(
+      const std::string& model_so_path,
+      size_t num_models = 1,
+      const bool run_single_threaded = false);
+
+  // Declared here, defined out of line in the matching .cpp file.
+  ~AOTIModelContainerRunnerMps() override;
+};
+
+} // namespace torch::inductor
+#endif
@@ -5,6 +5,9 @@
 #ifdef USE_XPU
 #include <torch/csrc/inductor/aoti_runner/model_container_runner_xpu.h>
 #endif
+#ifdef __APPLE__
+#include <torch/csrc/inductor/aoti_runner/model_container_runner_mps.h>
+#endif
 #include <torch/csrc/inductor/aoti_runner/pybind.h>
 #include <torch/csrc/inductor/aoti_torch/tensor_converter.h>
 #include <torch/csrc/inductor/aoti_torch/utils.h>
@@ -130,6 +133,41 @@ void initAOTIRunnerBindings(PyObject* module) {
           "free_inactive_constant_buffer",
           &AOTIModelContainerRunnerXpu::free_inactive_constant_buffer);
 
+#endif
+#ifdef __APPLE__
+  // Python class AOTIModelContainerRunnerMps. Only the (model_so_path,
+  // num_models) constructor is exposed, so run_single_threaded always takes
+  // its C++ default when the runner is created from Python.
+  // NOTE(review): this binding is compiled under __APPLE__, whereas
+  // caffe2/CMakeLists.txt adds model_container_runner_mps.cpp only under
+  // USE_MPS -- confirm that an Apple build with USE_MPS=OFF still links.
+  py::class_<AOTIModelContainerRunnerMps>(m, "AOTIModelContainerRunnerMps")
+      .def(py::init<const std::string&, int>())
+      .def(
+          "run",
+          &AOTIModelContainerRunnerMps::run,
+          py::arg("inputs"),
+          py::arg("stream_handle") = nullptr)
+      .def("get_call_spec", &AOTIModelContainerRunnerMps::get_call_spec)
+      .def(
+          "get_constant_names_to_original_fqns",
+          &AOTIModelContainerRunnerMps::getConstantNamesToOriginalFQNs)
+      .def(
+          "get_constant_names_to_dtypes",
+          &AOTIModelContainerRunnerMps::getConstantNamesToDtypes)
+      .def(
+          "extract_constants_map",
+          &AOTIModelContainerRunnerMps::extract_constants_map)
+      .def(
+          "update_constant_buffer",
+          // The cast pins the (map, bool, bool, bool) member-function
+          // signature of update_constant_buffer that is bound here.
+          static_cast<void (AOTIModelContainerRunnerMps::*)(
+              std::unordered_map<std::string, at::Tensor>&, bool, bool, bool)>(
+              &AOTIModelContainerRunnerMps::update_constant_buffer),
+          py::arg("tensor_map"),
+          py::arg("use_inactive"),
+          py::arg("validate_full_updates"),
+          py::arg("user_managed") = false)
+      .def(
+          "swap_constant_buffer",
+          &AOTIModelContainerRunnerMps::swap_constant_buffer)
+      .def(
+          "free_inactive_constant_buffer",
+          &AOTIModelContainerRunnerMps::free_inactive_constant_buffer);
+
 #endif
 
   m.def(
 
@@ -100,7 +100,7 @@ inline void parse_device_str(
     const std::string& device_str,
     int32_t& device_type,
     int32_t& device_idx) {
-  std::regex re("(cpu|cuda|xpu)(:([0-9]+))?");
+  std::regex re("(cpu|cuda|xpu|mps)(:([0-9]+))?");
   std::smatch sm;
   bool matched = std::regex_match(device_str, sm, re);
   AOTI_RUNTIME_CHECK(matched, "Invalid device: " + device_str);
@@ -112,6 +112,10 @@ inline void parse_device_str(
 #ifdef USE_XPU
   } else if (sm[1].str() == "xpu") {
     device_type = aoti_torch_device_type_xpu();
+#endif
+#ifdef __APPLE__
+  } else if (sm[1].str() == "mps") {
+    device_type = aoti_torch_device_type_mps();
 #endif
   } else {
     AOTI_RUNTIME_CHECK(false, "Invalid device: " + device_str);
 
@@ -106,6 +106,7 @@ AOTI_TORCH_EXPORT int32_t aoti_torch_device_type_cpu();
 AOTI_TORCH_EXPORT int32_t aoti_torch_device_type_cuda();
 AOTI_TORCH_EXPORT int32_t aoti_torch_device_type_meta();
 AOTI_TORCH_EXPORT int32_t aoti_torch_device_type_xpu();
+AOTI_TORCH_EXPORT int32_t aoti_torch_device_type_mps();
 AOTI_TORCH_EXPORT int32_t aoti_torch_device_type_privateuse1();
 
 AOTI_TORCH_EXPORT int32_t aoti_torch_dtype_float8_e5m2();
 
@@ -0,0 +1,22 @@
+#ifndef AOTI_TORCH_SHIM_MPS
+#define AOTI_TORCH_SHIM_MPS
+
+#include <torch/csrc/inductor/aoti_torch/c/shim.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Opaque type: this header only ever refers to it through the pointer alias
+// below, so its definition is not needed here.
+// NOTE(review): `using` is C++-only syntax, so this header cannot be consumed
+// from C despite the __cplusplus guards around extern "C".
+struct AOTIMetalKernelFunctionOpaque;
+using AOTIMetalKernelFunctionHandle = AOTIMetalKernelFunctionOpaque*;
+
+// NOTE(review): by its name, binds `tensor` as argument number `idx` of the
+// Metal kernel function `func`; the implementation is not in this header --
+// confirm in shim_mps.cpp. Returns an AOTITorchError.
+AOTI_TORCH_EXPORT AOTITorchError aoti_torch_mps_set_arg(
+    AOTIMetalKernelFunctionHandle func,
+    unsigned idx,
+    AtenTensorHandle tensor);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOTI_TORCH_SHIM_MPS