@@ -161,6 +161,83 @@ inline bool is_contiguous_tensor(
161161 return true ;
162162}
163163
// Check if any dimension has a broadcast stride (stride = 0 with size > 1).
// In PyTorch, stride = 0 means the same element is repeated (broadcast)
// along that dimension, so the data cannot be walked as a plain strided
// view without first materializing it.
//
// ndim: number of dimensions
// sizes_ptr: tensor sizes; may be nullptr (treated as "no broadcast")
// strides_ptr: tensor strides; may be nullptr (treated as "no broadcast")
// Returns true iff some dimension i has strides_ptr[i] == 0 and
// sizes_ptr[i] > 1.
inline bool has_broadcast_strides(
    int64_t ndim,
    const int64_t* sizes_ptr,
    const int64_t* strides_ptr) {
  // Guard BOTH arrays: the original checked only strides_ptr but then
  // dereferenced sizes_ptr unconditionally in the loop.
  if (sizes_ptr == nullptr || strides_ptr == nullptr) {
    return false;
  }
  for (int64_t i = 0; i < ndim; i++) {
    // stride == 0 on a size-1 dimension is harmless (no repetition);
    // only size > 1 actually aliases the same data.
    if (strides_ptr[i] == 0 && sizes_ptr[i] > 1) {
      return true;
    }
  }
  return false;
}
180+
// Materialize strided tensor data into a contiguous (row-major) buffer.
// Elements are copied one at a time, using the source strides to locate
// each element in src.
//
// src_data: pointer to source data (may be offset from the base allocation)
// dst_data: pointer to destination buffer (must be pre-allocated)
// ndim: number of dimensions
// sizes_ptr: tensor sizes
// strides_ptr: source strides (must not contain broadcast strides)
// element_size: size of each element in bytes
// Returns the number of bytes written to dst_data.
inline size_t materialize_strided(
    const void* src_data,
    void* dst_data,
    int64_t ndim,
    const int64_t* sizes_ptr,
    const int64_t* strides_ptr,
    size_t element_size) {
  // Rank-0 (scalar): exactly one element, no indexing required.
  if (ndim == 0) {
    std::memcpy(dst_data, src_data, element_size);
    return element_size;
  }

  // Number of elements in the materialized output (0 if any dim is 0).
  int64_t numel = 1;
  for (int64_t d = 0; d < ndim; d++) {
    numel *= sizes_ptr[d];
  }

  const char* src_bytes = static_cast<const char*>(src_data);
  char* dst_bytes = static_cast<char*>(dst_data);

  // Walk the output in row-major order while maintaining an "odometer"
  // coordinate and the matching element offset into src. This avoids the
  // per-element div/mod of a linear-index decomposition.
  std::vector<int64_t> coord(ndim, 0);
  int64_t src_elem_offset = 0;
  for (int64_t out_idx = 0; out_idx < numel; out_idx++) {
    std::memcpy(
        dst_bytes + out_idx * element_size,
        src_bytes + src_elem_offset * element_size,
        element_size);

    // Advance the odometer: bump the innermost dimension, carrying into
    // outer dimensions as each one wraps back to zero.
    for (int64_t d = ndim - 1; d >= 0; d--) {
      coord[d]++;
      src_elem_offset += strides_ptr[d];
      if (coord[d] < sizes_ptr[d]) {
        break;
      }
      // This dimension wrapped: undo its accumulated stride contribution
      // and continue carrying into the next-outer dimension.
      coord[d] = 0;
      src_elem_offset -= sizes_ptr[d] * strides_ptr[d];
    }
  }

  return static_cast<size_t>(numel) * element_size;
}
240+
164241} // namespace aoti
165242} // namespace backends
166243} // namespace executorch
0 commit comments