@@ -145,6 +145,28 @@ inline float blockReduceSum(
   return sharedScratch[0];
 }
 
+template <bool upper>
+inline device float& get_ref(device float* A, uint row, uint col, uint N);
+
+template <>
+inline device float& get_ref<true>(
+    device float* A,
+    uint row,
+    uint col,
+    uint N) {
+  return A[row * N + col];
+}
+
+template <>
+inline device float& get_ref<false>(
+    device float* A,
+    uint row,
+    uint col,
+    uint N) {
+  return A[row + col * N];
+}
+
+template <bool upper>
 kernel void factorDiagonalBlock(
     device float* A [[buffer(0)]],
     device int* info [[buffer(1)]],
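Note on the indexing helper: the two specializations are transposed views of the same buffer, so get_ref<false>(A, r, c, N) addresses the element that get_ref<true>(A, c, r, N) does. That is what lets one kernel body, written against get_ref<upper>, factor either the upper triangle of the row-major view or the lower triangle of the column-major view. A minimal sketch of a check for this aliasing, assuming the templates above are in scope (the kernel name checkGetRefTransposed is illustrative, not part of the patch):

#include <metal_stdlib>
using namespace metal;

// Writes 1 where the column-major accessor and the transposed
// row-major accessor alias the same buffer slot.
kernel void checkGetRefTransposed(
    device float* A [[buffer(0)]],
    device uint* ok [[buffer(1)]],
    constant uint& N [[buffer(2)]],
    uint2 gid [[thread_position_in_grid]]) {
  device float* lhs = &get_ref<false>(A, gid.y, gid.x, N);
  device float* rhs = &get_ref<true>(A, gid.x, gid.y, N);
  ok[gid.y * N + gid.x] = (lhs == rhs) ? 1u : 0u;
}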
@@ -171,7 +193,7 @@ kernel void factorDiagonalBlock(
   for (uint i = linear_tid; i < tileSize; i += group_size) {
     uint r = i / actSize;
     uint c = i % actSize;
-    tile[r][c] = A[batch_offset + (row0 + r) * N + (col0 + c)];
+    tile[r][c] = get_ref<upper>(A + batch_offset, row0 + r, col0 + c, N);
   }
   threadgroup_barrier(mem_flags::mem_threadgroup);
 
@@ -244,10 +266,33 @@ kernel void factorDiagonalBlock(
   for (uint i = linear_tid; i < tileSize; i += group_size) {
     uint r = i / actSize;
     uint c = i % actSize;
-    A[batch_offset + (row0 + r) * N + (col0 + c)] = tile[r][c];
+    get_ref<upper>(A + batch_offset, row0 + r, col0 + c, N) = tile[r][c];
   }
 }
 
+template [[host_name("factorDiagonalBlockU")]]
+kernel void factorDiagonalBlock<true>(
+    device float* A [[buffer(0)]],
+    device int* info [[buffer(1)]],
+    constant uint& N [[buffer(2)]],
+    constant uint& NB [[buffer(3)]],
+    constant uint& k [[buffer(4)]],
+    uint3 tid [[thread_position_in_threadgroup]],
+    uint3 bid [[threadgroup_position_in_grid]],
+    uint3 tpg [[threads_per_threadgroup]]);
+
+template [[host_name("factorDiagonalBlockL")]]
+kernel void factorDiagonalBlock<false>(
+    device float* A [[buffer(0)]],
+    device int* info [[buffer(1)]],
+    constant uint& N [[buffer(2)]],
+    constant uint& NB [[buffer(3)]],
+    constant uint& k [[buffer(4)]],
+    uint3 tid [[thread_position_in_threadgroup]],
+    uint3 bid [[threadgroup_position_in_grid]],
+    uint3 tpg [[threads_per_threadgroup]]);
+
+template <bool upper>
 kernel void applyTRSM(
     device float* A [[buffer(0)]],
     constant uint& N [[buffer(2)]],
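The explicit instantiations use Metal's [[host_name]] attribute to give each specialization its own entry point (factorDiagonalBlockU / factorDiagonalBlockL), so the host selects the upper or lower variant purely by pipeline-function name. A minimal sketch of the same pattern on a toy kernel (fillSign and its host names are illustrative only):

#include <metal_stdlib>
using namespace metal;

template <bool flag>
kernel void fillSign(
    device float* out [[buffer(0)]],
    uint tid [[thread_position_in_grid]]) {
  out[tid] = flag ? 1.0f : -1.0f;
}

// Each instantiation is a separate host-visible function, so looking
// up "fillSignPos" in the library selects the <true> variant.
template [[host_name("fillSignPos")]] kernel void fillSign<true>(
    device float* out [[buffer(0)]],
    uint tid [[thread_position_in_grid]]);
template [[host_name("fillSignNeg")]] kernel void fillSign<false>(
    device float* out [[buffer(0)]],
    uint tid [[thread_position_in_grid]]);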
@@ -283,12 +328,12 @@ kernel void applyTRSM(
   for (uint i = linear_tid; i < actSize_k * actSize_k; i += group_size) {
     uint r = i / actSize_k;
     uint c = i % actSize_k;
-    diag[i] = A[batch_offset + (k * NB + r) * N + (k * NB + c)];
+    diag[i] = get_ref<upper>(A + batch_offset, k * NB + r, k * NB + c, N);
   }
   for (uint i = linear_tid; i < actSize_j * actSize_k; i += group_size) {
     uint r = i / actSize_k;
     uint c = i % actSize_k;
-    target[i] = A[batch_offset + (row0 + r) * N + (col0 + c)];
+    target[i] = get_ref<upper>(A + batch_offset, row0 + r, col0 + c, N);
   }
   threadgroup_barrier(mem_flags::mem_threadgroup);
 
@@ -332,10 +377,31 @@ kernel void applyTRSM(
   for (uint i = linear_tid; i < actSize_j * actSize_k; i += group_size) {
     uint r = i / actSize_k;
     uint c = i % actSize_k;
-    A[batch_offset + (row0 + r) * N + (col0 + c)] = target[i];
+    get_ref<upper>(A + batch_offset, row0 + r, col0 + c, N) = target[i];
   }
 }
 
+template [[host_name("applyTRSMU")]]
+kernel void applyTRSM<true>(
+    device float* A [[buffer(0)]],
+    constant uint& N [[buffer(2)]],
+    constant uint& NB [[buffer(3)]],
+    constant uint& k [[buffer(4)]],
+    uint3 tid [[thread_position_in_threadgroup]],
+    uint3 tgid [[threadgroup_position_in_grid]],
+    uint3 tpg [[threads_per_threadgroup]]);
+
+template [[host_name("applyTRSML")]]
+kernel void applyTRSM<false>(
+    device float* A [[buffer(0)]],
+    constant uint& N [[buffer(2)]],
+    constant uint& NB [[buffer(3)]],
+    constant uint& k [[buffer(4)]],
+    uint3 tid [[thread_position_in_threadgroup]],
+    uint3 tgid [[threadgroup_position_in_grid]],
+    uint3 tpg [[threads_per_threadgroup]]);
+
+template <bool upper>
 kernel void applySYRK(
     device float* A [[buffer(0)]],
     constant uint& N [[buffer(2)]],
@@ -403,25 +469,37 @@ kernel void applySYRK(
       // Same logic to load/store Cfrag, Afrag, Bfrag...
       simdgroup_matrix<float, 8, 8> Cfrag;
       simdgroup_load(
-          Cfrag, &A[batch_offset + (row0 + sb_y) * N + (col0 + sb_x)], N);
+          Cfrag,
+          &get_ref<upper>(A + batch_offset, row0 + sb_y, col0 + sb_x, N),
+          N,
+          0,
+          !upper);
 
       for (uint kk = 0; kk < actSize_k; kk += 8) {
         simdgroup_load(
-            Afrag, &A[batch_offset + (row0 + sb_y) * N + (k * NB + kk)], N);
+            Afrag,
+            &get_ref<upper>(A + batch_offset, row0 + sb_y, k * NB + kk, N),
+            N,
+            0,
+            !upper);
         simdgroup_load(
             Bfrag,
-            &A[batch_offset + (col0 + sb_x) * N + (k * NB + kk)],
+            &get_ref<upper>(A + batch_offset, col0 + sb_x, k * NB + kk, N),
             N,
             /* matrix_origin = */ 0,
-            /* transpose = */ true);
+            /* transpose = */ upper);
 
         simdgroup_multiply(Prod, Afrag, Bfrag);
         simdgroup_multiply(Prod, Prod, negative_identity);
         simdgroup_multiply_accumulate(Cfrag, Cfrag, identity, Prod);
       }
 
       simdgroup_store(
-          Cfrag, &A[batch_offset + (row0 + sb_y) * N + (col0 + sb_x)], N);
+          Cfrag,
+          &get_ref<upper>(A + batch_offset, row0 + sb_y, col0 + sb_x, N),
+          N,
+          0,
+          !upper);
     }
   } else {
     // Fallback for non-multiple-of-8 dimensions
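Why the transpose flags flip rather than the arithmetic: simdgroup_load's trailing bool transposes the 8x8 tile during the load. A compressed sketch of the equivalence being exploited, assuming a tile M at base pointer p with row stride N (illustration, not patch code):

// If the buffer holds M in row-major order at p:
//   simdgroup_load(T, p, N, 0, /*transpose=*/false);  // T = M
//   simdgroup_load(T, p, N, 0, /*transpose=*/true);   // T = M^T
// If it instead holds M column-major, transpose=false already yields
// M^T. So switching upper <-> lower only toggles the flags (!upper
// for Cfrag/Afrag, upper for Bfrag), while the product
// simdgroup_multiply(Prod, Afrag, Bfrag) is left untouched.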
@@ -442,8 +520,10 @@ kernel void applySYRK(
 
         float sum = 0.0f;
         for (uint i = 0; i < actSize_k; i++) {
-          float a_val = A[batch_offset + (row0 + y) * N + k * NB + i];
-          float b_val = A[batch_offset + (col0 + x) * N + k * NB + i];
+          float a_val =
+              get_ref<upper>(A + batch_offset, row0 + y, k * NB + i, N);
+          float b_val =
+              get_ref<upper>(A + batch_offset, col0 + x, k * NB + i, N);
           sum = fma(a_val, b_val, sum);
         }
         sum_accumulator[y * tpg.x + x] += sum;
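The scalar fallback performs the same symmetric rank-k update as the simdgroup path, one element at a time; per output element it accumulates a dot product of two panel rows and subtracts it afterwards (sketch of the math, not patch code):

// For element (y, x) of the target tile:
//   C[row0 + y][col0 + x] -= sum over i < actSize_k of
//       Apanel[row0 + y][k * NB + i] * Apanel[col0 + x][k * NB + i]
// get_ref<upper> changes only how each operand is addressed, not the
// arithmetic; fma() fuses each multiply-add into one rounding step.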
@@ -452,13 +532,35 @@ kernel void applySYRK(
     threadgroup_barrier(mem_flags::mem_threadgroup);
     for (uint y = ty; y < actSize_j; y += tpg.y) {
       for (uint x = tx; x < actSize_h; x += tpg.x) {
-        A[batch_offset + (row0 + y) * N + col0 + x] -=
+        get_ref<upper>(A + batch_offset, row0 + y, col0 + x, N) -=
             sum_accumulator[y * tpg.x + x];
       }
     }
   }
 }
 
+template [[host_name("applySYRKU")]]
+kernel void applySYRK<true>(
+    device float* A [[buffer(0)]],
+    constant uint& N [[buffer(2)]],
+    constant uint& NB [[buffer(3)]],
+    constant uint& k [[buffer(4)]],
+    uint3 tid [[thread_position_in_threadgroup]],
+    uint3 tgid [[threadgroup_position_in_grid]],
+    uint3 tpg [[threads_per_threadgroup]],
+    uint sgitg [[simdgroup_index_in_threadgroup]]);
+
+template [[host_name("applySYRKL")]]
+kernel void applySYRK<false>(
+    device float* A [[buffer(0)]],
+    constant uint& N [[buffer(2)]],
+    constant uint& NB [[buffer(3)]],
+    constant uint& k [[buffer(4)]],
+    uint3 tid [[thread_position_in_threadgroup]],
+    uint3 tgid [[threadgroup_position_in_grid]],
+    uint3 tpg [[threads_per_threadgroup]],
+    uint sgitg [[simdgroup_index_in_threadgroup]]);
+
 kernel void applyPivots(
     device float* P [[buffer(0)]],
     device const int* pivots [[buffer(1)]],