[MPS] Reimplement tri[ul] as Metal shaders (pytorch#158867)

pytorchbot · malfet · tvukovic-amd · commit d00758893d54 · 2025-08-20T20:19:00.000+02:00
[MPS] Reimplement `tri[ul]` as Metal shaders (pytorch#157179). Also add an in-place flavor, as it is currently broken for non-contiguous tensors. Pull Request resolved: pytorch#157179. Approved by: https://github.com/dcci (cherry picked from commit a1e4f1f). Co-authored-by: Nikita Shulga <nikita.shulga@gmail.com>
diff --git a/aten/src/ATen/native/mps/kernels/TriangularOps.metal b/aten/src/ATen/native/mps/kernels/TriangularOps.metal
@@ -1,5 +1,119 @@
 #include <metal_stdlib>
+
 using namespace metal;
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ triu/tril ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// Keep-predicate shared by the triu/tril kernels. An element's diagonal index
+// is `col - row`; the element is retained when that index lies on the kept
+// side of diagonal `k`:
+//   upper == true  (triu): col - row >= k
+//   upper == false (tril): col - row <= k
+template <bool upper>
+inline bool triul_mask(int row, int col, int k) {
+  const int diag = col - row;
+  return upper ? diag >= k : diag <= k;
+}
+
+// Translates a grid position into a linear element offset.
+//
+// `strides` and `sizes` are indexed so that entry 0 pairs with pos.x and
+// entry 1 with pos.y; pos.z is a flattened index over entries 2..ndim-1 and is
+// unravelled with `sizes` when more than one such dimension exists.
+template <typename IndexType>
+inline IndexType compute_offs(
+    constant IndexType* strides,
+    constant uint* sizes,
+    uint3 pos,
+    int ndim) {
+  auto offs = pos.x * strides[0] + pos.y * strides[1];
+  if (ndim == 3) {
+    // Exactly one extra dimension: pos.z indexes it directly, no div/mod.
+    return offs + pos.z * strides[2];
+  }
+  // ndim < 3 skips the loop entirely; ndim >= 4 peels one dimension off the
+  // flattened index per iteration.
+  auto remaining = pos.z;
+  for (int dim = 2; dim < ndim; ++dim) {
+    offs += strides[dim] * (remaining % sizes[dim]);
+    remaining /= sizes[dim];
+  }
+  return offs;
+}
+
+// In-place triu/tril: zeroes every element of `self` that falls outside the
+// retained triangle and leaves all other elements untouched.
+// Each thread handles one element: pos.x is the column and pos.y the row fed
+// to triul_mask(); the full `pos` is turned into an offset by compute_offs().
+// k_ndim packs the diagonal offset (.x) and the tensor rank (.y).
+template <typename T, typename IndexType, bool upper>
+kernel void triul_inplace(
+    device T* self,
+    constant IndexType* strides,
+    constant uint* sizes,
+    constant int2& k_ndim,
+    uint3 pos [[thread_position_in_grid]]) {
+  const bool keep = triul_mask<upper>(pos.y, pos.x, k_ndim.x);
+  if (!keep) {
+    self[compute_offs(strides, sizes, pos, k_ndim.y)] = 0;
+  }
+}
+
+// Out-of-place triu/tril: copies the retained triangle of `inp` into `out`
+// and writes zero to every other element of `out`.
+// `out` and `inp` share `sizes` but carry independent strides, so their
+// offsets are computed separately. pos.x is the column and pos.y the row fed
+// to triul_mask(); k_ndim packs the diagonal offset (.x) and the rank (.y).
+template <typename T, typename IndexType, bool upper>
+kernel void triul(
+    device T* out,
+    device T* inp,
+    constant IndexType* out_strides,
+    constant IndexType* inp_strides,
+    constant uint* sizes,
+    constant int2& k_ndim,
+    uint3 pos [[thread_position_in_grid]]) {
+  const auto dst = compute_offs(out_strides, sizes, pos, k_ndim.y);
+  if (triul_mask<upper>(pos.y, pos.x, k_ndim.x)) {
+    // The input offset is only needed for retained elements.
+    out[dst] = inp[compute_offs(inp_strides, sizes, pos, k_ndim.y)];
+  } else {
+    out[dst] = 0;
+  }
+}
+
+// Explicitly instantiates the four kernels for one (DTYPE, IDX_TYPE) pair and
+// exports them under the host names
+//   triu_inplace_<IDX_TYPE>_<DTYPE>, tril_inplace_<IDX_TYPE>_<DTYPE>,
+//   triu_<IDX_TYPE>_<DTYPE>,         tril_<IDX_TYPE>_<DTYPE>.
+// The `upper` template argument is true for the triu variants and false for
+// the tril ones. The expansion ends without a semicolon; the invocation
+// supplies it.
+#define INSTANTIATE_TRIUL_KERNELS(DTYPE, IDX_TYPE)                         \
+  template [[host_name("triu_inplace_" #IDX_TYPE "_" #DTYPE)]] kernel void \
+  triul_inplace<DTYPE, IDX_TYPE, true>(                                    \
+      device DTYPE * self,                                                 \
+      constant IDX_TYPE * strides,                                         \
+      constant uint * sizes,                                               \
+      constant int2 & k_ndim,                                              \
+      uint3 pos [[thread_position_in_grid]]);                              \
+  template [[host_name("tril_inplace_" #IDX_TYPE "_" #DTYPE)]] kernel void \
+  triul_inplace<DTYPE, IDX_TYPE, false>(                                   \
+      device DTYPE * self,                                                 \
+      constant IDX_TYPE * strides,                                         \
+      constant uint * sizes,                                               \
+      constant int2 & k_ndim,                                              \
+      uint3 pos [[thread_position_in_grid]]);                              \
+  template [[host_name("triu_" #IDX_TYPE "_" #DTYPE)]] kernel void         \
+  triul<DTYPE, IDX_TYPE, true>(                                            \
+      device DTYPE * out,                                                  \
+      device DTYPE * inp,                                                  \
+      constant IDX_TYPE * out_strides,                                     \
+      constant IDX_TYPE * inp_strides,                                     \
+      constant uint * sizes,                                               \
+      constant int2 & k_ndim,                                              \
+      uint3 pos [[thread_position_in_grid]]);                              \
+  template [[host_name("tril_" #IDX_TYPE "_" #DTYPE)]] kernel void         \
+  triul<DTYPE, IDX_TYPE, false>(                                           \
+      device DTYPE * out,                                                  \
+      device DTYPE * inp,                                                  \
+      constant IDX_TYPE * out_strides,                                     \
+      constant IDX_TYPE * inp_strides,                                     \
+      constant uint * sizes,                                               \
+      constant int2 & k_ndim,                                              \
+      uint3 pos [[thread_position_in_grid]])
+
+// Floating-point element types. Every instantiation below uses `int` as the
+// index type, which is the only index type the kernels are built for here.
+INSTANTIATE_TRIUL_KERNELS(float, int);
+INSTANTIATE_TRIUL_KERNELS(half, int);
+// bfloat kernels are compiled only when __METAL_VERSION__ >= 310.
+#if __METAL_VERSION__ >= 310
+INSTANTIATE_TRIUL_KERNELS(bfloat, int);
+#endif
+
+// Two-component vector element types.
+// NOTE(review): presumably these back complex dtypes; confirm against the
+// host-side scalar-type-to-Metal-name mapping.
+INSTANTIATE_TRIUL_KERNELS(float2, int);
+INSTANTIATE_TRIUL_KERNELS(half2, int);
+
+// Integer and boolean element types.
+INSTANTIATE_TRIUL_KERNELS(long, int);
+INSTANTIATE_TRIUL_KERNELS(int, int);
+INSTANTIATE_TRIUL_KERNELS(short, int);
+INSTANTIATE_TRIUL_KERNELS(char, int);
+INSTANTIATE_TRIUL_KERNELS(uchar, int);
+INSTANTIATE_TRIUL_KERNELS(bool, int);
+
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ triangle ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 // To find the max integer that does not exceed the root of an int64_t variable,
diff --git a/aten/src/ATen/native/mps/operations/TriangularOps.mm b/aten/src/ATen/native/mps/operations/TriangularOps.mm
@@ -5,6 +5,7 @@
 #include <ATen/native/LinearAlgebraUtils.h>
 #include <ATen/native/TensorFactories.h>
 #include <ATen/native/mps/OperationUtils.h>
+#include <fmt/format.h>
 
 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/Functions.h>
@@ -26,101 +27,53 @@
 #include <ATen/native/mps/TriangularOps_metallib.h>
 #endif
 
-TORCH_IMPL_FUNC(triu_mps_out)
-(const Tensor& self, int64_t k, const Tensor& output) {
-  using namespace mps;
-  using CachedGraph = MPSUnaryCachedGraph;
-
-  if (self.numel() == 0) {
-    return;
-  }
-  auto stream = getCurrentMPSStream();
-
-  @autoreleasepool {
-    std::string key = "triu_mps_out" + mps::getTensorsStringKey({self}) + ":" + std::to_string(k);
-    auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
-      MPSGraphTensor* outputTensor = nil;
-      auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
-
-      auto minusOneTensor = [mpsGraph constantWithScalar:-1 dataType:MPSDataTypeInt32];
-
-      if (k > 0) {
-        auto diagMinusOneTensor = [mpsGraph constantWithScalar:(k - 1) dataType:MPSDataTypeInt32];
-        auto onesTensor = [mpsGraph constantWithScalar:1 shape:inputTensor.shape dataType:MPSDataTypeInt32];
-        auto maskTensor = [mpsGraph bandPartWithTensor:onesTensor
-                                        numLowerTensor:minusOneTensor
-                                        numUpperTensor:diagMinusOneTensor
-                                                  name:nil];
-        outputTensor = [mpsGraph selectWithPredicateTensor:maskTensor
-                                       truePredicateTensor:[mpsGraph constantWithScalar:0 dataType:inputTensor.dataType]
-                                      falsePredicateTensor:inputTensor
-                                                      name:nil];
-      } else {
-        auto minusDiagTensor = [mpsGraph constantWithScalar:(-k) dataType:MPSDataTypeInt32];
-        outputTensor = [mpsGraph bandPartWithTensor:inputTensor
-                                     numLowerTensor:minusDiagTensor
-                                     numUpperTensor:minusOneTensor
-                                               name:nil];
-      }
-
-      newCachedGraph->inputTensor_ = inputTensor;
-      newCachedGraph->outputTensor_ = outputTensor;
-    });
-
-    auto selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self);
-    auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output);
-    runMPSGraph(stream, cachedGraph->graph(), dictionaryFromPlaceholders(selfPlaceholder), outputPlaceholder);
+// Returns the elements of `arr` in reverse order, each implicitly converted
+// from int64_t to T. No range check is performed on the conversion.
+template <typename T>
+static std::vector<T> reverse_array(const IntArrayRef& arr) {
+  const auto count = arr.size();
+  std::vector<T> reversed(count);
+  for (const auto idx : c10::irange(count)) {
+    reversed[count - 1 - idx] = arr[idx];
   }
+  return reversed;
 }
 
-TORCH_IMPL_FUNC(tril_mps_out)
-(const Tensor& self, int64_t k, const Tensor& output) {
+// Shared launcher behind triu_mps_out / tril_mps_out.
+//   self - input tensor; nothing is launched when it has no elements
+//   k    - diagonal offset; values outside [-rows, cols] are clamped, which does not change the result
+//   out  - destination; when it is the same tensor as `self` the "_inplace" kernel variant is used
+//   name - kernel family prefix, "triu" or "tril"
+// Raises (TORCH_CHECK) if either tensor cannot be addressed with the 32-bit indices the kernels use.
+static void triu_tril_impl(const Tensor& self, int64_t k, const Tensor& out, const std::string& name) {
   using namespace mps;
-  using CachedGraph = MPSUnaryCachedGraph;
-
   if (self.numel() == 0) {
     return;
   }
-
+  // Only "int"-indexed kernels are requested below, and sizes/strides are narrowed to 32 bits.
+  // Refuse tensors whose extents or farthest element offset would be truncated by that narrowing
+  // instead of silently reading/writing the wrong elements.
+  const auto fits_int32 = [](const Tensor& t) {
+    int64_t max_offs = 0;
+    for (const auto d : c10::irange(t.dim())) {
+      if (t.size(d) > INT32_MAX) {
+        return false;
+      }
+      // numel() != 0 was checked above, so every size is at least 1 here.
+      max_offs += (t.size(d) - 1) * t.stride(d);
+      if (max_offs > INT32_MAX) {
+        return false;
+      }
+    }
+    return true;
+  };
+  TORCH_CHECK(fits_int32(self) && fits_int32(out), name, ": tensors that require 64-bit indexing are not supported");
+  // The kernels index dimensions last-dimension first, hence the reversal.
+  auto sizes = reverse_array<uint32_t>(self.sizes());
+  auto inp_strides = reverse_array<int32_t>(self.strides());
+  auto out_strides = reverse_array<int32_t>(out.strides());
+  // col - row always lies in (-rows, cols), so any k outside [-rows, cols] selects the same elements
+  // as the nearest bound. Clamping first keeps the narrowing to int from wrapping (e.g. k = 2^32 -> 0).
+  const int64_t k_clamped = std::clamp<int64_t>(k, -self.size(-2), self.size(-1));
+  std::array<int, 2> k_ndim = {int(k_clamped), int(self.ndimension())};
+  const bool inplace = self.is_same(out);
+  const auto kernel_name =
+      fmt::format("{}{}_{}_{}", name, inplace ? "_inplace" : "", "int", scalarToMetalTypeString(self));
+  auto triuPSO = lib.getPipelineStateForFunc(kernel_name);
+  uint32_t max_threads_per_group = [triuPSO maxTotalThreadsPerThreadgroup];
+  // Number of matrices in the batch. The product of two uint32 extents can exceed 32 bits, so widen first.
+  const int64_t batch_size = self.numel() / (int64_t(sizes[0]) * int64_t(sizes[1]));
   auto stream = getCurrentMPSStream();
-
-  @autoreleasepool {
-    std::string key = "tril_mps_out" + mps::getTensorsStringKey({self}) + ":" + std::to_string(k);
-    auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
-      MPSGraphTensor* outputTensor = nil;
-
-      auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
-      auto minusOneTensor = [mpsGraph constantWithScalar:-1 dataType:MPSDataTypeInt32];
-
-      if (k >= 0) {
-        auto diagTensor = [mpsGraph constantWithScalar:k dataType:MPSDataTypeInt32];
-        outputTensor = [mpsGraph bandPartWithTensor:inputTensor
-                                     numLowerTensor:minusOneTensor
-                                     numUpperTensor:diagTensor
-                                               name:nil];
+  dispatch_sync_with_rethrow(stream->queue(), ^() {
+    @autoreleasepool {
+      auto computeEncoder = stream->commandEncoder();
+      [computeEncoder setComputePipelineState:triuPSO];
+      // Argument order must match the kernel signatures: the in-place kernel takes a single buffer and
+      // a single stride array, the out-of-place kernel takes output first, then input.
+      if (inplace) {
+        mtl_setArgs(computeEncoder, self, inp_strides, sizes, k_ndim);
       } else {
-        auto negDiagMinusOneTensor = [mpsGraph constantWithScalar:(-k - 1) dataType:MPSDataTypeInt32];
-        auto complementTensor = [mpsGraph bandPartWithTensor:inputTensor
-                                              numLowerTensor:negDiagMinusOneTensor
-                                              numUpperTensor:minusOneTensor
-                                                        name:nil];
-        auto zeroTensor = [mpsGraph constantWithScalar:0.0 dataType:getMPSDataType(self)];
-        auto mask = [mpsGraph equalWithPrimaryTensor:complementTensor secondaryTensor:zeroTensor name:nil];
-        outputTensor = [mpsGraph selectWithPredicateTensor:mask
-                                       truePredicateTensor:inputTensor
-                                      falsePredicateTensor:zeroTensor
-                                                      name:nil];
+        mtl_setArgs(computeEncoder, out, self, out_strides, inp_strides, sizes, k_ndim);
       }
+      // Grid: x = columns (last dim), y = rows (second-to-last dim), z = flattened batch.
+      [computeEncoder dispatchThreads:MTLSizeMake(sizes[0], sizes[1], batch_size)
+                threadsPerThreadgroup:MTLSizeMake(std::min(max_threads_per_group, sizes[0]), 1, 1)];
+    }
+  });
+}
 
-      newCachedGraph->inputTensor_ = inputTensor;
-      newCachedGraph->outputTensor_ = outputTensor;
-    });
-
-    auto selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self);
-    auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output);
+// TORCH_IMPL_FUNC entry point for triu on MPS: forwards to the shared launcher,
+// selecting the "triu" kernel family by name.
+TORCH_IMPL_FUNC(triu_mps_out)
+(const Tensor& self, int64_t k, const Tensor& output) {
+  triu_tril_impl(self, k, output, "triu");
+}
 
-    runMPSGraph(stream, cachedGraph->graph(), dictionaryFromPlaceholders(selfPlaceholder), outputPlaceholder);
-  }
+// TORCH_IMPL_FUNC entry point for tril on MPS: forwards to the shared launcher,
+// selecting the "tril" kernel family by name.
+TORCH_IMPL_FUNC(tril_mps_out)
+(const Tensor& self, int64_t k, const Tensor& output) {
+  triu_tril_impl(self, k, output, "tril");
 }
 
 Tensor tril_indices_mps(int64_t row,
diff --git a/test/test_mps.py b/test/test_mps.py
@@ -7146,6 +7146,11 @@ def helper(shape, diag=0):
         helper((2, 8, 4, 5), diag=-1)
         helper((2, 8, 4, 5), diag=-2)
         helper((2, 8, 4, 5), diag=-3)
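+        # Test non-contiguous (transposed) input. NOTE(review): this calls out-of-place triu(); triu_() looks necessary to exercise the in-place kernel - confirm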
+        x_mps = torch.arange(9.0, device='mps').reshape(3, 3).t().triu()
+        x_cpu = torch.arange(9.0, device='cpu').reshape(3, 3).t().triu()
+        self.assertEqual(x_cpu, x_mps)
+        self.assertEqual(x_cpu.stride(), x_mps.stride())
 
     # Test inverse
     def test_inverse(self):
diff --git a/torch/testing/_internal/common_mps.py b/torch/testing/_internal/common_mps.py
@@ -157,6 +157,8 @@ def mps_ops_modifier(
             "tensor_split",
             "transpose",
             "transpose_copy",
+            "tril",
+            "triu",
             "true_divide",
             "T",
             "unbind",
@@ -283,8 +285,6 @@ def mps_ops_modifier(
             "trace",
             "trapz",
             "trapezoid",
-            "tril",
-            "triu",
             "vstack",
             "where",
             "byte",