[MPS] Migrate round unary op to Metal (pytorch#161712)

malfet · pytorchmergebot · commit 2042d2174acc · 2025-08-28T16:45:07.000Z
And actually use the right function, as [`torch.round`](https://docs.pytorch.org/docs/stable/generated/torch.round.html) doesn't use `std::round`, but rather `std::rint`, which can be easily seen by running something like ```python import torch print(torch.arange(-3., 3., step=.5, device='mps').round()) print(torch.arange(-3., 3., step=.5, device='mps').cpu().round()) ``` Before this change it printed ``` tensor([-3., -3., -2., -2., -1., -1., 0., 1., 1., 2., 2., 3.], device='mps:0') tensor([-3., -2., -2., -2., -1., -0., 0., 0., 1., 2., 2., 2.]) ``` But after this change results match Pull Request resolved: pytorch#161712 Approved by: https://github.com/dcci
diff --git a/aten/src/ATen/native/mps/kernels/UnaryKernel.metal b/aten/src/ATen/native/mps/kernels/UnaryKernel.metal
@@ -503,6 +503,17 @@ struct round_decimals_functor {
   }
 };
 
+struct round_functor {
+  template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
+  inline T operator()(const T x) {
+    return static_cast<T>(rint(float(x)));
+  }
+  template <typename T, enable_if_t<is_scalar_integral_v<T>, bool> = true>
+  inline T operator()(const T x) {
+    return x;
+  }
+};
+
 DEFINE_UNARY_FLOATING_FUNCTOR(erf);
 DEFINE_UNARY_FLOATING_FUNCTOR(erfc);
 DEFINE_UNARY_FLOATING_FUNCTOR(erfinv);
@@ -515,6 +526,13 @@ REGISTER_UNARY_OP(neg, char, char);
 REGISTER_UNARY_OP(neg, uchar, uchar);
 REGISTER_UNARY_OP(neg, float, float);
 REGISTER_UNARY_OP(neg, half, half);
+REGISTER_UNARY_OP(round, int, int);
+REGISTER_UNARY_OP(round, long, long);
+REGISTER_UNARY_OP(round, short, short);
+REGISTER_UNARY_OP(round, char, char);
+REGISTER_UNARY_OP(round, uchar, uchar);
+REGISTER_UNARY_OP(round, float, float);
+REGISTER_UNARY_OP(round, half, half);
 
 REGISTER_UNARY_OP(bitwise_not, int, int);
 REGISTER_UNARY_OP(bitwise_not, long, long);
@@ -558,6 +576,7 @@ REGISTER_UNARY_OP(abs, half, half);
 
 INSTANTIATE_UNARY_KERNELS2(bfloat, bfloat);
 REGISTER_UNARY_OP(neg, bfloat, bfloat);
+REGISTER_UNARY_OP(round, bfloat, bfloat);
 REGISTER_UNARY_OP(abs, bfloat, bfloat);
 INSTANTIATE_UNARY_KERNELS2(half, half);
 INSTANTIATE_UNARY_KERNELS2(float, float);
diff --git a/aten/src/ATen/native/mps/operations/UnaryKernel.mm b/aten/src/ATen/native/mps/operations/UnaryKernel.mm
@@ -50,6 +50,7 @@ static void round_decimals_kernel(TensorIteratorBase& iter, int64_t decimals) {
 REGISTER_UNARY_TI_DISPATCH(log);
 REGISTER_UNARY_TI_DISPATCH(log1p);
 REGISTER_UNARY_TI_DISPATCH(bitwise_not);
+REGISTER_UNARY_TI_DISPATCH(round);
 REGISTER_UNARY_TI_DISPATCH(sigmoid);
 REGISTER_DISPATCH(round_decimals_stub, round_decimals_kernel);
 } // namespace at::native
diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm
@@ -184,7 +184,6 @@ static void unary_op(const Tensor& self,
 
 REGISTER_MPS_UNARY_STUB(ceil, ceil);
 REGISTER_MPS_UNARY_STUB(floor, floor);
-REGISTER_MPS_UNARY_STUB(round, round);
 REGISTER_MPS_UNARY_STUB(trunc, truncate);
 
 #define CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(func_out, func_stub)                                         \
diff --git a/torch/_inductor/codegen/mps.py b/torch/_inductor/codegen/mps.py
@@ -366,7 +366,7 @@ def randint64(
 
     @staticmethod
     def round(x: CSEVariable) -> str:
-        return f"metal::round({x})"
+        return f"metal::rint({x})"
 
     @staticmethod
     def pow(a: CSEVariable, b: CSEVariable) -> str:
diff --git a/torch/testing/_internal/common_mps.py b/torch/testing/_internal/common_mps.py
@@ -439,9 +439,6 @@ def mps_ops_modifier(
                 torch.uint8,
                 torch.int8,
             ],
-            # round not working properly for float16 and bfloat16
-            "round": [torch.float16, torch.bfloat16],
-            "rounddecimals_0": [torch.bfloat16],
         }
 
         if MACOS_VERSION < 15.0:
@@ -725,8 +722,6 @@ def mps_ops_grad_modifier(ops: Sequence[OpInfo]) -> Sequence[OpInfo]:
             "signal.windows.kaiser": [torch.float32],
             "signal.windows.nuttall": [torch.float32],
             "eye": [torch.float16, torch.float32],
-            # round not working properly for float16
-            "round": [torch.float16],
             # topk fails with duplicate indices
             "topk": [torch.float16],
         }