[MPS] Fix unary/binary ops for 2**32+ elem tensors (pytorch#155183)

malfet · pytorchmergebot · commit dd41a3907cde · 2025-06-05T18:57:14.000Z
By using the `TensorIterator::with_32bit_indexing()` primitive. Add a `bind_tensors` helper function that correctly sets up MPS tensors originating from a TensorIterator. TODO: Add comments to bind_tensors as well as a unit test, based on ``` python -c "import torch;print((torch.rand(1, 1024, 1024, dtype=torch.bfloat16, device='mps') + torch.rand(5000, 1, 1, dtype=torch.bfloat16, device='mps')).sin())" ``` Fixes pytorch#154828 Pull Request resolved: pytorch#155183 Approved by: https://github.com/cyyever, https://github.com/dcci, https://github.com/Skylion007 ghstack dependencies: pytorch#155150, pytorch#155178, pytorch#155184
diff --git a/aten/src/ATen/native/mps/MetalShaderLibrary.h b/aten/src/ATen/native/mps/MetalShaderLibrary.h
@@ -154,6 +154,7 @@ class MetalShaderLibrary {
       MTLLibrary_t lib,
       const std::string& fname);
   MTLLibrary_t compileLibrary(const std::string& src);
+  void bind_tensors(MTLComputeCommandEncoder_t, TensorIteratorBase&);
   std::string shaderSource;
   unsigned nparams;
   MTLCompileOptions* compile_options;
diff --git a/aten/src/ATen/native/mps/OperationUtils.mm b/aten/src/ATen/native/mps/OperationUtils.mm
@@ -971,10 +971,34 @@ static dispatch_data_t getSectionData(const std::string& name) {
   }
 };
 
+void MetalShaderLibrary::bind_tensors(id<MTLComputeCommandEncoder> encoder, TensorIteratorBase& iter) {
+  for (auto idx : c10::irange(iter.ntensors())) {  // bind every iterator operand at the argument index equal to its position
+    auto& t = iter.tensor_base(idx);
+    // Handle CPU scalars: operands whose device is CPU are bound through mtl_setBuffer instead of an MTLBuffer offset
+    if (C10_UNLIKELY(t.device().type() == kCPU)) {
+      mtl_setBuffer(encoder, t, idx);
+      continue;
+    }
+    // At the moment, MPS storage data is not the real GPU pointer, but rather a pointer to an id<MTLBuffer> object,
+    // while TensorIterator constructs data_ptr as if the base were just a raw pointer.
+    // Work around this by computing the byte offset of the iterator's data_ptr from the start of the storage,
+    // which works for both tensor views and sliced 64-bit iterators
+    auto offs = reinterpret_cast<size_t>(iter.data_ptr(idx)) - reinterpret_cast<size_t>(t.storage().data());
+    [encoder setBuffer:getMTLBufferStorage(t) offset:offs atIndex:idx];
+  }
+}
+
 void MetalShaderLibrary::exec_unary_kernel(TensorIteratorBase& iter,
                                            const std::string& name,
                                            std::optional<int64_t> extra) {
-  TORCH_CHECK(iter.can_use_32bit_indexing(), name, " can't be indexed using 32-bit iterator for shape ", iter.shape());
+  // Decompose 64-bit tensor into 32-bit ones
+  if (!iter.can_use_32bit_indexing()) {
+    for (auto&& sub_iter : iter.with_32bit_indexing()) {
+      exec_unary_kernel(sub_iter, name, extra);
+    }
+    return;
+  }
+
   auto inputTensor = iter.input(0);
   auto outputTensor = iter.output(0);
   uint32_t length = iter.numel();
@@ -997,7 +1021,7 @@ static dispatch_data_t getSectionData(const std::string& name) {
       getMPSProfiler().beginProfileKernel(cplState, name, {inputTensor});
 
       [computeEncoder setComputePipelineState:cplState];
-      mtl_setArgs(computeEncoder, outputTensor, inputTensor);
+      bind_tensors(computeEncoder, iter);
       if (!iter.is_contiguous()) {
         mtl_setArgs<2>(computeEncoder,
                        outputTensor.sizes(),
@@ -1022,13 +1046,20 @@ static dispatch_data_t getSectionData(const std::string& name) {
   // Right now running something like 1.0-torch.rand(5, device='mps') will create iterator with
   // double as common dtype (because Python floating point are always 64-bit values)
   TORCH_CHECK(iter.output().scalar_type() != at::kDouble, "float64 is not supported on MPS");
-  TORCH_CHECK(iter.can_use_32bit_indexing(), name, " can't be indexed using 32-bit iterator for shape ", iter.shape());
 
   // Skip for empty iterators
   if (iter.numel() == 0) {
     return;
   }
 
+  // Decompose 64-bit tensor into 32-bit ones
+  if (!iter.can_use_32bit_indexing()) {
+    for (auto&& sub_iter : iter.with_32bit_indexing()) {
+      exec_binary_kernel(sub_iter, name, alpha);
+    }
+    return;
+  }
+
   auto convert_double_scalar = [](Tensor& t) {
     if (t.dim() != 0) {
       return;
@@ -1062,7 +1093,7 @@ static dispatch_data_t getSectionData(const std::string& name) {
       getMPSProfiler().beginProfileKernel(binaryPSO, kernel_name, {input, other});
       [computeEncoder setComputePipelineState:binaryPSO];
       // Set input and output tensors
-      mtl_setArgs(computeEncoder, out, input, other);
+      bind_tensors(computeEncoder, iter);
       // Iterator is contiguous if all of its elements are dense in storage,
       // i.e. it's true for both row-first and column-first tensors
       if (iter.is_contiguous()) {
diff --git a/test/test_mps.py b/test/test_mps.py
@@ -7955,6 +7955,20 @@ def test_inplace_bitwise_not(self, dtype):
             x[::2].bitwise_not_()
         self.assertEqual(x_mps.cpu(), x_cpu)
 
+
+class TestLargeTensors(TestCaseMPS):
+    def test_64bit_binops(self):  # broadcast add followed by sin on a result with more than 2**32 elements
+        if torch.mps.recommended_max_memory() < 16_000_000_000:  # the float16 result alone is 5000*1024*1024*2 bytes (~10.5 GB)
+            raise unittest.SkipTest("Needs at least 16Gb of RAM")
+        a = torch.rand(1, 1024, 1024, dtype=torch.float16, device='mps')
+        b = torch.rand(5000, 1, 1, dtype=torch.float16, device='mps')
+        rc = (a + b).sin()  # broadcasts to shape (5000, 1024, 1024)
+        slice_idx = -2  # only the last two slices along dim 0 are compared against a CPU reference
+        rc_slice = rc[slice_idx:]
+        rc_slice_cpu = (a.cpu() + b.cpu()[slice_idx:]).sin()  # CPU reference computed only for the compared slices
+        self.assertEqual(rc_slice, rc_slice_cpu)
+
+
 class TestLogical(TestCaseMPS):
     def _wrap_tensor(self, x, device="cpu", dtype=None, requires_grad=False):
         return torch.tensor(x, device=device, dtype=dtype, requires_grad=requires_grad)