Commit e63476b

nakuliyer authored and pytorchmergebot committed
[MTIA Runtime] Add foreach_div ops to native_functions.yaml (pytorch#162732)
Summary: Quick fix for runtime support on foreach_div, see D81274963. Fixed an issue that I created in that diff so that the CIs pass.

Test Plan: CIs created in D81274963 and D81286593 pass. Added some logs in [aten_mtia_ops.py](https://www.internalfb.com/code/fbsource/[c56272ba042c43c65517dcac254364cf732fcfa9]/fbcode/mtia/host_runtime/torch_mtia/aten_mtia_ops.cpp?lines=3676) to all the foreach_div ops. We can see that the correct MTIA kernels are being invoked in the tests. https://www.internalfb.com/intern/testinfra/testrun/15481123829281588

Rollback Plan:

Pull Request resolved: pytorch#162732
Approved by: https://github.com/danielhou0515
1 parent 4f641aa commit e63476b

File tree

1 file changed (+4, -0 lines)


aten/src/ATen/native/native_functions.yaml

Lines changed: 4 additions & 0 deletions
@@ -10699,13 +10699,15 @@
   dispatch:
     CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow
     CUDA: foreach_tensor_div_list_kernel_cuda
+    MTIA: foreach_tensor_div_list_kernel_mtia
 
 - func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> ()
   device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
   variants: function
   dispatch:
     CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow_
     CUDA: foreach_tensor_div_list_kernel_cuda_
+    MTIA: foreach_tensor_div_list_kernel_mtia_
   autogen: _foreach_div.List_out
 
 - func: _foreach_div.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
@@ -10729,13 +10731,15 @@
   dispatch:
     CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow
     CUDA: foreach_tensor_div_tensor_kernel_cuda
+    MTIA: foreach_tensor_div_tensor_kernel_mtia
 
 - func: _foreach_div_.Tensor(Tensor(a!)[] self, Tensor other) -> ()
   device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
   variants: function
   dispatch:
     CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow_
     CUDA: foreach_tensor_div_tensor_kernel_cuda_
+    MTIA: foreach_tensor_div_tensor_kernel_mtia_
   autogen: _foreach_div.Tensor_out
 
 - func: _foreach_clamp_max.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]

0 commit comments
