
Commit f8b1ee9

Authored by chunhuanMeng, with co-authors Copilot and xytintel
Add Template Parameter to gpu_kernel for Controlling Broadcasting Vectorization (#1873)
This pull request updates the `gpu_kernel` function in `src/ATen/native/xpu/sycl/Loops.h` to introduce a new template parameter for finer control over broadcasting behavior. The change lets callers enable or disable broadcasting vectorization explicitly.

### Enhancements to `gpu_kernel`

* Added a new template parameter `enable_broadcast_vec` (defaulting to `true`) to `gpu_kernel`, allowing explicit control over broadcasting vectorization.
* Updated the recursive and implementation calls within `gpu_kernel` to pass `enable_broadcast_vec` through, ensuring consistent behavior during sub-iteration and in the implementation.

### Reason for the changes

The `enable_broadcast_vec` parameter addresses an issue with output offset calculation when the iterator (`iter`) is split. With broadcasting vectorization enabled, the code path taken during the computation can produce incorrect output offsets after the iterator has been split. Allowing explicit control over broadcasting vectorization means it can be disabled for split iterators, ensuring correct output offset calculations.

Resolves #1813

Co-authored-by: Copilot <[email protected]>
Co-authored-by: Yutao Xu <[email protected]>
1 parent 65d4902 commit f8b1ee9

File tree

2 files changed (+10, -3 lines)


src/ATen/native/xpu/sycl/Loops.h

Lines changed: 5 additions & 3 deletions
@@ -620,7 +620,7 @@ void gpu_kernel_nocast(TensorIteratorBase& iter, const func_t& f) {
   gpu_kernel_impl_nocast(iter, f);
 }

-template <typename func_t>
+template <typename func_t, bool enable_broadcast_vec = true>
 void gpu_kernel(TensorIteratorBase& iter, const func_t& f) {
   for (int arg = 0; arg < iter.ntensors(); arg++) {
     TORCH_INTERNAL_ASSERT(
@@ -637,12 +637,14 @@ void gpu_kernel(TensorIteratorBase& iter, const func_t& f) {

   if (!iter.can_use_32bit_indexing()) {
     for (auto& sub_iter : iter.with_32bit_indexing()) {
-      gpu_kernel(sub_iter, f);
+      // Broadcasting vectorization is disabled for sub-iterators to prevent
+      // potential output offset calculation issues.
+      gpu_kernel<func_t, false>(sub_iter, f);
     }
     return;
   }

-  gpu_kernel_impl(iter, f);
+  gpu_kernel_impl<func_t, enable_broadcast_vec>(iter, f);
 }

 template <typename arg1_t, typename arg2_t, typename return_t, typename func_t>

test/regressions/test_loops.py

Lines changed: 5 additions & 0 deletions
@@ -70,3 +70,8 @@ def test_loops_dynamic_cast(self):
         c = a + b + 1
         c_xpu = a_xpu + b_xpu + 1
         self.assertEqual(c, c_xpu.cpu())
+
+    def test_bc_vec_large_tensor(self):
+        raw_data = torch.rand(48, 64, 64, 64, 64)
+        a = raw_data.xpu().transpose(0, 1).contiguous().transpose(0, 1)
+        self.assertEqual(a, raw_data.xpu())
