
Commit 5e29831

cfgfungtoyxu and Yutao Xu authored
Add aten::foreach_sub and its variants (#1034)
Add the following operators:

- foreach_sub.List
- foreach_sub.Scalar
- foreach_sub.ScalarList
- foreach_sub_.List
- foreach_sub_.Scalar
- foreach_sub_.ScalarList

Co-authored-by: Yutao Xu <[email protected]>
1 parent dfc3461 commit 5e29831

29 files changed, +200 -62 lines changed
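For orientation (an added illustration, not part of this commit): the six variants listed in the commit message correspond to the `at::_foreach_sub` / `at::_foreach_sub_` overloads that ATen generates from these schemas. A minimal sketch, assuming a libtorch build; the XPU kernels registered here are only reached when the tensor lists live on an XPU device (which requires an XPU-enabled PyTorch build), otherwise dispatch goes to another backend:

```cpp
#include <ATen/ATen.h>

#include <vector>

int main() {
  // Two small tensor lists on the default CPU device. Move them to an XPU
  // device (XPU-enabled build required) to reach the kernels added here.
  std::vector<at::Tensor> a{at::full({2}, 5.0), at::full({2}, 3.0)};
  std::vector<at::Tensor> b{at::ones({2}), at::ones({2})};
  std::vector<at::Scalar> s{1.0, 2.0};

  auto r_list = at::_foreach_sub(a, b, /*alpha=*/2);    // _foreach_sub.List
  auto r_scalar = at::_foreach_sub(a, at::Scalar(1.5)); // _foreach_sub.Scalar
  auto r_slist = at::_foreach_sub(a, s);                // _foreach_sub.ScalarList

  at::_foreach_sub_(a, b, /*alpha=*/2);                 // _foreach_sub_.List (in-place)
  at::_foreach_sub_(a, at::Scalar(1.5));                // _foreach_sub_.Scalar (in-place)
  at::_foreach_sub_(a, s);                              // _foreach_sub_.ScalarList (in-place)
  return 0;
}
```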

src/ATen/native/xpu/AiryAi.cpp

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
-#include <ATen/native/UnaryOps.h>
 #include <ATen/native/DispatchStub.h>
 #include <ATen/native/TensorIterator.h>
+#include <ATen/native/UnaryOps.h>
 #include <ATen/native/xpu/sycl/AiryAiKernel.h>
 
 namespace at {

src/ATen/native/xpu/BinaryOps.cpp

Lines changed: 1 addition & 1 deletion
@@ -17,9 +17,9 @@
 #include <ATen/native/xpu/sycl/ChebyshevPolynomialKernels.h>
 #include <ATen/native/xpu/sycl/CopysignKernel.h>
 #include <ATen/native/xpu/sycl/GcdLcmKernels.h>
-#include <ATen/native/xpu/sycl/IGammaKernel.h>
 #include <ATen/native/xpu/sycl/HermitePolynomialHKernel.h>
 #include <ATen/native/xpu/sycl/HermitePolynomialHeKernel.h>
+#include <ATen/native/xpu/sycl/IGammaKernel.h>
 #include <ATen/native/xpu/sycl/LaguerrePolynomialLKernel.h>
 #include <ATen/native/xpu/sycl/LegendrePolynomialPKernel.h>
 #include <ATen/native/xpu/sycl/LogAddExpKernels.h>

src/ATen/native/xpu/Embedding.cpp

Lines changed: 0 additions & 1 deletion
@@ -35,6 +35,5 @@ Tensor& embedding_renorm_xpu_(
       self, indices, max_norm, norm_type);
 }
 
-
 } // namespace native
 } // namespace at

src/ATen/native/xpu/ForeachOpList.cpp

Lines changed: 9 additions & 9 deletions
@@ -4,17 +4,17 @@
 #include <ATen/ops/_foreach_addcmul_native.h>
 #include <ATen/ops/_foreach_clamp_max_native.h>
 #include <ATen/ops/_foreach_clamp_min_native.h>
+#include <ATen/ops/_foreach_copy_native.h>
 #include <ATen/ops/_foreach_div_native.h>
 #include <ATen/ops/_foreach_lerp_native.h>
 #include <ATen/ops/_foreach_mul_native.h>
-#include <ATen/ops/_foreach_clamp_min_native.h>
-#include <ATen/ops/_foreach_copy_native.h>
 #include <ATen/ops/_foreach_pow_native.h>
+#include <ATen/ops/_foreach_sub_native.h>
 
 #include <ATen/native/xpu/sycl/ForeachBinaryOpListKernels.h>
+#include <ATen/native/xpu/sycl/ForeachCopyKernels.h>
 #include <ATen/native/xpu/sycl/ForeachPointwiseOpListKernels.h>
 #include <ATen/native/xpu/sycl/ForeachTernaryOpListKernels.h>
-#include <ATen/native/xpu/sycl/ForeachCopyKernels.h>
 
 #include <ATen/ops/empty_like.h>
 
@@ -68,6 +68,7 @@ namespace native {
 }
 
 FOREACH_BINARY_OP_LIST_ALPHA(add);
+FOREACH_BINARY_OP_LIST_ALPHA(sub);
 FOREACH_BINARY_OP_LIST(mul, false);
 FOREACH_BINARY_OP_LIST(div, true);
 FOREACH_BINARY_OP_LIST(clamp_max, true);
@@ -154,12 +155,11 @@ void foreach_tensor_copy_list_kernel_xpu_(
     TensorList self,
     TensorList src,
     bool non_blocking) {
-  check_foreach_api_restrictions(self, src);
-  if (!can_use_fast_route(
-          self, src, /* does_op_promote_integer_inputs_to_float */ false)) {
-    return foreach_tensor_copy_list_kernel_slow_(
-        self, src, non_blocking);
-  }
+  check_foreach_api_restrictions(self, src);
+  if (!can_use_fast_route(
+          self, src, /* does_op_promote_integer_inputs_to_float */ false)) {
+    return foreach_tensor_copy_list_kernel_slow_(self, src, non_blocking);
+  }
 
   xpu::foreach_copy_list_kernel_(self, src);
 

src/ATen/native/xpu/ForeachOpScalar.cpp

Lines changed: 28 additions & 0 deletions
@@ -1,3 +1,4 @@
+#include <ATen/native/BinaryOps.h>
 #include <ATen/native/ForeachUtils.h>
 #include <ATen/ops/_foreach_add_native.h>
 #include <ATen/ops/_foreach_addcdiv_native.h>
@@ -8,6 +9,7 @@
 #include <ATen/ops/_foreach_lerp_native.h>
 #include <ATen/ops/_foreach_mul_native.h>
 #include <ATen/ops/_foreach_pow_native.h>
+#include <ATen/ops/_foreach_sub_native.h>
 
 #include <ATen/native/xpu/sycl/ForeachBinaryOpScalarKernels.h>
 #include <ATen/native/xpu/sycl/ForeachPointwiseOpScalarKernels.h>
@@ -37,7 +39,33 @@ namespace native {
     return xpu::FOREACH_BINARY_SCALAR_KERNEL_NAME(NAME)(tensors, scalar); \
   }
 
+// In the case of subtraction, we dont allow scalar to be boolean following the
+// torch.sub logic
+#define FOREACH_BINARY_OP_SCALAR_NO_BOOLEAN(NAME, DIV_OP) \
+  void foreach_tensor_##NAME##_scalar_kernel_xpu_( \
+      TensorList tensors, const Scalar& scalar) { \
+    check_foreach_api_restrictions(tensors); \
+    sub_check(tensors[0], scalar); \
+    if (!can_use_fast_route(tensors, scalar, DIV_OP)) { \
+      return foreach_tensor_##NAME##_scalar_kernel_slow_(tensors, scalar); \
+    } \
+ \
+    xpu::FOREACH_BINARY_SCALAR_INPLACE_KERNEL_NAME(NAME)(tensors, scalar); \
+  } \
+ \
+  std::vector<Tensor> foreach_tensor_##NAME##_scalar_kernel_xpu( \
+      TensorList tensors, const Scalar& scalar) { \
+    check_foreach_api_restrictions(tensors); \
+    sub_check(tensors[0], scalar); \
+    if (!can_use_fast_route(tensors, scalar, DIV_OP)) { \
+      return foreach_tensor_##NAME##_scalar_kernel_slow(tensors, scalar); \
+    } \
+ \
+    return xpu::FOREACH_BINARY_SCALAR_KERNEL_NAME(NAME)(tensors, scalar); \
+  }
+
 FOREACH_BINARY_OP_SCALAR(add, /*div_op*/ false);
+FOREACH_BINARY_OP_SCALAR_NO_BOOLEAN(sub, /*div_op*/ false);
 FOREACH_BINARY_OP_SCALAR(mul, /*div_op*/ false);
 FOREACH_BINARY_OP_SCALAR(div, /*div_op*/ true);
 FOREACH_BINARY_OP_SCALAR(clamp_max, /*div_op*/ true);
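The `sub_check` calls in the new `FOREACH_BINARY_OP_SCALAR_NO_BOOLEAN` macro mirror `torch.sub`, which refuses boolean operands. A small sketch of the expected behavior (my illustration, not code from this commit; the check applies on any backend, so it runs wherever the tensors live):

```cpp
#include <ATen/ATen.h>
#include <c10/util/Exception.h>

#include <iostream>
#include <vector>

int main() {
  // Numeric inputs go through as usual.
  std::vector<at::Tensor> ok{at::ones({2})};
  at::_foreach_sub_(ok, at::Scalar(1));

  // A boolean tensor with a boolean scalar is rejected, matching torch.sub,
  // which points users to ^ / logical_xor() for bool masks instead.
  std::vector<at::Tensor> masks{at::ones({2}, at::kBool)};
  try {
    at::_foreach_sub_(masks, at::Scalar(true));
  } catch (const c10::Error& e) {
    std::cout << "rejected as expected: " << e.what() << "\n";
  }
  return 0;
}
```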

src/ATen/native/xpu/ForeachOpScalarList.cpp

Lines changed: 35 additions & 0 deletions
@@ -1,3 +1,4 @@
+#include <ATen/native/BinaryOps.h>
 #include <ATen/native/ForeachUtils.h>
 #include <ATen/ops/_foreach_add_native.h>
 #include <ATen/ops/_foreach_addcdiv_native.h>
@@ -7,6 +8,7 @@
 #include <ATen/ops/_foreach_div_native.h>
 #include <ATen/ops/_foreach_mul_native.h>
 #include <ATen/ops/_foreach_pow_native.h>
+#include <ATen/ops/_foreach_sub_native.h>
 
 #include <ATen/native/xpu/sycl/ForeachBinaryOpScalarListKernels.h>
 #include <ATen/native/xpu/sycl/ForeachPointwiseOpScalarListKernels.h>
@@ -40,6 +42,39 @@ namespace native {
     return xpu::FOREACH_BINARY_SCALARLIST_KERNEL_NAME(NAME)(tensors, scalars); \
   }
 
+// This does not use FOREACH_BINARY_OP_SCALARLIST because
+// In the case of subtraction, we dont allow scalar to be boolean following the
+// torch.sub logic
+void foreach_tensor_sub_scalarlist_kernel_xpu_(
+    TensorList tensors,
+    at::ArrayRef<Scalar> scalars) {
+  check_foreach_api_restrictions(tensors, scalars);
+  for (const auto i : c10::irange(tensors.size())) {
+    sub_check(tensors[i], scalars[i]);
+  }
+
+  if (!can_use_fast_route({tensors}, scalars, false)) {
+    return foreach_tensor_sub_scalarlist_kernel_slow_(tensors, scalars);
+  }
+
+  xpu::FOREACH_BINARY_SCALARLIST_INPLACE_KERNEL_NAME(sub)(tensors, scalars);
+}
+
+std::vector<Tensor> foreach_tensor_sub_scalarlist_kernel_xpu(
+    TensorList tensors,
+    at::ArrayRef<Scalar> scalars) {
+  check_foreach_api_restrictions(tensors, scalars);
+  for (const auto i : c10::irange(tensors.size())) {
+    sub_check(tensors[i], scalars[i]);
+  }
+
+  if (!can_use_fast_route({tensors}, scalars, false)) {
+    return foreach_tensor_sub_scalarlist_kernel_slow(tensors, scalars);
+  }
+
+  return xpu::FOREACH_BINARY_SCALARLIST_KERNEL_NAME(sub)(tensors, scalars);
+}
+
 FOREACH_BINARY_OP_SCALARLIST(add, /*div_op*/ false);
 FOREACH_BINARY_OP_SCALARLIST(mul, /*div_op*/ false);
 FOREACH_BINARY_OP_SCALARLIST(div, /*div_op*/ true);
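Whichever path a call takes above (the fused fast route, or the per-tensor slow fallback when `can_use_fast_route` says no), the result is meant to match subtracting each scalar from its tensor one by one. A quick self-check sketch (my illustration, not part of the commit), using the ScalarList variant:

```cpp
#include <ATen/ATen.h>

#include <cassert>
#include <vector>

int main() {
  std::vector<at::Tensor> ts{at::rand({3}), at::rand({2, 2})};
  std::vector<at::Scalar> ss{0.5, 2.0};

  // One foreach call: fast route when the list qualifies, slow path otherwise.
  auto fused = at::_foreach_sub(ts, ss);

  // Either way it should agree with subtracting pair by pair.
  for (size_t i = 0; i < ts.size(); ++i) {
    assert(at::allclose(fused[i], at::sub(ts[i], ss[i])));
  }
  return 0;
}
```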

src/ATen/native/xpu/UpSampleTrilinear3d.cpp

Lines changed: 2 additions & 1 deletion
@@ -28,7 +28,8 @@ TORCH_IMPL_FUNC(upsample_trilinear3d_backward_out_xpu)
     std::optional<double> scales_h,
     std::optional<double> scales_w,
     const Tensor& grad_input) {
-  globalContext().alertNotDeterministic("upsample_trilinear3d_backward_out_xpu");
+  globalContext().alertNotDeterministic(
+      "upsample_trilinear3d_backward_out_xpu");
   xpu::upsample_trilinear3d_backward_out_kernel(
       grad_input,
       grad_output,

src/ATen/native/xpu/sycl/ChebyshevPolynomialTKernel.cpp

Lines changed: 1 addition & 1 deletion
@@ -1,8 +1,8 @@
 #include <ATen/ATen.h>
 #include <ATen/native/Math.h>
 #include <ATen/native/TensorIterator.h>
-#include <ATen/native/xpu/sycl/Loops.h>
 #include <ATen/native/xpu/sycl/ChebyshevPolynomialKernels.h>
+#include <ATen/native/xpu/sycl/Loops.h>
 
 namespace at::native::xpu {

src/ATen/native/xpu/sycl/ChebyshevPolynomialUKernel.cpp

Lines changed: 1 addition & 1 deletion
@@ -1,8 +1,8 @@
 #include <ATen/ATen.h>
 #include <ATen/native/Math.h>
 #include <ATen/native/TensorIterator.h>
-#include <ATen/native/xpu/sycl/Loops.h>
 #include <ATen/native/xpu/sycl/ChebyshevPolynomialKernels.h>
+#include <ATen/native/xpu/sycl/Loops.h>
 
 namespace at::native::xpu {

src/ATen/native/xpu/sycl/ChebyshevPolynomialVKernel.cpp

Lines changed: 1 addition & 1 deletion
@@ -1,8 +1,8 @@
 #include <ATen/ATen.h>
 #include <ATen/native/Math.h>
 #include <ATen/native/TensorIterator.h>
-#include <ATen/native/xpu/sycl/Loops.h>
 #include <ATen/native/xpu/sycl/ChebyshevPolynomialKernels.h>
+#include <ATen/native/xpu/sycl/Loops.h>
 
 namespace at::native::xpu {