Refactor Triangular Operation Kernels (triu/tril) for XPU in SYCL (#1735)

chunhuanMeng · Copilot · toyxu · web-flow · commit 3a9419c8bb6a · 2025-06-19T02:17:58.000Z
Fix pytorch/pytorch#155651 Refactor the triangular operations kernels (`triu` and `tril`) for XPU in SYCL. Key changes include introducing support for element-wise operations within threads, improving memory access patterns, and simplifying the handling of tensor metadata. Additionally, redundant code has been removed for cleaner implementation. ### Refactoring: * **Refactored `ApplyTriuTrilKernelFunctor`**: - Added support for processing multiple elements per thread (elements_per_thread) and optimized memory access patterns (e.g., load load load load, compute compute compute compute, store store store store) to improve memory throughput. - Refactored offset computation to handle multi-dimensional tensors more effectively using `TensorInfo`. - Introduced `BOOL_SWITCH` macro for conditional execution based on whether the operation is in-place. * **Improved handling of tensor metadata**: - Replaced manual stride and size calculations with `TensorInfo` for cleaner and more maintainable code. ### Code Cleanup: * **Removed redundant return statements**: - Eliminated commented-out `return result;` lines in `tril_kernel` and `triu_kernel` functions for clarity. --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Yutao Xu <yutao.xu@intel.com>
diff --git a/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp b/src/ATen/native/xpu/sycl/TriangularOpsKernels.cpp
@@ -6,112 +6,162 @@
 #include <ATen/native/CanUse32BitIndexMath.h>
 #include <ATen/native/Resize.h>
 #include <comm/SYCLContext.h>
+#include <comm/TensorInfo.h>
 
 #include <ATen/native/xpu/sycl/TriangularOpsKernels.h>
 
+#define BOOL_SWITCH(COND, CONST_NAME, ...)      \
+  [&] {                                         \
+    if (COND) {                                 \
+      constexpr static bool CONST_NAME = true;  \
+      return __VA_ARGS__();                     \
+    } else {                                    \
+      constexpr static bool CONST_NAME = false; \
+      return __VA_ARGS__();                     \
+    }                                           \
+  }()
+
 namespace at::native::xpu {
 
 using namespace at::xpu;
 
-template <typename scalar_t, typename IndexType, bool upper>
+template <
+    typename scalar_t,
+    typename IndexType,
+    bool upper,
+    int elements_per_thread,
+    bool inplace>
 struct ApplyTriuTrilKernelFunctor {
   void operator()(sycl::nd_item<1> item) const {
-    for (size_t linearIndex = item.get_global_id(0); linearIndex < (size_t)N;
-         linearIndex += item.get_global_range()[0]) {
-      IndexType batch_id = linearIndex / (self_size_0 * self_size_1);
-      IndexType row = (linearIndex % (self_size_0 * self_size_1)) / self_size_1;
-      IndexType col = (linearIndex % (self_size_0 * self_size_1)) % self_size_1;
-
-      IndexType src_index =
-          batch_id * self_stride + row * self_stride_0 + col * self_stride_1;
-      IndexType tgt_index = batch_id * result_stride + row * result_stride_0 +
-          col * result_stride_1;
-
-      bool mask = upper ? (col - row >= k) : (col - row <= k);
-      result_ptr[tgt_index] = mask ? self_ptr[src_index] : scalar_t(0);
+    IndexType linear_idx = item.get_global_id(0) * elements_per_thread;
+    if (linear_idx >= N_padded_) {
+      return;
+    }
+    auto dims = self_info_.dims;
+
+    // Compute column index amd row index
+    IndexType col = linear_idx % last_dim_padded_;
+    linear_idx /= last_dim_padded_;
+    IndexType row = linear_idx % self_info_.sizes[dims - 2];
+
+    if constexpr (inplace) {
+      bool mask_all_true =
+          upper ? (col - row >= k_) : (col + elements_per_thread - row <= k_);
+      if (mask_all_true)
+        return;
+    }
+
+    // Compute offset
+    IndexType self_offset = 0, result_offset = 0;
+    self_offset += self_info_.strides[dims - 1] * col;
+    result_offset += result_info_.strides[dims - 1] * col;
+    linear_idx /= self_info_.sizes[dims - 2];
+    self_offset += self_info_.strides[dims - 2] * row;
+    result_offset += result_info_.strides[dims - 2] * row;
+
+    // Compute remaining offsets
+    IndexType running_index;
+    for (int i = dims - 3; i >= 0; --i) {
+      running_index = linear_idx % self_info_.sizes[i];
+      linear_idx /= self_info_.sizes[i];
+      self_offset += running_index * self_info_.strides[i];
+      result_offset += running_index * result_info_.strides[i];
+    }
+
+    if constexpr (inplace) {
+#pragma unroll
+      for (int i = 0;
+           i < elements_per_thread && col + i < self_info_.sizes[dims - 1];
+           i++) {
+        bool mask = upper ? (col + i - row >= k_) : (col + i - row <= k_);
+        if (!mask)
+          result_info_
+              .data[result_offset + i * result_info_.strides[dims - 1]] =
+              scalar_t(0);
+      }
+    } else {
+      scalar_t frag[elements_per_thread] = {};
+      bool has_mask = (upper && col + elements_per_thread - row >= k_) ||
+          (!upper && col - row <= k_);
+      if (has_mask) {
+#pragma unroll
+        for (int i = 0;
+             i < elements_per_thread && col + i < self_info_.sizes[dims - 1];
+             i++)
+          frag[i] =
+              self_info_.data[self_offset + i * self_info_.strides[dims - 1]];
+
+#pragma unroll
+        for (int i = 0; i < elements_per_thread; i++) {
+          bool mask = upper ? (col + i - row >= k_) : (col + i - row <= k_);
+          frag[i] = mask ? frag[i] : scalar_t(0);
+        }
+      }
+
+#pragma unroll
+      for (int i = 0;
+           i < elements_per_thread && col + i < self_info_.sizes[dims - 1];
+           i++)
+        result_info_.data[result_offset + i * result_info_.strides[dims - 1]] =
+            frag[i];
     }
   }
   ApplyTriuTrilKernelFunctor(
-      const int64_t k_,
-      int64_t N_,
-      IndexType self_size_0_,
-      IndexType self_size_1_,
-      IndexType self_stride_,
-      IndexType self_stride_0_,
-      IndexType self_stride_1_,
-      IndexType result_stride_,
-      IndexType result_stride_0_,
-      IndexType result_stride_1_,
-      scalar_t* result_ptr_,
-      const scalar_t* self_ptr_)
-      : k(k_),
-        N(N_),
-        self_size_0(self_size_0_),
-        self_size_1(self_size_1_),
-        self_stride(self_stride_),
-        self_stride_0(self_stride_0_),
-        self_stride_1(self_stride_1_),
-        result_stride(result_stride_),
-        result_stride_0(result_stride_0_),
-        result_stride_1(result_stride_1_),
-        result_ptr(result_ptr_),
-        self_ptr(self_ptr_) {}
+      at::xpu::detail::TensorInfo<scalar_t, IndexType> result_info,
+      at::xpu::detail::TensorInfo<const scalar_t, IndexType> self_info,
+      const int64_t k,
+      const int64_t N_padded,
+      const IndexType last_dim_padded)
+      : result_info_(result_info),
+        self_info_(self_info),
+        k_(k),
+        N_padded_(N_padded),
+        last_dim_padded_(last_dim_padded) {}
 
  private:
-  const int64_t k;
-  int64_t N;
-  IndexType self_size_0;
-  IndexType self_size_1;
-  IndexType self_stride;
-  IndexType self_stride_0;
-  IndexType self_stride_1;
-  IndexType result_stride;
-  IndexType result_stride_0;
-  IndexType result_stride_1;
-  scalar_t* result_ptr;
-  const scalar_t* self_ptr;
+  at::xpu::detail::TensorInfo<scalar_t, IndexType> result_info_;
+  at::xpu::detail::TensorInfo<const scalar_t, IndexType> self_info_;
+  const int64_t k_;
+  const int64_t N_padded_;
+  const IndexType last_dim_padded_;
 };
 
 template <typename scalar_t, typename IndexType, bool upper>
 void apply_triu_tril(
     const Tensor& result,
     const Tensor& self,
     const int64_t k) {
-  auto N = self.numel();
-  IndexType self_size_0 = (IndexType)self.size(-2);
-  IndexType self_size_1 = (IndexType)self.size(-1);
-  IndexType self_stride = (IndexType)(self.dim() > 2 ? self.stride(-3) : 1);
-  IndexType self_stride_0 = (IndexType)self.stride(-2);
-  IndexType self_stride_1 = (IndexType)self.stride(-1);
-  IndexType result_stride =
-      (IndexType)(result.dim() > 2 ? result.stride(-3) : 1);
-  IndexType result_stride_0 = (IndexType)result.stride(-2);
-  IndexType result_stride_1 = (IndexType)result.stride(-1);
-
-  scalar_t* result_ptr = result.data_ptr<scalar_t>();
-  const scalar_t* self_ptr = self.const_data_ptr<scalar_t>();
-
-  ApplyTriuTrilKernelFunctor<scalar_t, IndexType, upper> kfn(
-      k,
-      N,
-      self_size_0,
-      self_size_1,
-      self_stride,
-      self_stride_0,
-      self_stride_1,
-      result_stride,
-      result_stride_0,
-      result_stride_1,
-      result_ptr,
-      self_ptr);
-
-  int64_t group_size = syclMaxWorkGroupSize(kfn);
-  auto num_groups = ceil_div(N, group_size);
-  auto total_items = num_groups * group_size;
-  auto& queue = getCurrentSYCLQueue();
-
-  sycl_kernel_submit(
-      sycl::range<1>(total_items), sycl::range<1>(group_size), queue, kfn);
+  constexpr int elements_per_thread =
+      sizeof(scalar_t) < 8 ? 8 / sizeof(scalar_t) : 1;
+  auto sizes = self.sizes();
+  int64_t last_dim_padded =
+      round_up<int64_t>(sizes.back(), elements_per_thread);
+  int64_t N_padded =
+      c10::multiply_integers(sizes.begin(), sizes.end() - 1) * last_dim_padded;
+
+  int64_t local_range = syclMaxWorkItemsPerSubSlice();
+  int64_t global_range =
+      ((N_padded / elements_per_thread + local_range - 1) / local_range) *
+      local_range;
+
+  auto result_info =
+      at::xpu::detail::getTensorInfo<scalar_t, IndexType>(result);
+  auto self_info =
+      at::xpu::detail::getTensorInfo<const scalar_t, IndexType>(self);
+  BOOL_SWITCH(self.is_same(result), inplace, [&] {
+    ApplyTriuTrilKernelFunctor<
+        scalar_t,
+        IndexType,
+        upper,
+        elements_per_thread,
+        inplace>
+        kfn(result_info, self_info, k, N_padded, last_dim_padded);
+    sycl_kernel_submit(
+        sycl::range<1>(global_range),
+        sycl::range<1>(local_range),
+        getCurrentSYCLQueue(),
+        kfn);
+  });
 }
 
 #define TRIU_TRIL_LAMBDA(upper)                                   \
@@ -128,7 +178,6 @@ void tril_kernel(const Tensor& result, const Tensor& self, int64_t k) {
     result.resize_as_(self);
   }
   if (self.numel() == 0) {
-    // return result;
     return;
   }
 
@@ -140,16 +189,13 @@ void tril_kernel(const Tensor& result, const Tensor& self, int64_t k) {
       self.scalar_type(),
       "tril_xpu",
       TRIU_TRIL_LAMBDA(false));
-
-  // return result;
 }
 
 void triu_kernel(const Tensor& result, const Tensor& self, int64_t k) {
   if (result.sizes() != self.sizes()) {
     result.resize_as_(self);
   }
   if (self.numel() == 0) {
-    // return result;
     return;
   }
   AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(
@@ -160,8 +206,6 @@ void triu_kernel(const Tensor& result, const Tensor& self, int64_t k) {
       self.scalar_type(),
       "triu_xpu",
       TRIU_TRIL_LAMBDA(true));
-
-  // return result;
 }
 
 } // namespace at::native::xpu
diff --git a/test/regressions/test_tril.py b/test/regressions/test_tril.py
@@ -0,0 +1,13 @@
+# Owner(s): ["module: intel"]
+import torch
+from torch.testing._internal.common_utils import TestCase
+
+
+class TestSimpleBinary(TestCase):
+    def test_tril(self, dtype=torch.bool):
+        max_seq_length = 131072
+        with torch.device("xpu"):
+            a = torch.ones(max_seq_length, max_seq_length, dtype=torch.bool)
+            causal_mask = torch.tril(a)
+        torch.xpu.synchronize()
+        print(torch.xpu.get_device_properties())