Commit 99ae7d4

ngimel authored and pytorchmergebot committed
Reland fast gather and index implementation (pytorch#151917)
This PR reapplies pytorch#151490 and pytorch#151753 together, and adds some checks that were previously missing when deciding whether to apply the fast path:

1) The indexing path has the stride in the indexed dimension in bytes, while the gather path has the stride in the indexed dimension in elements. When checking whether the fast path is applicable, I didn't take this difference into account and still multiplied the indexing stride by the element size. Fixed, and a test added.

2) We want to take the fast path only when we are copying contiguous, equally spaced slices of the input and all the necessary alignment requirements are met. The effective tensor shape should be 2D (after all possible flattening is applied), the index stride in the last dimension should be 0, and, since the kernel does not apply non-indexing-related offsets to the src tensor, the src tensor stride in the second dimension should also be 0. This happens automatically for gather with dim=0, so I didn't put in an explicit condition for it. Sometimes, however, all conditions except the zero "effective" stride in the first dimension are satisfied for gather on a non-zero dim, when the index size in the indexing dimension is 1 and that dimension is therefore collapsed (dimensions of size 1 are always collapsed), e.g.

```
# test gather along the 1st dim that can accidentally trigger the fast path,
# because the index dimension in the gather dim being 1 causes
# an unexpected squashing in TensorIterator
src = make_tensor((16, 2, 16), device=device, dtype=dtype)
ind = torch.randint(2, (16, 1), device=device).view(16, 1, 1).expand(16, 1, 16)
res = torch.gather(src, dim=1, index=ind)
if res.device.type == "cuda":
    ref_cpu = torch.gather(src.cpu(), dim=1, index=ind.cpu())
    self.assertEqual(res.cpu(), ref_cpu, atol=0, rtol=0)
```

Note that if the index size here were (16, 2, 16) instead of (16, 1, 16), the middle dimension could not be collapsed and we wouldn't end up incorrectly taking the fast path. We could either update the kernel to take this stride into account when computing offsets into the src tensor, or specifically disallow a non-zero stride on the first dimension. I took the second option for now.

Pull Request resolved: pytorch#151917
Approved by: https://github.com/eqy, https://github.com/malfet, https://github.com/Skylion007
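For context, a minimal sketch of the case the fast path targets: gather along dim=0 with the index expanded along the last dimension, so each output row is one contiguous, equally spaced slice of the input. Shapes and dtypes here are arbitrary illustrations; whether the vectorized kernel actually fires is an internal dispatch detail not observable from Python, and the result must match the CPU reference exactly either way.

```python
import torch

# Illustration only: 64 * 4 = 256 bytes per row, a multiple of 16.
if torch.cuda.is_available():
    src = torch.randn(32, 64, device="cuda")
    # expanded index -> stride 0 in the last dim, i.e. whole rows are copied
    ind = torch.randint(0, 32, (16, 1), device="cuda").expand(16, 64)
    res = torch.gather(src, dim=0, index=ind)
    ref = torch.gather(src.cpu(), dim=0, index=ind.cpu())
    torch.testing.assert_close(res.cpu(), ref, rtol=0, atol=0)
```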
1 parent 69e41ce commit 99ae7d4

File tree: 8 files changed (+326 −129 lines)


aten/src/ATen/native/cuda/IndexKernel.cu

Lines changed: 27 additions & 7 deletions
@@ -14,6 +14,8 @@
 #include <ATen/native/cuda/Loops.cuh>
 #include <ATen/native/cuda/KernelUtils.cuh>
 #include <ATen/native/quantized/IndexKernel.h>
+#include <ATen/native/cuda/MemoryAccess.cuh>
+#include <ATen/native/cuda/IndexKernelUtils.h>
 
 #include <c10/core/Scalar.h>
 
@@ -52,7 +54,7 @@ static void launch_kernel(const int64_t N, const func_t& f) {
 }
 
 template <typename func_t>
-void gpu_index_kernel(TensorIteratorBase& iter, const IntArrayRef index_size, const IntArrayRef index_stride, const func_t& f) {
+void gpu_index_kernel(TensorIteratorBase& iter, const IntArrayRef index_size, const IntArrayRef index_stride, const func_t& f, const bool is_gather_like) {
   const auto num_indices = index_size.size();
   AT_ASSERT(num_indices == index_stride.size());
   AT_ASSERT(static_cast<int64_t>(num_indices) == iter.ntensors() - 2);
@@ -63,11 +65,31 @@ void gpu_index_kernel(TensorIteratorBase& iter, const IntArrayRef index_size, co
 
   if (!iter.can_use_32bit_indexing()) {
     for (auto& sub_iter : iter.with_32bit_indexing()) {
-      gpu_index_kernel(sub_iter, index_size, index_stride, f);
+      gpu_index_kernel(sub_iter, index_size, index_stride, f, is_gather_like);
     }
     return;
   }
 
+
+  char* const out_ptr = static_cast<char*>(iter.data_ptr(0));
+  char* const in_ptr = static_cast<char*>(iter.data_ptr(1));
+
+  if (is_gather_like && num_indices==1) {
+    const size_t element_size = iter.element_size(0);
+    constexpr size_t alignment = 16;
+    if (at::native::fast_gather_kernel_eligible<alignment>(iter, out_ptr, in_ptr, index_stride[0], element_size)) {
+      auto slice_size = iter.shape()[0] * element_size;
+      auto num_ind = iter.shape()[1];
+      auto ind_dim_size = index_size[0];
+      auto inp_stride_bytes = index_stride[0];
+      auto out_stride_bytes = iter.strides(0)[1];
+      if (iter.numel() == 0) return;
+      at::native::vectorized_gather_kernel_launch<alignment>(out_ptr, in_ptr, (int64_t*)iter.data_ptr(2), num_ind,
+                                                             slice_size, ind_dim_size, inp_stride_bytes, out_stride_bytes, /*allow_neg_indices*/true);
+      return;
+    }
+  }
+
   auto sizes = std::array<int64_t, MAX_DIMS>{};
   auto strides = std::array<int64_t, MAX_DIMS>{};
   auto index_ptrs = std::array<char*, MAX_DIMS>{};
@@ -77,8 +99,6 @@ void gpu_index_kernel(TensorIteratorBase& iter, const IntArrayRef index_size, co
     index_ptrs[i] = (char*)iter.data_ptr(i + 2);
   }
 
-  char* const out_ptr = static_cast<char*>(iter.data_ptr(0));
-  char* const in_ptr = static_cast<char*>(iter.data_ptr(1));
 
   auto offset_calc = make_offset_calculator<3>(iter);
   launch_kernel<launch_size_nd, launch_bound2>(iter.numel(), [=]__device__(int idx) {
@@ -183,14 +203,14 @@ template <typename scalar_t>
 void index_kernel_impl(TensorIteratorBase& iter, const IntArrayRef index_size, const IntArrayRef index_stride) {
   gpu_index_kernel(iter, index_size, index_stride, []C10_DEVICE(char* const out_data, const char* const in_data, const int64_t offset) {
     *reinterpret_cast<scalar_t*>(out_data) = *reinterpret_cast<const scalar_t*>(in_data + offset);
-  });
+  }, true);
 }
 
 template <typename scalar_t>
 void index_put_kernel_impl(TensorIterator& iter, const IntArrayRef index_size, const IntArrayRef index_stride) {
   gpu_index_kernel(iter, index_size, index_stride, []C10_DEVICE(char* const out_data, const char* const in_data, const int64_t offset) {
     *reinterpret_cast<scalar_t*>(out_data + offset) = *reinterpret_cast<const scalar_t*>(in_data);
-  });
+  }, false);
 }
 
 static void index_kernel(
@@ -280,7 +300,7 @@ void index_put_kernel_quantized_cuda(TensorIterator& iter, const IntArrayRef ind
     // The replacement should generate the same PTX as std::clamp. See https://godbolt.org/z/Wde9KW3v4
     qvalue = (qvalue < qmin) ? qmin : (qmax < qvalue) ? qmax : qvalue;
     *(scalar_t*)(out_data + offset) = static_cast<scalar_t>(qvalue);
-  });
+  }, false);
   });
 }
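For illustration, this is the kind of user-level call that reaches gpu_index_kernel with is_gather_like=true and a single index tensor (advanced indexing with one int64 tensor), next to the scatter-like index_put_ counterpart that passes is_gather_like=false. Shapes are arbitrary; whether the vectorized fast path is actually taken depends on the alignment checks above and is not observable from Python.

```python
import torch

if torch.cuda.is_available():
    src = torch.randn(1024, 256, device="cuda")           # 256 * 4 = 1024-byte contiguous rows
    idx = torch.randint(0, 1024, (4096,), device="cuda")  # a single int64 index tensor
    out = src[idx]                                         # gather-like path (index kernel)
    ref = src.cpu()[idx.cpu()]
    torch.testing.assert_close(out.cpu(), ref, rtol=0, atol=0)
    src[idx] = 0.0                                         # scatter-like path (index_put kernel), no fast path
```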

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/native/cuda/MemoryAccess.cuh>
+
+#include <c10/macros/Macros.h>
+#include <c10/util/Exception.h>
+#include <ATen/native/cuda/Loops.cuh>
+#include <ATen/ceil_div.h>
+
+namespace at::native {
+template <int Alignment>
+__global__ void vectorized_gather_kernel(char * out, char * inp, int64_t * idx, int num_ind, int64_t slice_size, int64_t ind_dim_size, int64_t inp_stride, int64_t out_stride, bool allow_neg_indices) {
+  int64_t ind = idx[blockIdx.x];
+  if (allow_neg_indices) {
+    ind = (ind < 0) ? ind + ind_dim_size : ind;
+  }
+  CUDA_KERNEL_ASSERT(ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds");
+  int32_t off = (blockDim.x * blockIdx.y + threadIdx.x) * Alignment; // off is guaranteed to be within int32 limits
+  if (off >= slice_size) return;
+  auto vec = at::native::memory::ld_vec<Alignment>(inp + ind * inp_stride + off);
+  at::native::memory::st_vec<Alignment>(out + blockIdx.x * (int32_t)out_stride + off, vec); // out offset is guaranteed to be within int32 limits
+}
+
+
+
+template <int64_t Alignment>
+void vectorized_gather_kernel_launch(char * out, char * inp, int64_t * idx, int num_ind,
+                                     int64_t slice_size_in_bytes, int64_t ind_dim_size, int64_t inp_stride_bytes, int64_t out_stride_bytes, bool allow_neg_indices){
+
+  constexpr int64_t max_num_threads=256;
+  auto num_threads = at::round_up(
+      at::ceil_div(slice_size_in_bytes, Alignment),
+      static_cast<int64_t>(C10_WARP_SIZE));
+  dim3 grid = {static_cast<uint32_t>(num_ind), static_cast<uint32_t>(at::ceil_div(slice_size_in_bytes, max_num_threads * Alignment)), 1};
+  auto block = std::min(max_num_threads, num_threads);
+  vectorized_gather_kernel<Alignment><<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(out, inp, idx, num_ind, slice_size_in_bytes,
+                                        ind_dim_size, inp_stride_bytes, out_stride_bytes, allow_neg_indices);
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+}
+
+// explicit template instantiation
+template void vectorized_gather_kernel_launch<16>(char * out, char * inp, int64_t * idx, int num_ind, int64_t slice_size_in_bytes,
+    int64_t ind_dim_size, int64_t inp_stride_bytes, int64_t out_stride_bytes, bool allow_neg_indices);
+
+}
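To make the launch geometry concrete, here is a small Python sketch of the same arithmetic for hypothetical sizes (4096 gathered slices of 1000 float32 elements, i.e. 4000 bytes each). The warp size of 32 is an assumption; C10_WARP_SIZE is 64 on some ROCm devices.

```python
ALIGNMENT = 16           # bytes moved per thread (one vectorized load/store)
MAX_NUM_THREADS = 256
WARP_SIZE = 32           # assumption; 64 on some AMD GPUs

def ceil_div(a, b):
    return -(-a // b)

def round_up(a, b):
    return ceil_div(a, b) * b

slice_size_in_bytes = 1000 * 4   # 4000 bytes per slice
num_ind = 4096                   # one block row per gathered index

num_threads = round_up(ceil_div(slice_size_in_bytes, ALIGNMENT), WARP_SIZE)   # 256
block = min(MAX_NUM_THREADS, num_threads)                                     # 256 threads
grid = (num_ind, ceil_div(slice_size_in_bytes, MAX_NUM_THREADS * ALIGNMENT))  # (4096, 1)
# Threads whose byte offset lands past the 4000-byte slice simply return in the kernel.
print(block, grid)
```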
Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+
+#include <cstdint>
+#include <ATen/native/TensorIterator.h>
+#include <ATen/native/cuda/MemoryAccess.cuh>
+
+namespace at::native {
+
+template<int alignment>
+inline bool fast_gather_kernel_eligible(const TensorIterator& iter, char * const out_ptr, char * const in_ptr, const size_t index_stride_bytes, const size_t element_size) {
+  using at::native::memory::get_alignment;
+  const auto index_element_size = iter.element_size(2);
+  //TensorIterator strides and sizes are ordered fastest moving to slowest moving,
+  //in contrast to regular sizes
+  // we need contiguous source and dst slices and aligned pointers and strides and slice size to do vectorized loads
+  // also we need idx to be expanded in the last dimension so we can copy entire slices
+  // and we need the src tensor to keep 0 stride from restriding
+  // (it could have been deleted by dimension collapse, in this case iterator would still be 2d
+  // but we cannot use fast path)
+
+  return iter.ndim() == 2 && iter.strides(2)[0]==0 && iter.strides(2)[1]==index_element_size &&
+         static_cast<size_t>(iter.strides(0)[0])==element_size &&
+         static_cast<size_t>(iter.strides(1)[0])==element_size && static_cast<size_t>(iter.strides(1)[1] == 0) &&
+         get_alignment(out_ptr) == alignment && get_alignment(in_ptr) == alignment &&
+         get_alignment(static_cast<size_t>(iter.shape()[0] * element_size)) == alignment &&
+         get_alignment(static_cast<size_t>(index_stride_bytes)) == alignment &&
+         get_alignment(static_cast<size_t>(iter.strides(0)[1])) == alignment;
+}
+
+template <int64_t Alignment>
+void vectorized_gather_kernel_launch(char * out, char * inp, int64_t * idx, int num_ind,
+    int64_t slice_size_in_bytes, int64_t ind_dim_size, int64_t inp_stride_bytes, int64_t out_stride_bytes,
+    bool allow_neg_indices=false);
+
+
+}
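As a rough illustration only (not the internal TensorIterator-based check), the alignment part of this eligibility test can be pictured at the Python level for a dim-0 gather on 2D tensors: the source and destination pointers, the slice size in bytes, and both row strides in bytes all have to be 16-byte aligned. The helper below is hypothetical.

```python
import torch

def looks_16B_aligned(src: torch.Tensor, out: torch.Tensor, alignment: int = 16) -> bool:
    # hypothetical helper: assumes 2D tensors, gather along dim 0, contiguous rows
    elem = src.element_size()
    checks = [
        src.data_ptr(), out.data_ptr(),
        src.size(1) * elem,        # slice size in bytes
        src.stride(0) * elem,      # input stride along the indexed dim, in bytes
        out.stride(0) * elem,      # output row stride in bytes
    ]
    return all(v % alignment == 0 for v in checks)

src = torch.randn(128, 64)         # 64 * 4 = 256-byte rows
out = torch.empty(32, 64)
print(looks_16B_aligned(src, out))  # usually True: allocations are at least 16-byte aligned
```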

aten/src/ATen/native/cuda/MemoryAccess.cuh

Lines changed: 119 additions & 0 deletions
@@ -536,4 +536,123 @@ inline int can_vectorize_up_to(array_t pointers) {
   return result;
 }
 
+
+
+template <typename T>
+__inline__ size_t get_alignment(T ptr_or_size) {
+  auto val = reinterpret_cast<uintptr_t>(ptr_or_size);
+  if (val % 16 == 0) {
+    return 16;
+  } else if (val % 8 == 0) {
+    return 8;
+  } else if (val % 4 == 0) {
+    return 4;
+  } else if (val % 2 == 0) {
+    return 2;
+  } else {
+    return 1;
+  }
+}
+
+template <>
+__inline__ size_t get_alignment<size_t>(size_t size) {
+  return get_alignment(reinterpret_cast<void*>(size));
+}
+
+template <bool Value, class... Args>
+inline constexpr bool dependent_bool_value = Value;
+
+template <class... Args>
+inline constexpr bool dependent_false = dependent_bool_value<false, Args...>;
+
+template <int Size>
+union Vec;
+
+template <>
+union Vec<4> {
+  uint16_t u16[2];
+  uint32_t u32, as_scalar;
+  float f32;
+};
+
+template <>
+union Vec<8> {
+  uint16_t u16[4];
+  uint32_t u32[2];
+  uint64_t u64, as_scalar;
+  float f32[2];
+};
+
+template <>
+union alignas(16) Vec<16> {
+  uint16_t u16[8];
+  uint32_t u32[4];
+  uint64_t u64[2];
+  uint4 u128, as_scalar;
+  float f32[4];
+};
+
+template <int Alignment, typename T>
+__device__ __inline__ Vec<Alignment> ld_vec(const T* addr) {
+  Vec<Alignment> vec;
+  if constexpr (Alignment == 16) {
+#if defined(USE_ROCM)
+    vec.u128 = *reinterpret_cast<const uint4*>(addr);
+  } else if constexpr (Alignment == 8) {
+    vec.u64 = *reinterpret_cast<const uint64_t*>(addr);
+  } else if constexpr (Alignment == 4) {
+    vec.u32 = *reinterpret_cast<const uint32_t*>(addr);
+#else
+    asm("ld.global.v4.u32 {%0,%1,%2,%3}, [%4];"
+        : "=r"(vec.u32[0]), "=r"(vec.u32[1]), "=r"(vec.u32[2]), "=r"(vec.u32[3])
+        : "l"(addr)
+        : "memory");
+  } else if constexpr (Alignment == 8) {
+    asm("ld.global.v2.u32 {%0,%1}, [%2];"
+        : "=r"(vec.u32[0]), "=r"(vec.u32[1])
+        : "l"(addr)
+        : "memory");
+  } else if constexpr (Alignment == 4) {
+    asm("ld.global.u32 %0, [%1];" : "=r"(vec.u32) : "l"(addr) : "memory");
+#endif
+  } else {
+    static_assert(dependent_false<T>);
+  }
+  return vec;
+}
+
+template <int Alignment, typename T>
+__device__ __inline__ void st_vec(T* addr, const Vec<Alignment>& vec) {
+  if constexpr (Alignment == 16) {
+#if defined(USE_ROCM)
+    reinterpret_cast<uint64_t*>(addr)[0] = vec.u64[0];
+    reinterpret_cast<uint64_t*>(addr)[1] = vec.u64[1];
+  } else if constexpr (Alignment == 8) {
+    *reinterpret_cast<uint64_t*>(addr) = vec.u64;
+  } else if constexpr (Alignment == 4) {
+    *reinterpret_cast<uint32_t*>(addr) = vec.u32;
+#else
+    asm("st.global.v4.u32 [%0], {%1,%2,%3,%4};"
+        :
+        : "l"(addr),
+          "r"(vec.u32[0]),
+          "r"(vec.u32[1]),
+          "r"(vec.u32[2]),
+          "r"(vec.u32[3])
+        : "memory");
+  } else if constexpr (Alignment == 8) {
+    asm("st.global.v2.u32 [%0], {%1,%2};"
+        :
+        : "l"(addr), "r"(vec.u32[0]), "r"(vec.u32[1])
+        : "memory");
+  } else if constexpr (Alignment == 4) {
+    asm("st.global.u32 [%0], %1;" : : "l"(addr), "r"(vec.u32) : "memory");
+#endif
+  } else {
+    static_assert(dependent_false<T>);
+  }
+}
+
+
+
 } // namespace at::native::memory
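For reference, a tiny Python analogue (illustrative only) of the get_alignment helper added above: it reports the largest supported alignment, 16, 8, 4, 2, or 1 bytes, that evenly divides a pointer value or size.

```python
def get_alignment(val: int) -> int:
    # mirrors the C++ helper: try 16, then 8, 4, 2, falling back to 1
    for alignment in (16, 8, 4, 2):
        if val % alignment == 0:
            return alignment
    return 1

assert get_alignment(4096) == 16
assert get_alignment(4000) == 16   # 4000 = 250 * 16
assert get_alignment(100) == 4
assert get_alignment(6) == 2
```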

aten/src/ATen/native/cuda/ScatterGatherKernel.cu

Lines changed: 21 additions & 5 deletions
@@ -1,16 +1,16 @@
 #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/native/TensorAdvancedIndexing.h>
-
 #include <ATen/core/Tensor.h>
 #include <ATen/Dispatch.h>
+#include <ATen/ceil_div.h>
 #include <ATen/MemoryOverlap.h>
 
 #include <ATen/native/ScatterGatherChecks.h>
 #include <ATen/native/ReduceOpsUtils.h>
-#include <ATen/native/TensorIterator.h>
-
+#include <ATen/native/cuda/IndexKernelUtils.h>
 #include <ATen/native/cuda/Loops.cuh>
 #include <ATen/native/cuda/KernelUtils.cuh>
+#include <ATen/native/cuda/MemoryAccess.cuh>
 #include <ATen/cuda/detail/OffsetCalculator.cuh>
 #include <ATen/cuda/Atomic.cuh>
 #include <ATen/cuda/CUDAContext.h>
@@ -116,7 +116,6 @@ static void _launch_scatter_gather_kernel(int64_t N, const func_t& f) {
   C10_CUDA_KERNEL_LAUNCH_CHECK();
 }
 
-
 template <bool is_scatter_like, typename scalar_t>
 struct _cuda_scatter_gather_internal_kernel {
   template <typename func_t>
@@ -140,13 +139,29 @@
     char* src_ptr = (char*)iter.data_ptr(1);
     char* index_ptr = (char*)iter.data_ptr(2);
 
+    if constexpr (!is_scatter_like) {
+      // we can go to faster path if we are indexing on the first dim
+      // the dst and src are contiguous and all the dims and pts are multiple of 16
+      constexpr size_t element_size = sizeof(scalar_t);
+      constexpr size_t alignment = 16;
+      if (at::native::fast_gather_kernel_eligible<alignment>(iter, self_ptr, src_ptr, index_stride * element_size, element_size)) {
+        auto slice_size = iter.shape()[0] * element_size;
+        auto num_ind = iter.shape()[1];
+        auto ind_dim_size = index_size;
+        auto inp_stride_bytes = index_stride * element_size;
+        auto out_stride_bytes = iter.strides(0)[1];
+        if (iter.numel() == 0) return;
+        at::native::vectorized_gather_kernel_launch<alignment>(self_ptr, src_ptr, (int64_t*)index_ptr, num_ind, slice_size, ind_dim_size, inp_stride_bytes, out_stride_bytes);
+        return;
+      }
+    }
     auto offset_calc = make_offset_calculator<3>(iter);
     auto loop = [=]C10_DEVICE(int i) {
       auto offsets = offset_calc.get(i);
 
       int64_t idx_dim = *(int64_t*)(index_ptr + offsets[2]);
       CUDA_KERNEL_ASSERT(idx_dim >= 0 && idx_dim < index_size
-                         && "index out of bounds");
+                         && "scatter gather kernel index out of bounds");
 
       f(
         (scalar_t*)(self_ptr + offsets[0]),
@@ -157,6 +172,7 @@
    };
 
    _launch_scatter_gather_kernel<num_threads(), thread_work_size()>(iter.numel(), loop);
+
   }
 }; // struct _cuda_scatter_gather_internal_kernel
 
