
Commit 01b7256

[large tensor] Use int64_t for CUDA indexing to avoid overflow (PaddlePaddle#76303)
1 parent 5387ef2 commit 01b7256

68 files changed: +451 -152 lines

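Every hunk in this commit applies the same fix: the global CUDA thread index was previously computed in 32-bit arithmetic (e.g. blockIdx.x * blockDim.x + threadIdx.x), which overflows once a kernel launch has to cover more than 2^31 - 1 elements. Widening each operand to int64_t before the multiply-add keeps the index in range for large tensors. A minimal sketch of the pattern, using a hypothetical element-wise kernel that is not part of this commit:

#include <cstdint>

// Hypothetical element-wise kernel; the name and signature are illustrative only.
// With plain int, blockIdx.x * blockDim.x + threadIdx.x wraps around once the
// launch covers more than INT_MAX threads; widening each operand first keeps
// the whole computation in 64-bit.
template <typename T>
__global__ void ScaleKernel(const T* in, T* out, int64_t n, T alpha) {
  int64_t idx =
      static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
      static_cast<int64_t>(threadIdx.x);
  int64_t stride = static_cast<int64_t>(blockDim.x) * gridDim.x;
  for (int64_t i = idx; i < n; i += stride) {  // grid-stride loop over n elements
    out[i] = in[i] * alpha;
  }
}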

paddle/phi/kernels/funcs/broadcast_function.h

Lines changed: 3 additions & 1 deletion
@@ -212,7 +212,9 @@ struct BroadcastDataLoader<Index, VecSize, false, kElementwise> {
     using VecType = phi::kps::details::VectorType<Type, VecSize>;
     VecType vec_temp;
 
-    int thread_offset = threadIdx.x + blockIdx.x * blockDim.x;
+    int64_t thread_offset =
+        static_cast<int64_t>(threadIdx.x) +
+        static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x);
     const VecType *__restrict__ vec_input =
         reinterpret_cast<const VecType *__restrict__>(ins[Index]);
     vec_temp = vec_input[thread_offset];

paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h

Lines changed: 6 additions & 2 deletions
@@ -128,7 +128,9 @@ __global__ void KeFastCollectiveGruGate(T *gate_value,
   T c0 = 0.0f;
   T b0[Tiled_size];
 
-  int COL = blockIdx.x * blockDim.x + threadIdx.x;
+  int64_t COL =
+      static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
+      static_cast<int64_t>(threadIdx.x);
   int Tiled_mask = ((1 << Tiled_size) - 1);
   // Tiled matrix multiply using register shift, faster than sm.
   if (prev_output_value) {
@@ -185,7 +187,9 @@ __global__ void KeFastCollectiveGruOut(const T *gate_weight,
                                        int frame_size,
                                        ActivationType act_node,
                                        bool origin_mode) {
-  int COL = blockIdx.x * blockDim.x + threadIdx.x;
+  int64_t COL =
+      static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
+      static_cast<int64_t>(threadIdx.x);
 
   T a0 = 0.0f;
   T b0[Tiled_size];

paddle/phi/kernels/funcs/fake_quantize_functor.cu

Lines changed: 9 additions & 3 deletions
@@ -29,7 +29,9 @@ struct QuantizeDataType<phi::float16> {
 
 template <typename T>
 __global__ void FindAbsMaxKernel(const T *in, const int64_t n, T *out) {
-  int bid = threadIdx.x + blockIdx.x * blockDim.x;
+  int64_t bid =
+      static_cast<int64_t>(threadIdx.x) +
+      static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x);
   int tid = threadIdx.x;
 
   extern __shared__ char *shared_max_data_tmp[];
@@ -70,7 +72,9 @@ __global__ void ClipAndQuantKernel(const T *in,
                                    const int round_type,
                                    const int64_t n,
                                    T *out) {
-  int bid = threadIdx.x + blockIdx.x * blockDim.x;
+  int64_t bid =
+      static_cast<int64_t>(threadIdx.x) +
+      static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x);
   int tid = threadIdx.x;
 
   using ComputeDataType = typename QuantizeDataType<T>::type;
@@ -155,7 +159,9 @@ __global__ void ClipAndQuantDequantKernel(const T *in,
                                           const int round_type,
                                           const int64_t n,
                                           T *out) {
-  int bid = threadIdx.x + blockIdx.x * blockDim.x;
+  int64_t bid =
+      static_cast<int64_t>(threadIdx.x) +
+      static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x);
   int tid = threadIdx.x;
 
   using ComputeDataType = typename QuantizeDataType<T>::type;

paddle/phi/kernels/funcs/fc_functor.cu

Lines changed: 3 additions & 1 deletion
@@ -63,7 +63,9 @@ struct FcTypeTraits<float16> {
 
 template <typename T, bool DoRelu>
 __global__ void bias_relu_v4(const int num, const T* bias, T* data, int K) {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int64_t tid =
+      static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
+      static_cast<int64_t>(threadIdx.x);
   if (tid < num) {
     int bias_idx = tid % K;
     const T bias_ptr = bias[bias_idx];

paddle/phi/kernels/funcs/math_function.cu

Lines changed: 4 additions & 1 deletion
@@ -209,7 +209,10 @@ DEFINE_GPU_TRANS(6);
 
 template <typename T>
 __global__ void FillConstantKernel(const int N, T* a, const T val) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
+  for (int64_t i =
+           static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
+           static_cast<int64_t>(threadIdx.x);
+       i < N;
        i += blockDim.x * gridDim.x) {
     a[i] = val;
   }

paddle/phi/kernels/funcs/norm_utils.cu.h

Lines changed: 6 additions & 2 deletions
@@ -370,7 +370,9 @@ __global__ void DoubleGradComputeDXWithGlobal(const T *dy,
                                               const int sample_size,
                                               const int64_t num,
                                               T *dx) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  int64_t gid =
+      static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
+      static_cast<int64_t>(threadIdx.x);
   int stride = blockDim.x * gridDim.x;
   if (ddscale != nullptr) {
     for (int64_t i = gid; i < num; i += stride) {
@@ -397,7 +399,9 @@ __global__ void DoubleGradComputeDDYWithGlobal(const T *ddx,
                                                const int sample_size,
                                                const int64_t num,
                                                T *ddy) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  int64_t gid =
+      static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
+      static_cast<int64_t>(threadIdx.x);
   int stride = blockDim.x * gridDim.x;
 
   if (ddx != nullptr) {

paddle/phi/kernels/funcs/quant_dequant.h

Lines changed: 42 additions & 12 deletions
@@ -91,8 +91,13 @@ __global__ void QuantKernel(const T* input,
                             const int round_type,
                             const float max_bound,
                             const float min_bound) {
-  int n_id = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
-  int m_id = blockIdx.y * blockDim.y + threadIdx.y;
+  int64_t n_id =
+      (static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
+       static_cast<int64_t>(threadIdx.x))
+      << 2;
+  int64_t m_id =
+      static_cast<int64_t>(blockIdx.y) * static_cast<int64_t>(blockDim.y) +
+      static_cast<int64_t>(threadIdx.y);
 
   bool check = ((m_id < m) && (n_id < n));
   if (check) {
@@ -118,8 +123,13 @@ __global__ void QuantKernelWithVecSize(const T* input,
                                        const int round_type,
                                        const float max_bound,
                                        const float min_bound) {
-  int n_id = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
-  int m_id = blockIdx.y * blockDim.y + threadIdx.y;
+  int64_t n_id =
+      (static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
+       static_cast<int64_t>(threadIdx.x))
+      << 2;
+  int64_t m_id =
+      static_cast<int64_t>(blockIdx.y) * static_cast<int64_t>(blockDim.y) +
+      static_cast<int64_t>(threadIdx.y);
 
   bool check = ((m_id < m) && (n_id < n));
   if (check) {
@@ -145,8 +155,13 @@ __global__ void QuantKernelWithVecSize(const T* input,
                                        const int round_type,
                                        const float max_bound,
                                        const float min_bound) {
-  int n_id = (blockIdx.x * blockDim.x + threadIdx.x) * 3;
-  int m_id = blockIdx.y * blockDim.y + threadIdx.y;
+  int64_t n_id =
+      (static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
+       static_cast<int64_t>(threadIdx.x)) *
+      3;
+  int64_t m_id =
+      static_cast<int64_t>(blockIdx.y) * static_cast<int64_t>(blockDim.y) +
+      static_cast<int64_t>(threadIdx.y);
 
   bool check = ((m_id < m) && (n_id < n));
   if (check) {
@@ -170,8 +185,13 @@ __global__ void QuantKernelWithVecSize(const T* input,
                                        const int round_type,
                                        const float max_bound,
                                        const float min_bound) {
-  int n_id = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
-  int m_id = blockIdx.y * blockDim.y + threadIdx.y;
+  int64_t n_id =
+      (static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
+       static_cast<int64_t>(threadIdx.x)) *
+      2;
+  int64_t m_id =
+      static_cast<int64_t>(blockIdx.y) * static_cast<int64_t>(blockDim.y) +
+      static_cast<int64_t>(threadIdx.y);
 
   bool check = ((m_id < m) && (n_id < n));
   if (check) {
@@ -193,8 +213,12 @@ __global__ void QuantKernelWithVecSize(const T* input,
                                        const int round_type,
                                        const float max_bound,
                                        const float min_bound) {
-  int n_id = (blockIdx.x * blockDim.x + threadIdx.x);
-  int m_id = blockIdx.y * blockDim.y + threadIdx.y;
+  int64_t n_id =
+      (static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
+       static_cast<int64_t>(threadIdx.x));
+  int64_t m_id =
+      static_cast<int64_t>(blockIdx.y) * static_cast<int64_t>(blockDim.y) +
+      static_cast<int64_t>(threadIdx.y);
 
   bool check = ((m_id < m) && (n_id < n));
   if (check) {
@@ -320,7 +344,10 @@ __global__ void DequantKernel(T* output,
                               const float* dequant_out_scale_data) {
   int numel = m * n;
   int stride = blockDim.x * gridDim.x * VecSize;
-  int idx = (blockIdx.x * blockDim.x + threadIdx.x) * VecSize;
+  int64_t idx =
+      (static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
+       static_cast<int64_t>(threadIdx.x)) *
+      VecSize;
   int col_id = idx % n;
 
   phi::AlignedVector<int32_t, VecSize> in_vec;
@@ -366,7 +393,10 @@ __global__ void DequantKernelWithScaleOfInputAndWeight(
     float quant_max_bound) {
   int numel = m * n;
   int stride = blockDim.x * gridDim.x * VecSize;
-  int idx = (blockIdx.x * blockDim.x + threadIdx.x) * VecSize;
+  int64_t idx =
+      (static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
+       static_cast<int64_t>(threadIdx.x)) *
+      VecSize;
   int col_id = idx % n;
 
   phi::AlignedVector<int32_t, VecSize> in_vec;
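One detail worth noting in the QuantKernel / DequantKernel hunks above: the thread index is scaled by the vector width (<< 2, * 3, * 2, or * VecSize), so the widening has to happen before that scaling; a thread index that still fits in 32 bits can exceed INT_MAX once multiplied by the vector width. A minimal sketch of the vectorized-indexing pattern, using a hypothetical kernel that is not part of this commit:

#include <cstdint>

// Illustrative vectorized copy: each thread handles VecSize consecutive
// elements, so the element index is the widened thread index times VecSize.
// Widening only after the multiplication would already have overflowed.
template <typename T, int VecSize>
__global__ void CopyVectorized(const T* in, T* out, int64_t numel) {
  int64_t idx =
      (static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
       static_cast<int64_t>(threadIdx.x)) *
      VecSize;
  if (idx + VecSize <= numel) {
#pragma unroll
    for (int v = 0; v < VecSize; ++v) {
      out[idx + v] = in[idx + v];
    }
  }
}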

paddle/phi/kernels/funcs/scatter.cu.h

Lines changed: 2 additions & 1 deletion
@@ -402,7 +402,8 @@ inline DenseTensor restride_dim(const phi::DenseTensor& src,
 template <int nt, int vt, typename func_t>
 __global__ void scatter_gather_elementwise_kernel(int N, func_t f) {
   constexpr int nv = nt * vt;
-  int idx = nv * blockIdx.x + threadIdx.x;
+  int64_t idx =
+      nv * static_cast<int64_t>(blockIdx.x) + static_cast<int64_t>(threadIdx.x);
 
 #pragma unroll
   for (int i = 0; i < vt; ++i) {

paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h

Lines changed: 6 additions & 2 deletions
@@ -26,7 +26,9 @@ __global__ void FlattenIndicesKernel(const IntT* indices,
                                      const int64_t non_zero_num,
                                      const int64_t sparse_dim,
                                      IntT* out) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int64_t tid =
+      static_cast<int64_t>(threadIdx.x) +
+      static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x);
   phi::funcs::sparse::FlattenIndices<IntT>(indices,
                                            sparse_offsets,
                                            non_zero_num,
@@ -42,7 +44,9 @@ __global__ void IndexToCoordinateKernel(const IntT* index,
                                         const int64_t non_zero_num,
                                         const int64_t sparse_dim,
                                         IntT* indices) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int64_t tid =
+      static_cast<int64_t>(threadIdx.x) +
+      static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x);
   IndexToCoordinate(index,
                     dims,
                     non_zero_num,

paddle/phi/kernels/funcs/sparse/scatter.cu.h

Lines changed: 6 additions & 2 deletions
@@ -41,7 +41,9 @@ __global__ void ScatterKernel(const T* input,
                               const int rulebook_len,
                               const int channels,
                               T* out) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int64_t tid =
+      static_cast<int64_t>(threadIdx.x) +
+      static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x);
   const int vec_channels = channels / VecSize;
   using LoadT = phi::AlignedVector<T, VecSize>;
   using StoreT = phi::AlignedVector<T, VecSize>;
@@ -82,7 +84,9 @@ __global__ void ScatterKernelV2(const T* input,
                                 const int channels,
                                 const int buffer_counts,
                                 T* out) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int64_t tid =
+      static_cast<int64_t>(threadIdx.x) +
+      static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x);
   const int vec_channels = channels / VecSize;
   using LoadT = phi::AlignedVector<T, VecSize>;
   using StoreT = phi::AlignedVector<T, VecSize>;
