@@ -81,7 +81,7 @@ template <typename T, int block_size>
 __global__ void SelectedRowsAddTensorKernel(const T* selected_rows,
                                             const int64_t* rows, T* tensor_out,
                                             int64_t row_numel) {
-  const int ty = blockIdx.y;
+  const int ty = blockIdx.x;
   int tid = threadIdx.x;
 
   selected_rows += ty * row_numel;
@@ -123,7 +123,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
 
     const int block_size = 256;
     dim3 threads(block_size, 1);
-    dim3 grid(1, in1_rows.size());
+    dim3 grid(in1_rows.size(), 1);
     SelectedRowsAddTensorKernel<
         T, block_size><<<grid, threads, 0, context.stream()>>>(
         in1_data, in1_rows.CUDAData(context.GetPlace()), out_data,
@@ -188,7 +188,7 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows,
                                               const int64_t* rows,
                                               T* tensor_out,
                                               int64_t row_numel) {
-  const int ty = blockIdx.y;
+  const int ty = blockIdx.x;
   int tid = threadIdx.x;
 
   selected_rows += ty * row_numel;
@@ -221,7 +221,7 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
     auto* in2_data = input2->data<T>();
     const int block_size = 256;
     dim3 threads(block_size, 1);
-    dim3 grid(1, in1_rows.size());
+    dim3 grid(in1_rows.size(), 1);
     SelectedRowsAddToTensorKernel<
         T, block_size><<<grid, threads, 0, context.stream()>>>(
         in1_data, in1_rows.CUDAData(context.GetPlace()), in2_data,
@@ -388,7 +388,7 @@ template <typename T, int block_size>
 __global__ void UpdateToTensorKernel(const T* selected_rows,
                                      const int64_t* rows, const ScatterOps& op,
                                      T* tensor_out, int64_t row_numel) {
-  const int ty = blockIdx.y;
+  const int ty = blockIdx.x;
   int tid = threadIdx.x;
 
   selected_rows += ty * row_numel;
@@ -457,7 +457,7 @@ struct UpdateToTensor<platform::CUDADeviceContext, T> {
     auto* in2_data = input2->data<T>();
 
     dim3 threads(platform::PADDLE_CUDA_NUM_THREADS, 1);
-    dim3 grid(1, in1_rows.size());
+    dim3 grid(in1_rows.size(), 1);
     UpdateToTensorKernel<T, platform::PADDLE_CUDA_NUM_THREADS><<<
         grid, threads, 0, context.stream()>>>(in1_data, in1_rows.cuda_data(),
                                               op, in2_data, in1_row_numel);
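All six hunks make the same change: the per-row work is distributed along grid.x instead of grid.y, so each kernel indexes rows with blockIdx.x and each launch site builds dim3 grid(in1_rows.size(), 1). The sketch below shows that launch/indexing pattern in isolation, assuming a simple row-addition kernel; the AddRowsKernel name, the kernel body, and the host-side setup are illustrative and are not the PaddlePaddle implementation.

// One-block-per-row pattern: grid.x enumerates the stored rows, and the
// threads of each block stride over that row's elements.
// (Sketch only; names and host setup are hypothetical.)
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

template <typename T, int block_size>
__global__ void AddRowsKernel(const T* selected_rows, const int64_t* rows,
                              T* tensor_out, int64_t row_numel) {
  const int ty = blockIdx.x;   // block index selects one compressed row
  int tid = threadIdx.x;       // threads cooperate within that row

  selected_rows += ty * row_numel;     // start of the ty-th stored row
  tensor_out += rows[ty] * row_numel;  // start of the destination row

  for (int64_t i = tid; i < row_numel; i += block_size) {
    // Row indices may repeat, so accumulate atomically.
    atomicAdd(tensor_out + i, selected_rows[i]);
  }
}

int main() {
  const int block_size = 256;
  const int n_rows = 4;
  const int64_t row_numel = 1024, height = 16;
  std::vector<int64_t> h_rows = {1, 3, 3, 7};  // hypothetical row indices

  float *d_in = nullptr, *d_out = nullptr;
  int64_t* d_rows = nullptr;
  cudaMalloc(&d_in, n_rows * row_numel * sizeof(float));
  cudaMalloc(&d_out, height * row_numel * sizeof(float));
  cudaMalloc(&d_rows, n_rows * sizeof(int64_t));
  cudaMemset(d_in, 0, n_rows * row_numel * sizeof(float));
  cudaMemset(d_out, 0, height * row_numel * sizeof(float));
  cudaMemcpy(d_rows, h_rows.data(), n_rows * sizeof(int64_t),
             cudaMemcpyHostToDevice);

  dim3 threads(block_size, 1);
  dim3 grid(n_rows, 1);  // rows mapped onto grid.x, matching blockIdx.x above
  AddRowsKernel<float, block_size>
      <<<grid, threads>>>(d_in, d_rows, d_out, row_numel);
  cudaDeviceSynchronize();
  printf("launch status: %s\n", cudaGetErrorString(cudaGetLastError()));

  cudaFree(d_in);
  cudaFree(d_out);
  cudaFree(d_rows);
  return 0;
}

One practical property of this layout: gridDim.x accepts up to 2^31 - 1 blocks on compute capability 3.0+ devices, while gridDim.y is capped at 65535, so mapping rows onto grid.x keeps the launch valid even for very large numbers of selected rows.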