@@ -178,6 +178,19 @@ __global__ void ReduceValidKernel(int* data, int valid_items) {
178
178
data[threadid] = output;
179
179
}
180
180
181
// Warp-level partial sum over `data`, performed in place.
//
// Every thread loads one element of `data`, feeds it to
// cub::WarpReduce<int>::Sum together with `valid_items`, and stores whatever
// that call returns back into the same slot.
//
// Parameters:
//   data        - device buffer, read and overwritten at the calling thread's
//                 flattened index. No bounds check is performed, so the buffer
//                 must cover every thread that is launched.
//   valid_items - forwarded unchanged as the second argument of Sum().
//                 Presumably the number of leading lanes of the warp whose
//                 inputs take part in the sum -- confirm against the CUB docs.
//
// Launch layout: the thread index within the block is flattened over x, y and
// z; only blockIdx.x contributes to the block offset.
//
// NOTE(review): a single TempStorage instance is declared for the whole block.
// CUB's usage examples allocate one instance per warp; confirm that sharing it
// is acceptable for the block sizes this kernel is launched with.
// NOTE(review): CUB documents the reduction result as meaningful in lane 0
// only -- TODO confirm; every lane stores its return value here regardless.
__global__ void SumValidKernel(int* data, int valid_items) {
  using WarpReducer = cub::WarpReduce<int>;

  // Scratch space handed to the WarpReduce constructor.
  __shared__ WarpReducer::TempStorage warp_scratch;

  // Position of this thread inside its (up to 3-D) block.
  const unsigned int local_linear = threadIdx.x
                                  + threadIdx.y * blockDim.x
                                  + threadIdx.z * blockDim.x * blockDim.y;
  // Number of threads in one block, used as the stride between blocks.
  const unsigned int block_threads = blockDim.x * blockDim.y * blockDim.z;

  const int slot = local_linear + blockIdx.x * block_threads;

  const int contribution = data[slot];
  const int reduced = WarpReducer(warp_scratch).Sum(contribution, valid_items);
  data[slot] = reduced;
}
193
+
181
194
__global__ void ThreadLoadKernel (int * data) {
182
195
int threadid = threadIdx .x + threadIdx .y * blockDim .x + threadIdx .z * blockDim .x * blockDim .y + blockIdx .x * blockDim .x * blockDim .y * blockDim .z ;
183
196
@@ -479,6 +492,18 @@ int main() {
479
492
print_data (dev_data, 1 );
480
493
}
481
494
495
+ init_data (dev_data, DATA_NUM);
496
+ SumValidKernel<<<GridSize, BlockSize>>> (dev_data, valid_items);
497
+ cudaDeviceSynchronize ();
498
+ if (!verify_data (dev_data, expect_valid10, 1 , 1 )) {
499
+ std::cout << " ReduceValidKernel" << " verify failed" << std::endl;
500
+ Result = false ;
501
+ std::cout << " expect:" << std::endl;
502
+ print_data (expect10, 1 );
503
+ std::cout << " current result:" << std::endl;
504
+ print_data (dev_data, 1 );
505
+ }
506
+
482
507
GridSize = {2 };
483
508
BlockSize = {16 , 8 , 1 };
484
509
int expect11[DATA_NUM] = {
0 commit comments