#include "common.cuh"
2+
namespace airbender::ops::cub::device_reduce {

// Emits one host-callable wrapper `ab_reduce_<op>_<arg_t>` around
// cub::DeviceReduce::Reduce: reduces `num_items` elements of `d_in` into
// `d_out[0]` using functor `op<arg_t>` with identity `op<arg_t>::init()`.
// Standard CUB two-phase protocol: call with d_temp_storage == nullptr to
// query the required temp_storage_bytes, then call again with the buffer.
// Runs asynchronously on `stream`; returns the CUB/CUDA error code.
#define REDUCE(op, arg_t)                                                                                                                             \
  EXTERN cudaError_t ab_reduce_##op##_##arg_t(void *d_temp_storage, size_t &temp_storage_bytes, const arg_t *d_in, arg_t *d_out, const int num_items, \
                                              const cudaStream_t stream) {                                                                            \
    return DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, op<arg_t>(), op<arg_t>::init(), stream);                  \
  }

REDUCE(add, bf);
REDUCE(add, e2);
REDUCE(add, e4);
REDUCE(add, e6);
REDUCE(mul, bf);
REDUCE(mul, e2);
REDUCE(mul, e4);
REDUCE(mul, e6);

// Minimal random-access "iterator" yielding `offset + idx * stride`, used as
// the begin/end offset sequences for DeviceSegmentedReduce below: segment i
// spans element indices [i * stride, i * stride + num_items).
// NOTE(review): CUB >= 2.3 inspects iterator traits, hence the version-gated
// typedefs; older CUB only requires operator[].
struct offset_iterator {
#if CUB_VERSION >= 200300
  using iterator_category = cuda::std::random_access_iterator_tag;
  using value_type = int;
  using difference_type = int;
  using pointer = int *;
  using reference = int &;
#endif
  const int offset;
  const int stride;
  DEVICE_FORCEINLINE int operator[](const int idx) const { return offset + idx * stride; }
};

// Emits `ab_segmented_reduce_<op>_<arg_t>` around DeviceSegmentedReduce::Reduce:
// reduces each of `num_segments` segments of `d_in` into d_out[segment].
// Segment i covers `num_items` consecutive elements starting at i * d_in.stride
// (presumably one column of the matrix view — confirm matrix_accessor layout).
// Same CUB two-phase temp-storage protocol as REDUCE above.
#define SEGMENTED_REDUCE(op, arg_t)                                                                                                                   \
  EXTERN cudaError_t ab_segmented_reduce_##op##_##arg_t(void *d_temp_storage, size_t &temp_storage_bytes, const matrix_accessor<arg_t> d_in,          \
                                                        arg_t *d_out, const int num_segments, const int num_items, const cudaStream_t stream) {       \
    const int stride = static_cast<int>(d_in.stride);                                                                                                 \
    const offset_iterator d_begin_offsets{0, stride};                                                                                                 \
    const offset_iterator d_end_offsets{num_items, stride};                                                                                           \
    return DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in.ptr, d_out, num_segments, d_begin_offsets, d_end_offsets,           \
                                         op<arg_t>(), op<arg_t>::init(), stream);                                                                     \
  }

SEGMENTED_REDUCE(add, bf);
SEGMENTED_REDUCE(add, e2);
SEGMENTED_REDUCE(add, e4);
SEGMENTED_REDUCE(add, e6);
SEGMENTED_REDUCE(mul, bf);
SEGMENTED_REDUCE(mul, e2);
SEGMENTED_REDUCE(mul, e4);
SEGMENTED_REDUCE(mul, e6);

} // namespace airbender::ops::cub::device_reduce