@@ -31,6 +31,35 @@ function mapreduce_cuda(
31
31
weighted_jacobian = OnesArray (parent (data)),
32
32
opargs... ,
33
33
)
34
+ # This function implements the following parallel reduction algorithm:
35
+ #
36
+ # Each thread in each blocks processes multiple data points at the same time
37
+ # (n_ops_on_load) each and we perform a block-wise reduction, with each
38
+ # block writing to an array of (block-)shared memory. This array has the
39
+ # same size as the block, ie, it is as long as many threads are available.
40
+ # Processing multiple points means that we apply the reduction to the point
41
+ # with index reduction[thread_index] = f(thread_index, thread_index +
42
+ # OFFSET), with various OFFSETS that depend on `n_ops_on_load` and block
43
+ # size.
44
+ #
45
+ # For the purpose of indexing, this is equivalent to having larger blocks
46
+ # with size effective_blksize = blksize * (n_ops_on_load + 1).
47
+ #
48
+ #
49
+ # After this operation, we have reduced all the data by a factor of
50
+ # 1/n_ops_on_load and have results in various arrays `reduction` (one per
51
+ # block)
52
+ #
53
+ # Once we have all the blocks reduced, we perform a tree reduction within
54
+ # the block and "move" the reduced value to the first element of the array.
55
+ # In this, one of the things to watch out for is that the last block might
56
+ # not necessarily have all threads doing work, so we have to be careful to
57
+ # not include data in `reduction` that did not have corresponding work.
58
+ # Threads of index 1 will write that array into an output array.
59
+ #
60
+ # The output array has size nblocks, so we do another round of reduction,
61
+ # but this time we put each Field in a different block.
62
+
34
63
S = eltype (data)
35
64
pdata = parent (data)
36
65
T = eltype (pdata)
@@ -112,7 +141,13 @@ function mapreduce_cuda_kernel!(
112
141
end
113
142
end
114
143
sync_threads ()
115
- _cuda_intrablock_reduce! (op, reduction, tidx, blksize)
144
+
145
+ # The last block might not have enough threads to fill `reduction`, so some
146
+ # of its elements might still have the value at initialization.
147
+ blksize_for_reduction =
148
+ min (blksize, nitems - effective_blksize * (bidx - 1 ))
149
+
150
+ _cuda_intrablock_reduce! (op, reduction, tidx, blksize_for_reduction)
116
151
117
152
tidx == 1 && (reduce_cuda[bidx, fidx] = reduction[1 ])
118
153
return nothing
0 commit comments