|
126 | 126 |
|
127 | 127 | # Write this block's final prefix to global array and set flag to "block prefix computed" |
128 | 128 | if bi == 0x2 * block_size - 0x1 |
129 | | - prefixes[iblock + 0x1] = temp[bi + bank_offset_b + 0x1] |
130 | | - flags[iblock + 0x1] = ACC_FLAG_P |
| 129 | + |
| 130 | + # Known at compile-time; used in the first pass of the ScanPrefixes algorithm |
| 131 | + if !isnothing(prefixes) |
| 132 | + prefixes[iblock + 0x1] = temp[bi + bank_offset_b + 0x1] |
| 133 | + end |
| 134 | + |
| 135 | + # Known at compile-time; used only in the DecoupledLookback algorithm |
| 136 | + if !isnothing(flags) |
| 137 | + flags[iblock + 0x1] = ACC_FLAG_P |
| 138 | + end |
131 | 139 | end |
132 | 140 |
|
133 | 141 | if block_offset + ai < len |
|
192 | 200 | end |
193 | 201 |
|
194 | 202 |
|
@kernel cpu=false inbounds=true function _accumulate_previous_coupled_preblocks!(op, v, prefixes)

    # Second pass of the ScanPrefixes algorithm (no decoupled lookback): combine the aggregate of
    # everything preceding this block into each of the block's elements.
    #
    # Arguments:
    # - op: binary operator; operands are always combined as op(earlier, later).
    # - v: array being accumulated, updated in-place; each block covers 2 * block_size elements.
    # - prefixes: per-block prefixes, pre-accumulated in chunks of 2 * block_size entries (see the
    #   comment on chunked pre-accumulation below).
    len = length(v)
    block_size = @groupsize()[1]

    # NOTE: for many index calculations in this library, computation using zero-indexing leads to
    # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero
    # indexing). Internal calculations will be done using zero indexing except when actually
    # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive.

    # Group (block) and local (thread) indices. The first block of v has nothing before it, so it
    # is skipped: the first workgroup handles (zero-indexed) block 1, hence the "- 0x1 + 0x1"
    iblock = @index(Group, Linear) - 0x1 + 0x1
    ithread = @index(Local, Linear) - 0x1
    block_offset = iblock * block_size * 0x2        # Processing two elements per thread

    # Prefix of the previous block (zero-indexed iblock - 1, converted to one-indexing)
    running_prefix = prefixes[iblock - 0x1 + 0x1]

    # The prefixes were pre-accumulated, which means (for block_size=N):
    # - If the prefixes fit in a single chunk of 2N entries, they were fully accumulated and we can
    #   use them directly.
    # - Otherwise each chunk of 2N prefixes was accumulated on its own, but not along the chunks.
    #   The last entry of every previous chunk holds that chunk's total, which we still need to
    #   combine into running_prefix.
    num_preblocks = (iblock - 0x1) ÷ (block_size * 0x2)
    if num_preblocks > 0x0
        # Fold the previous chunks' totals oldest-first, then place the result to the LEFT of this
        # chunk's running prefix. This keeps the operand order op(earlier, later), so the result is
        # also correct for associative operators that are not commutative; for commutative
        # operators the value is the same (up to floating-point re-association).
        # NOTE(review): whether the first-pass kernel also preserves operand order is not visible
        # from this kernel - confirm before advertising non-commutative operator support.
        preblocks_prefix = prefixes[block_size * 0x2]
        for i in 0x2:num_preblocks
            preblocks_prefix = op(preblocks_prefix, prefixes[i * block_size * 0x2])
        end
        running_prefix = op(preblocks_prefix, running_prefix)
    end

    # Now we have the aggregate prefix of all previous blocks; apply it to both elements handled by
    # this thread, guarding against the (possibly partial) last block
    ai = ithread
    if block_offset + ai < len
        v[block_offset + ai + 0x1] = op(running_prefix, v[block_offset + ai + 0x1])
    end

    bi = ithread + block_size
    if block_offset + bi < len
        v[block_offset + bi + 0x1] = op(running_prefix, v[block_offset + bi + 0x1])
    end
end
| 244 | + |
| 245 | + |
| 246 | +# DecoupledLookback algorithm |
195 | 247 | function accumulate_1d!( |
196 | | - op, v::AbstractArray, backend::GPU; |
| 248 | + op, v::AbstractArray, backend::GPU, ::DecoupledLookback; |
197 | 249 | init, |
198 | 250 | inclusive::Bool=true, |
199 | 251 |
|
@@ -242,3 +294,56 @@ function accumulate_1d!( |
242 | 294 |
|
243 | 295 | return v |
244 | 296 | end |
| 297 | + |
| 298 | + |
| 299 | +# ScanPrefixes algorithm |
function accumulate_1d!(
    op, v::AbstractArray, backend::GPU, ::ScanPrefixes;
    init,
    inclusive::Bool=true,

    block_size::Int=256,
    temp::Union{Nothing, AbstractArray}=nothing,
    temp_flags::Union{Nothing, AbstractArray}=nothing,
)
    # In-place 1D accumulation of `v` on a GPU backend using the ScanPrefixes algorithm:
    #   1. every block of 2 * block_size elements is accumulated on its own, and its final prefix
    #      is written to `prefixes`;
    #   2. the per-block prefixes are themselves accumulated with the same block kernel;
    #   3. every block except the first combines the prefix of its predecessors into its elements.
    # Steps 2 and 3 are skipped when a single block covers all of `v`.
    #
    # `temp` may supply the prefixes buffer (same eltype as `v`, at least one entry per block); it
    # is allocated here otherwise. `temp_flags` is accepted but not used by this method.
    # Returns `v`.

    # Validate launch configuration
    @argcheck block_size > 0
    @argcheck ispow2(block_size)

    # Empty input: nothing to do
    isempty(v) && return v

    # Two elements are handled by each thread
    block_elems = 2 * block_size
    num_blocks = cld(length(v), block_elems)

    # Use the caller's buffer for block prefixes when given, otherwise allocate one
    if temp !== nothing
        @argcheck eltype(temp) === eltype(v)
        @argcheck length(temp) >= num_blocks
        prefixes = temp
    else
        prefixes = similar(v, eltype(v), num_blocks)
    end

    # First pass: per-block accumulation, saving each block's final prefix (no flags needed)
    scan_blocks! = _accumulate_block!(backend, block_size)
    scan_blocks!(op, v, init, inclusive, nothing, prefixes; ndrange=block_size * num_blocks)

    # A single block is already fully accumulated
    num_blocks > 1 || return v

    # Accumulate the block prefixes themselves (always inclusive, nothing further to save)
    num_prefix_blocks = cld(length(prefixes), block_elems)
    scan_blocks!(op, prefixes, init, true, nothing, nothing; ndrange=block_size * num_prefix_blocks)

    # The prefixes are now accumulated completely if they fit in one block, or per chunk otherwise;
    # the coupled kernel accounts for the latter. The first block of v needs no correction, hence
    # one fewer block in the launch.
    add_prefixes! = _accumulate_previous_coupled_preblocks!(backend, block_size)
    add_prefixes!(op, v, prefixes; ndrange=block_size * (num_blocks - 1))

    return v
end
0 commit comments