Skip to content

Commit 8630a8b

Browse files
efaulhaberLasNikas
andauthored
Avoid bounds checking where it is safe to do so (#74)
* Make internals inbounds * Add inbounds to extract_svector * Remove bounds check for particle coords * Avoid one additional boundscheck * Update expected speedup for inbounds * Fix typo * Reformat code --------- Co-authored-by: Niklas Neher <[email protected]>
1 parent a44f7b3 commit 8630a8b

File tree

6 files changed

+64
-17
lines changed

6 files changed

+64
-17
lines changed

src/PointNeighbors.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ using Reexport: @reexport
44

55
using Adapt: Adapt
66
using Atomix: Atomix
7+
using Base: @propagate_inbounds
78
using GPUArraysCore: AbstractGPUArray
89
using KernelAbstractions: KernelAbstractions, @kernel, @index
910
using LinearAlgebra: dot

src/cell_lists/full_grid.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -156,15 +156,15 @@ function each_cell_index(cell_list::FullGridCellList{Nothing})
156156
error("`search_radius` is not defined for this cell list")
157157
end
158158

159-
@inline function cell_index(cell_list::FullGridCellList, cell::Tuple)
159+
@propagate_inbounds function cell_index(cell_list::FullGridCellList, cell::Tuple)
160160
(; linear_indices) = cell_list
161161

162162
return linear_indices[cell...]
163163
end
164164

165165
@inline cell_index(::FullGridCellList, cell::Integer) = cell
166166

167-
@inline function Base.getindex(cell_list::FullGridCellList, cell)
167+
@propagate_inbounds function Base.getindex(cell_list::FullGridCellList, cell)
168168
(; cells) = cell_list
169169

170170
return cells[cell_index(cell_list, cell)]

src/neighborhood_search.jl

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -164,8 +164,13 @@ end
164164

165165
@inline function foreach_point_neighbor(f, system_coords, neighbor_coords,
166166
neighborhood_search, points, parallel::Val{true})
167+
# Explicit bounds check before the hot loop (or GPU kernel)
168+
@boundscheck checkbounds(system_coords, ndims(neighborhood_search))
169+
167170
@threaded system_coords for point in points
168-
foreach_neighbor(f, system_coords, neighbor_coords, neighborhood_search, point)
171+
# Now we can assume that `point` is inbounds
172+
@inbounds foreach_neighbor(f, system_coords, neighbor_coords,
173+
neighborhood_search, point)
169174
end
170175

171176
return nothing
@@ -175,17 +180,27 @@ end
175180
@inline function foreach_point_neighbor(f, system_coords, neighbor_coords,
176181
neighborhood_search, points,
177182
backend::ParallelizationBackend)
183+
# Explicit bounds check before the hot loop (or GPU kernel)
184+
@boundscheck checkbounds(system_coords, ndims(neighborhood_search))
185+
178186
@threaded backend for point in points
179-
foreach_neighbor(f, system_coords, neighbor_coords, neighborhood_search, point)
187+
# Now we can assume that `point` is inbounds
188+
@inbounds foreach_neighbor(f, system_coords, neighbor_coords,
189+
neighborhood_search, point)
180190
end
181191

182192
return nothing
183193
end
184194

185195
@inline function foreach_point_neighbor(f, system_coords, neighbor_coords,
186196
neighborhood_search, points, parallel::Val{false})
197+
# Explicit bounds check before the hot loop
198+
@boundscheck checkbounds(system_coords, ndims(neighborhood_search))
199+
187200
for point in points
188-
foreach_neighbor(f, system_coords, neighbor_coords, neighborhood_search, point)
201+
# Now we can assume that `point` is inbounds
202+
@inbounds foreach_neighbor(f, system_coords, neighbor_coords,
203+
neighborhood_search, point)
189204
end
190205

191206
return nothing

src/nhs_grid.jl

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -343,18 +343,37 @@ function update_grid!(neighborhood_search::GridNeighborhoodSearch{<:Any, Paralle
343343
return neighborhood_search
344344
end
345345

346-
@inline function foreach_neighbor(f, system_coords, neighbor_system_coords,
347-
neighborhood_search::GridNeighborhoodSearch, point;
348-
search_radius = search_radius(neighborhood_search))
346+
@propagate_inbounds function foreach_neighbor(f, system_coords, neighbor_system_coords,
347+
neighborhood_search::GridNeighborhoodSearch,
348+
point;
349+
search_radius = search_radius(neighborhood_search))
350+
# Due to https://github.com/JuliaLang/julia/issues/30411, we cannot just remove
351+
# a `@boundscheck` by calling this function with `@inbounds` because it has a kwarg.
352+
# We have to use `@propagate_inbounds`, which will also remove boundschecks
353+
# in the neighbor loop, which is not safe (see comment below).
354+
# To avoid this, we have to use a function barrier to disable the `@inbounds` again.
355+
point_coords = extract_svector(system_coords, Val(ndims(neighborhood_search)), point)
356+
357+
__foreach_neighbor(f, system_coords, neighbor_system_coords, neighborhood_search,
358+
point, point_coords, search_radius)
359+
end
360+
361+
@inline function __foreach_neighbor(f, system_coords, neighbor_system_coords,
362+
neighborhood_search::GridNeighborhoodSearch,
363+
point, point_coords, search_radius)
349364
(; periodic_box) = neighborhood_search
350365

351-
point_coords = extract_svector(system_coords, Val(ndims(neighborhood_search)), point)
352366
cell = cell_coords(point_coords, neighborhood_search)
353367

354368
for neighbor_cell_ in neighboring_cells(cell, neighborhood_search)
355369
neighbor_cell = Tuple(neighbor_cell_)
370+
neighbors = points_in_cell(neighbor_cell, neighborhood_search)
371+
372+
for neighbor_ in eachindex(neighbors)
373+
neighbor = @inbounds neighbors[neighbor_]
356374

357-
for neighbor in points_in_cell(neighbor_cell, neighborhood_search)
375+
# Making the following `@inbounds` yields a ~2% speedup on an NVIDIA H100.
376+
# But we don't know if `neighbor` (extracted from the cell list) is in bounds.
358377
neighbor_coords = extract_svector(neighbor_system_coords,
359378
Val(ndims(neighborhood_search)), neighbor)
360379

@@ -392,7 +411,7 @@ end
392411
for cell in neighboring_cells(cell, neighborhood_search))
393412
end
394413

395-
@inline function points_in_cell(cell_index, neighborhood_search)
414+
@propagate_inbounds function points_in_cell(cell_index, neighborhood_search)
396415
(; cell_list) = neighborhood_search
397416

398417
return cell_list[periodic_cell_index(cell_index, neighborhood_search)]

src/util.jl

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
# Return the `i`-th column of the array `A` as an `SVector`.
22
@inline function extract_svector(A, ::Val{NDIMS}, i) where {NDIMS}
3-
return SVector(ntuple(@inline(dim->A[dim, i]), NDIMS))
3+
# Explicit bounds check, which can be removed by calling this function with `@inbounds`
4+
@boundscheck checkbounds(A, NDIMS, i)
5+
6+
# Assume inbounds access now
7+
return SVector(ntuple(@inline(dim->@inbounds A[dim, i]), NDIMS))
48
end
59

610
# When particles end up with coordinates so big that the cell coordinates
@@ -13,13 +17,20 @@ end
1317
# If we threw an error here, we would prevent the time integration method from
1418
# retrying with a smaller time step, and we would thus crash perfectly fine simulations.
1519
@inline function floor_to_int(i)
16-
if isnan(i) || i > typemax(Int)
20+
# `Base.floor(Int, i)` is defined as `trunc(Int, round(x, RoundDown))`
21+
rounded = round(i, RoundDown)
22+
23+
# `Base.trunc(Int, x)` throws an `InexactError` in these cases, and otherwise
24+
# returns `unsafe_trunc(Int, rounded)`.
25+
if isnan(rounded) || rounded >= typemax(Int)
1726
return typemax(Int)
18-
elseif i < typemin(Int)
27+
elseif rounded <= typemin(Int)
1928
return typemin(Int)
2029
end
2130

22-
return floor(Int, i)
31+
# After making sure that `rounded` is in the range of `Int`,
32+
# we can safely call `unsafe_trunc`.
33+
return unsafe_trunc(Int, rounded)
2334
end
2435

2536
abstract type AbstractThreadingBackend end
@@ -134,7 +145,7 @@ end
134145
# Call the generic kernel that is defined below, which only calls a function with
135146
# the global GPU index.
136147
generic_kernel(backend)(ndrange = ndrange) do i
137-
@inline f(iterator[indices[i]])
148+
@inbounds @inline f(iterator[indices[i]])
138149
end
139150

140151
KernelAbstractions.synchronize(backend)

src/vector_of_vectors.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,10 @@ end
2626
@inline function Base.getindex(vov::DynamicVectorOfVectors, i)
2727
(; backend, lengths) = vov
2828

29+
# This is slightly faster than without explicit boundscheck and `@inbounds` below
2930
@boundscheck checkbounds(vov, i)
3031

31-
return view(backend, 1:lengths[i], i)
32+
return @inbounds view(backend, 1:lengths[i], i)
3233
end
3334

3435
@inline function Base.push!(vov::DynamicVectorOfVectors, vector::AbstractVector)

0 commit comments

Comments
 (0)