Skip to content
Merged
1 change: 1 addition & 0 deletions src/PointNeighbors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ using Reexport: @reexport

using Adapt: Adapt
using Atomix: Atomix
using Base: @propagate_inbounds
using GPUArraysCore: AbstractGPUArray
using KernelAbstractions: KernelAbstractions, @kernel, @index
using LinearAlgebra: dot
Expand Down
4 changes: 2 additions & 2 deletions src/cell_lists/full_grid.jl
Original file line number Diff line number Diff line change
Expand Up @@ -156,15 +156,15 @@ function each_cell_index(cell_list::FullGridCellList{Nothing})
error("`search_radius` is not defined for this cell list")
end

@inline function cell_index(cell_list::FullGridCellList, cell::Tuple)
@propagate_inbounds function cell_index(cell_list::FullGridCellList, cell::Tuple)
(; linear_indices) = cell_list

return linear_indices[cell...]
end

@inline cell_index(::FullGridCellList, cell::Integer) = cell

@inline function Base.getindex(cell_list::FullGridCellList, cell)
@propagate_inbounds function Base.getindex(cell_list::FullGridCellList, cell)
(; cells) = cell_list

return cells[cell_index(cell_list, cell)]
Expand Down
21 changes: 18 additions & 3 deletions src/neighborhood_search.jl
Original file line number Diff line number Diff line change
Expand Up @@ -164,8 +164,13 @@ end

@inline function foreach_point_neighbor(f, system_coords, neighbor_coords,
neighborhood_search, points, parallel::Val{true})
# Explicit bounds check before the hot loop (or GPU kernel)
@boundscheck checkbounds(system_coords, ndims(neighborhood_search))

@threaded system_coords for point in points
foreach_neighbor(f, system_coords, neighbor_coords, neighborhood_search, point)
# Now we can assume that `point` is inbounds
@inbounds foreach_neighbor(f, system_coords, neighbor_coords,
neighborhood_search, point)
end

return nothing
Expand All @@ -175,17 +180,27 @@ end
@inline function foreach_point_neighbor(f, system_coords, neighbor_coords,
neighborhood_search, points,
backend::ParallelizationBackend)
# Explicit bounds check before the hot loop (or GPU kernel)
@boundscheck checkbounds(system_coords, ndims(neighborhood_search))

@threaded backend for point in points
foreach_neighbor(f, system_coords, neighbor_coords, neighborhood_search, point)
# Now we can assume that `point` is inbounds
@inbounds foreach_neighbor(f, system_coords, neighbor_coords,
neighborhood_search, point)
end

return nothing
end

@inline function foreach_point_neighbor(f, system_coords, neighbor_coords,
neighborhood_search, points, parallel::Val{false})
# Explicit bounds check before the hot loop
@boundscheck checkbounds(system_coords, ndims(neighborhood_search))

for point in points
foreach_neighbor(f, system_coords, neighbor_coords, neighborhood_search, point)
# Now we can assume that `point` is inbounds
@inbounds foreach_neighbor(f, system_coords, neighbor_coords,
neighborhood_search, point)
end

return nothing
Expand Down
31 changes: 25 additions & 6 deletions src/nhs_grid.jl
Original file line number Diff line number Diff line change
Expand Up @@ -343,18 +343,37 @@ function update_grid!(neighborhood_search::GridNeighborhoodSearch{<:Any, Paralle
return neighborhood_search
end

@inline function foreach_neighbor(f, system_coords, neighbor_system_coords,
neighborhood_search::GridNeighborhoodSearch, point;
search_radius = search_radius(neighborhood_search))
@propagate_inbounds function foreach_neighbor(f, system_coords, neighbor_system_coords,
neighborhood_search::GridNeighborhoodSearch,
point;
search_radius = search_radius(neighborhood_search))
# Due to https://github.com/JuliaLang/julia/issues/30411, we cannot just remove
# a `@boundscheck` by calling this function with `@inbounds` because it has a kwarg.
# We have to use `@propagate_inbounds`, which will also remove boundschecks
# in the neighbor loop, which is not safe (see comment below).
# To avoid this, we have to use a function barrier to disable the `@inbounds` again.
point_coords = extract_svector(system_coords, Val(ndims(neighborhood_search)), point)

__foreach_neighbor(f, system_coords, neighbor_system_coords, neighborhood_search,
point, point_coords, search_radius)
end

@inline function __foreach_neighbor(f, system_coords, neighbor_system_coords,
neighborhood_search::GridNeighborhoodSearch,
point, point_coords, search_radius)
(; periodic_box) = neighborhood_search

point_coords = extract_svector(system_coords, Val(ndims(neighborhood_search)), point)
cell = cell_coords(point_coords, neighborhood_search)

for neighbor_cell_ in neighboring_cells(cell, neighborhood_search)
neighbor_cell = Tuple(neighbor_cell_)
neighbors = points_in_cell(neighbor_cell, neighborhood_search)

for neighbor_ in eachindex(neighbors)
neighbor = @inbounds neighbors[neighbor_]

for neighbor in points_in_cell(neighbor_cell, neighborhood_search)
# Making the following `@inbounds` yields a ~2% speedup on an NVIDIA H100.
# But we don't know if `neighbor` (extracted from the cell list) is in bounds.
neighbor_coords = extract_svector(neighbor_system_coords,
Val(ndims(neighborhood_search)), neighbor)

Expand Down Expand Up @@ -392,7 +411,7 @@ end
for cell in neighboring_cells(cell, neighborhood_search))
end

@inline function points_in_cell(cell_index, neighborhood_search)
@propagate_inbounds function points_in_cell(cell_index, neighborhood_search)
(; cell_list) = neighborhood_search

return cell_list[periodic_cell_index(cell_index, neighborhood_search)]
Expand Down
21 changes: 16 additions & 5 deletions src/util.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
# Return the `i`-th column of the array `A` as an `SVector`.
@inline function extract_svector(A, ::Val{NDIMS}, i) where {NDIMS}
return SVector(ntuple(@inline(dim->A[dim, i]), NDIMS))
# Explicit bounds check, which can be removed by calling this function with `@inbounds`
@boundscheck checkbounds(A, NDIMS, i)

# Assume inbounds access now
return SVector(ntuple(@inline(dim->@inbounds A[dim, i]), NDIMS))
end

# When particles end up with coordinates so big that the cell coordinates
Expand All @@ -13,13 +17,20 @@ end
# If we threw an error here, we would prevent the time integration method from
# retrying with a smaller time step, and we would thus crash perfectly fine simulations.
@inline function floor_to_int(i)
if isnan(i) || i > typemax(Int)
# `Base.floor(Int, i)` is defined as `trunc(Int, round(x, RoundDown))`
rounded = round(i, RoundDown)

# `Base.trunc(Int, x)` throws an `InexactError` in these cases, and otherwise
# returns `unsafe_trunc(Int, rounded)`.
if isnan(rounded) || rounded >= typemax(Int)
return typemax(Int)
elseif i < typemin(Int)
elseif rounded <= typemin(Int)
return typemin(Int)
end

return floor(Int, i)
# After making sure that `rounded` is in the range of `Int`,
# we can safely call `unsafe_trunc`.
return unsafe_trunc(Int, rounded)
end

abstract type AbstractThreadingBackend end
Expand Down Expand Up @@ -134,7 +145,7 @@ end
# Call the generic kernel that is defined below, which only calls a function with
# the global GPU index.
generic_kernel(backend)(ndrange = ndrange) do i
@inline f(iterator[indices[i]])
@inbounds @inline f(iterator[indices[i]])
end

KernelAbstractions.synchronize(backend)
Expand Down
3 changes: 2 additions & 1 deletion src/vector_of_vectors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,10 @@ end
@inline function Base.getindex(vov::DynamicVectorOfVectors, i)
(; backend, lengths) = vov

# This is slightly faster than without explicit boundscheck and `@inbounds` below
@boundscheck checkbounds(vov, i)

return view(backend, 1:lengths[i], i)
return @inbounds view(backend, 1:lengths[i], i)
end

@inline function Base.push!(vov::DynamicVectorOfVectors, vector::AbstractVector)
Expand Down