diff --git a/NEWS.md b/NEWS.md
index b87a369b042..cb012f8f6ea 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -115,7 +115,7 @@ for human readability.
 
 #### Changed
 
-- The numerical solution is wrapped in a `VectorOfArrays` from
+- The numerical solution is wrapped in a `VectorOfArray` from
   [RecursiveArrayTools.jl](https://github.com/SciML/RecursiveArrayTools.jl)
   for `DGMulti` solvers ([#2150]). You can use `Base.parent` to unwrap
   the original data.
diff --git a/Project.toml b/Project.toml
index 27f5bbca28e..f62f0d0c28c 100644
--- a/Project.toml
+++ b/Project.toml
@@ -99,7 +99,7 @@ PrecompileTools = "1.2"
 Preferences = "1.4"
 Printf = "1"
 RecipesBase = "1.3.4"
-RecursiveArrayTools = "3.31.1"
+RecursiveArrayTools = "3.34.1"
 Reexport = "1.2"
 Requires = "1.3"
 SciMLBase = "2.67.0"
diff --git a/src/Trixi.jl b/src/Trixi.jl
index 28d32535be7..0671d5aa8d9 100644
--- a/src/Trixi.jl
+++ b/src/Trixi.jl
@@ -134,7 +134,6 @@ include("basic_types.jl")
 
 # Include all top-level source files
 include("auxiliary/auxiliary.jl")
-include("auxiliary/vector_of_arrays.jl")
 include("auxiliary/mpi.jl")
 include("auxiliary/p4est.jl")
 include("auxiliary/t8code.jl")
diff --git a/src/auxiliary/vector_of_arrays.jl b/src/auxiliary/vector_of_arrays.jl
deleted file mode 100644
index 0fa8dd7f1ec..00000000000
--- a/src/auxiliary/vector_of_arrays.jl
+++ /dev/null
@@ -1,31 +0,0 @@
-# By default, Julia/LLVM does not use fused multiply-add operations (FMAs).
-# Since these FMAs can increase the performance of many numerical algorithms,
-# we need to opt-in explicitly.
-# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details.
-@muladd begin
-#! format: noindent
-
-# Wraps a Vector of Arrays, forwards `getindex` to the underlying Vector.
-# Implements `Adapt.adapt_structure` to allow offloading to the GPU which is
-# not possible for a plain Vector of Arrays.
-struct VecOfArrays{T <: AbstractArray}
-    arrays::Vector{T}
-end
-Base.getindex(v::VecOfArrays, i::Int) = Base.getindex(v.arrays, i)
-Base.IndexStyle(v::VecOfArrays) = Base.IndexStyle(v.arrays)
-Base.size(v::VecOfArrays) = Base.size(v.arrays)
-Base.length(v::VecOfArrays) = Base.length(v.arrays)
-Base.eltype(v::VecOfArrays{T}) where {T} = T
-function Adapt.adapt_structure(to, v::VecOfArrays)
-    return VecOfArrays([Adapt.adapt(to, arr) for arr in v.arrays])
-end
-function Adapt.parent_type(::Type{<:VecOfArrays{T}}) where {T}
-    return T
-end
-function Adapt.unwrap_type(A::Type{<:VecOfArrays})
-    Adapt.unwrap_type(Adapt.parent_type(A))
-end
-function Base.convert(::Type{<:VecOfArrays}, v::Vector{<:AbstractArray})
-    VecOfArrays(v)
-end
-end # @muladd
diff --git a/src/solvers/dgsem_p4est/containers_parallel.jl b/src/solvers/dgsem_p4est/containers_parallel.jl
index 4b6fc813703..bfcd64d9485 100644
--- a/src/solvers/dgsem_p4est/containers_parallel.jl
+++ b/src/solvers/dgsem_p4est/containers_parallel.jl
@@ -225,7 +225,7 @@ function Adapt.adapt_structure(to, mpi_mortars::P4estMPIMortarContainer)
     # TODO: GPU
     # Only parts of this container are adapted, since we currently don't
     # use `local_neighbor_ids`, `local_neighbor_positions`, `normal_directions`
-    # on the GPU. If we do need them we need to redesign this to use the VecOfArrays
+    # on the GPU. If we do need them we need to redesign this to use the VectorOfArray
     # approach.
     _u = adapt(to, mpi_mortars._u)
diff --git a/src/solvers/dgsem_p4est/dg_3d.jl b/src/solvers/dgsem_p4est/dg_3d.jl
index 4c099c9fd3f..9c59549bcb2 100644
--- a/src/solvers/dgsem_p4est/dg_3d.jl
+++ b/src/solvers/dgsem_p4est/dg_3d.jl
@@ -10,21 +10,24 @@ function create_cache(mesh::Union{P4estMesh{3}, T8codeMesh{3}}, equations,
                       mortar_l2::LobattoLegendreMortarL2, uEltype)
     # TODO: Taal compare performance of different types
-    fstar_primary_threaded = [Array{uEltype, 4}(undef, nvariables(equations),
-                                                nnodes(mortar_l2),
-                                                nnodes(mortar_l2), 4)
-                              for _ in 1:Threads.nthreads()] |> VecOfArrays
-    fstar_secondary_threaded = [Array{uEltype, 4}(undef, nvariables(equations),
+    fstar_primary_threaded = VectorOfArray([Array{uEltype, 4}(undef,
+                                                               nvariables(equations),
+                                                               nnodes(mortar_l2),
+                                                               nnodes(mortar_l2), 4)
+                                            for _ in 1:Threads.nthreads()])
+    fstar_secondary_threaded = VectorOfArray([Array{uEltype, 4}(undef,
+                                                                 nvariables(equations),
+                                                                 nnodes(mortar_l2),
+                                                                 nnodes(mortar_l2), 4)
+                                              for _ in 1:Threads.nthreads()])
+    fstar_tmp_threaded = VectorOfArray([Array{uEltype, 3}(undef, nvariables(equations),
+                                                           nnodes(mortar_l2),
+                                                           nnodes(mortar_l2))
+                                        for _ in 1:Threads.nthreads()])
+    u_threaded = VectorOfArray([Array{uEltype, 3}(undef, nvariables(equations),
                                                   nnodes(mortar_l2),
-                                                  nnodes(mortar_l2), 4)
-                                for _ in 1:Threads.nthreads()] |> VecOfArrays
-
-    fstar_tmp_threaded = [Array{uEltype, 3}(undef, nvariables(equations),
-                          nnodes(mortar_l2), nnodes(mortar_l2))
-                          for _ in 1:Threads.nthreads()] |> VecOfArrays
-    u_threaded = [Array{uEltype, 3}(undef, nvariables(equations), nnodes(mortar_l2),
-                  nnodes(mortar_l2))
-                  for _ in 1:Threads.nthreads()] |> VecOfArrays
+                                                   nnodes(mortar_l2))
+                                for _ in 1:Threads.nthreads()])
     (; fstar_primary_threaded, fstar_secondary_threaded, fstar_tmp_threaded,
        u_threaded)
 end
 
@@ -519,9 +522,9 @@ function prolong2mortars!(cache, u,
 
         # Buffer to copy solution values of the large element in the correct orientation
        # before interpolating
-        u_buffer = cache.u_threaded[Threads.threadid()]
+        u_buffer = cache.u_threaded.u[Threads.threadid()]
         # temporary buffer for projections
-        fstar_tmp = fstar_tmp_threaded[Threads.threadid()]
+        fstar_tmp = fstar_tmp_threaded.u[Threads.threadid()]
 
         # Copy solution of large element face to buffer in the
         # correct orientation
@@ -590,9 +593,9 @@ function calc_mortar_flux!(surface_flux_values,
 
     @threaded for mortar in eachmortar(dg, cache)
         # Choose thread-specific pre-allocated container
-        fstar_primary = fstar_primary_threaded[Threads.threadid()]
-        fstar_secondary = fstar_secondary_threaded[Threads.threadid()]
-        fstar_tmp = fstar_tmp_threaded[Threads.threadid()]
+        fstar_primary = fstar_primary_threaded.u[Threads.threadid()]
+        fstar_secondary = fstar_secondary_threaded.u[Threads.threadid()]
+        fstar_tmp = fstar_tmp_threaded.u[Threads.threadid()]
 
         # Get index information on the small elements
         small_indices = node_indices[1, mortar]
@@ -638,7 +641,7 @@ function calc_mortar_flux!(surface_flux_values,
 
         # Buffer to interpolate flux values of the large element to before
         # copying in the correct orientation
-        u_buffer = cache.u_threaded[Threads.threadid()]
+        u_buffer = cache.u_threaded.u[Threads.threadid()]
 
         # in calc_interface_flux!, the interface flux is computed once over each
         # interface using the normal from the "primary" element. The result is then
diff --git a/src/solvers/dgsem_p4est/dg_3d_parabolic.jl b/src/solvers/dgsem_p4est/dg_3d_parabolic.jl
index 1fe22e42fe7..06917ba17e3 100644
--- a/src/solvers/dgsem_p4est/dg_3d_parabolic.jl
+++ b/src/solvers/dgsem_p4est/dg_3d_parabolic.jl
@@ -697,10 +697,10 @@ function prolong2mortars_divergence!(cache, flux_viscous,
 
         # Buffer to copy solution values of the large element in the correct orientation
         # before interpolating
-        u_buffer = cache.u_threaded[Threads.threadid()]
+        u_buffer = cache.u_threaded.u[Threads.threadid()]
 
         # temporary buffer for projections
-        fstar_tmp = fstar_tmp_threaded[Threads.threadid()]
+        fstar_tmp = fstar_tmp_threaded.u[Threads.threadid()]
 
         # Copy solution of large element face to buffer in the
         # correct orientation
@@ -787,8 +787,8 @@ function calc_mortar_flux_divergence!(surface_flux_values,
 
     @threaded for mortar in eachmortar(dg, cache)
         # Choose thread-specific pre-allocated container
-        fstar = fstar_primary_threaded[Threads.threadid()]
-        fstar_tmp = fstar_tmp_threaded[Threads.threadid()]
+        fstar = fstar_primary_threaded.u[Threads.threadid()]
+        fstar_tmp = fstar_tmp_threaded.u[Threads.threadid()]
 
         # Get index information on the small elements
         small_indices = node_indices[1, mortar]
@@ -831,7 +831,7 @@ function calc_mortar_flux_divergence!(surface_flux_values,
 
         # Buffer to interpolate flux values of the large element to before
         # copying in the correct orientation
-        u_buffer = cache.u_threaded[Threads.threadid()]
+        u_buffer = cache.u_threaded.u[Threads.threadid()]
 
         # this reuses the hyperbolic version of `mortar_fluxes_to_elements!`
         mortar_fluxes_to_elements!(surface_flux_values,
diff --git a/src/solvers/dgsem_p4est/dg_3d_parallel.jl b/src/solvers/dgsem_p4est/dg_3d_parallel.jl
index aad54f40afb..07a0a0ba1f6 100644
--- a/src/solvers/dgsem_p4est/dg_3d_parallel.jl
+++ b/src/solvers/dgsem_p4est/dg_3d_parallel.jl
@@ -335,9 +335,9 @@ function prolong2mpimortars!(cache, u,
         if position == 5 # -> large element
             # Buffer to copy solution values of the large element in the correct orientation
             # before interpolating
-            u_buffer = cache.u_threaded[Threads.threadid()]
+            u_buffer = cache.u_threaded.u[Threads.threadid()]
             # temporary buffer for projections
-            fstar_tmp = cache.fstar_tmp_threaded[Threads.threadid()]
+            fstar_tmp = cache.fstar_tmp_threaded.u[Threads.threadid()]
 
             i_large = i_large_start
             j_large = j_large_start
@@ -423,9 +423,9 @@ function calc_mpi_mortar_flux!(surface_flux_values,
 
     @threaded for mortar in eachmpimortar(dg, cache)
         # Choose thread-specific pre-allocated container
-        fstar_primary = fstar_primary_threaded[Threads.threadid()]
-        fstar_secondary = fstar_secondary_threaded[Threads.threadid()]
-        fstar_tmp = fstar_tmp_threaded[Threads.threadid()]
+        fstar_primary = fstar_primary_threaded.u[Threads.threadid()]
+        fstar_secondary = fstar_secondary_threaded.u[Threads.threadid()]
+        fstar_tmp = fstar_tmp_threaded.u[Threads.threadid()]
 
         # Get index information on the small elements
         small_indices = node_indices[1, mortar]
@@ -465,7 +465,7 @@ function calc_mpi_mortar_flux!(surface_flux_values,
 
         # Buffer to interpolate flux values of the large element to before
         # copying in the correct orientation
-        u_buffer = cache.u_threaded[Threads.threadid()]
+        u_buffer = cache.u_threaded.u[Threads.threadid()]
 
         mpi_mortar_fluxes_to_elements!(surface_flux_values,
                                        mesh, equations, mortar_l2, dg, cache,
diff --git a/src/solvers/dgsem_p4est/dg_parallel.jl b/src/solvers/dgsem_p4est/dg_parallel.jl
index 7acddf07b4b..a3a4005ad96 100644
--- a/src/solvers/dgsem_p4est/dg_parallel.jl
+++ b/src/solvers/dgsem_p4est/dg_parallel.jl
@@ -5,13 +5,12 @@
 @muladd begin
 #! format: noindent
 
-mutable struct P4estMPICache{BufferType <: DenseVector,
-                             VecInt <: DenseVector{<:Integer}}
+mutable struct P4estMPICache{uEltype}
     mpi_neighbor_ranks::Vector{Int}
-    mpi_neighbor_interfaces::VecOfArrays{VecInt}
-    mpi_neighbor_mortars::VecOfArrays{VecInt}
-    mpi_send_buffers::VecOfArrays{BufferType}
-    mpi_recv_buffers::VecOfArrays{BufferType}
+    mpi_neighbor_interfaces::VectorOfArray{Int, 2, Vector{Vector{Int}}}
+    mpi_neighbor_mortars::VectorOfArray{Int, 2, Vector{Vector{Int}}}
+    mpi_send_buffers::VectorOfArray{uEltype, 2, Vector{Vector{uEltype}}}
+    mpi_recv_buffers::VectorOfArray{uEltype, 2, Vector{Vector{uEltype}}}
     mpi_send_requests::Vector{MPI.Request}
     mpi_recv_requests::Vector{MPI.Request}
     n_elements_by_rank::OffsetArray{Int, 1, Array{Int, 1}}
@@ -26,23 +25,23 @@ function P4estMPICache(uEltype)
     end
 
     mpi_neighbor_ranks = Vector{Int}(undef, 0)
-    mpi_neighbor_interfaces = Vector{Vector{Int}}(undef, 0) |> VecOfArrays
-    mpi_neighbor_mortars = Vector{Vector{Int}}(undef, 0) |> VecOfArrays
-    mpi_send_buffers = Vector{Vector{uEltype}}(undef, 0) |> VecOfArrays
-    mpi_recv_buffers = Vector{Vector{uEltype}}(undef, 0) |> VecOfArrays
+    mpi_neighbor_interfaces = VectorOfArray(Vector{Vector{Int}}(undef, 0))
+    mpi_neighbor_mortars = VectorOfArray(Vector{Vector{Int}}(undef, 0))
+    mpi_send_buffers = VectorOfArray(Vector{Vector{uEltype}}(undef, 0))
+    mpi_recv_buffers = VectorOfArray(Vector{Vector{uEltype}}(undef, 0))
     mpi_send_requests = Vector{MPI.Request}(undef, 0)
     mpi_recv_requests = Vector{MPI.Request}(undef, 0)
     n_elements_by_rank = OffsetArray(Vector{Int}(undef, 0), 0:-1)
     n_elements_global = 0
     first_element_global_id = 0
 
-    P4estMPICache{Vector{uEltype}, Vector{Int}}(mpi_neighbor_ranks,
-                                                mpi_neighbor_interfaces,
-                                                mpi_neighbor_mortars,
-                                                mpi_send_buffers, mpi_recv_buffers,
-                                                mpi_send_requests, mpi_recv_requests,
-                                                n_elements_by_rank, n_elements_global,
-                                                first_element_global_id)
+    P4estMPICache{uEltype}(mpi_neighbor_ranks,
+                           mpi_neighbor_interfaces,
+                           mpi_neighbor_mortars,
+                           mpi_send_buffers, mpi_recv_buffers,
+                           mpi_send_requests, mpi_recv_requests,
+                           n_elements_by_rank, n_elements_global,
+                           first_element_global_id)
 end
 
 @inline Base.eltype(::P4estMPICache{BufferType}) where {BufferType} = eltype(BufferType)
@@ -65,9 +64,9 @@ function start_mpi_send!(mpi_cache::P4estMPICache, mesh, equations, dg, cache)
     n_small_elements = 2^(ndims(mesh) - 1)
 
     for rank in 1:length(mpi_cache.mpi_neighbor_ranks)
-        send_buffer = mpi_cache.mpi_send_buffers[rank]
+        send_buffer = mpi_cache.mpi_send_buffers.u[rank]
 
-        for (index, interface) in enumerate(mpi_cache.mpi_neighbor_interfaces[rank])
+        for (index, interface) in enumerate(mpi_cache.mpi_neighbor_interfaces.u[rank])
             first = (index - 1) * data_size + 1
             last = (index - 1) * data_size + data_size
             local_side = cache.mpi_interfaces.local_sides[interface]
@@ -77,15 +76,15 @@ function start_mpi_send!(mpi_cache::P4estMPICache, mesh, equations, dg, cache)
 
         # Set send_buffer corresponding to mortar data to NaN and overwrite the parts where local
         # data exists
-        interfaces_data_size = length(mpi_cache.mpi_neighbor_interfaces[rank]) *
+        interfaces_data_size = length(mpi_cache.mpi_neighbor_interfaces.u[rank]) *
                                data_size
-        mortars_data_size = length(mpi_cache.mpi_neighbor_mortars[rank]) *
+        mortars_data_size = length(mpi_cache.mpi_neighbor_mortars.u[rank]) *
                             n_small_elements * 2 * data_size
 
         # `NaN |> eltype(...)` ensures that the NaN's are of the appropriate floating point type
         send_buffer[(interfaces_data_size + 1):(interfaces_data_size + mortars_data_size)] .= NaN |> eltype(mpi_cache)
 
-        for (index, mortar) in enumerate(mpi_cache.mpi_neighbor_mortars[rank])
+        for (index, mortar) in enumerate(mpi_cache.mpi_neighbor_mortars.u[rank])
             index_base = interfaces_data_size +
                          (index - 1) * n_small_elements * 2 * data_size
             indices = buffer_mortar_indices(mesh, index_base, data_size)
@@ -108,7 +107,7 @@ function start_mpi_send!(mpi_cache::P4estMPICache, mesh, equations, dg, cache)
 
     # Start sending
     for (index, rank) in enumerate(mpi_cache.mpi_neighbor_ranks)
-        mpi_cache.mpi_send_requests[index] = MPI.Isend(mpi_cache.mpi_send_buffers[index],
+        mpi_cache.mpi_send_requests[index] = MPI.Isend(mpi_cache.mpi_send_buffers.u[index],
                                                        rank, mpi_rank(), mpi_comm())
     end
 
@@ -117,7 +116,7 @@ end
 
 function start_mpi_receive!(mpi_cache::P4estMPICache)
     for (index, rank) in enumerate(mpi_cache.mpi_neighbor_ranks)
-        mpi_cache.mpi_recv_requests[index] = MPI.Irecv!(mpi_cache.mpi_recv_buffers[index],
+        mpi_cache.mpi_recv_requests[index] = MPI.Irecv!(mpi_cache.mpi_recv_buffers.u[index],
                                                         rank, rank, mpi_comm())
     end
 
@@ -136,9 +135,9 @@ function finish_mpi_receive!(mpi_cache::P4estMPICache, mesh, equations, dg, cach
 
     # Start receiving and unpack received data until all communication is finished
     data = MPI.Waitany(mpi_cache.mpi_recv_requests)
     while data !== nothing
-        recv_buffer = mpi_cache.mpi_recv_buffers[data]
+        recv_buffer = mpi_cache.mpi_recv_buffers.u[data]
 
-        for (index, interface) in enumerate(mpi_cache.mpi_neighbor_interfaces[data])
+        for (index, interface) in enumerate(mpi_cache.mpi_neighbor_interfaces.u[data])
             first = (index - 1) * data_size + 1
             last = (index - 1) * data_size + data_size
@@ -149,9 +148,9 @@ function finish_mpi_receive!(mpi_cache::P4estMPICache, mesh, equations, dg, cach
             end
         end
 
-        interfaces_data_size = length(mpi_cache.mpi_neighbor_interfaces[data]) *
+        interfaces_data_size = length(mpi_cache.mpi_neighbor_interfaces.u[data]) *
                                data_size
-        for (index, mortar) in enumerate(mpi_cache.mpi_neighbor_mortars[data])
+        for (index, mortar) in enumerate(mpi_cache.mpi_neighbor_mortars.u[data])
             index_base = interfaces_data_size +
                          (index - 1) * n_small_elements * 2 * data_size
             indices = buffer_mortar_indices(mesh, index_base, data_size)
@@ -270,16 +269,20 @@ end
 
 function init_mpi_cache!(mpi_cache::P4estMPICache, mesh::ParallelP4estMesh,
                          mpi_interfaces, mpi_mortars, nvars, n_nodes, uEltype)
-    mpi_neighbor_ranks, _mpi_neighbor_interfaces, _mpi_neighbor_mortars = init_mpi_neighbor_connectivity(mpi_interfaces,
-                                                                                                         mpi_mortars,
-                                                                                                         mesh)
-
-    _mpi_send_buffers, _mpi_recv_buffers, mpi_send_requests, mpi_recv_requests = init_mpi_data_structures(_mpi_neighbor_interfaces,
-                                                                                                          _mpi_neighbor_mortars,
-                                                                                                          ndims(mesh),
-                                                                                                          nvars,
-                                                                                                          n_nodes,
-                                                                                                          uEltype)
+    mpi_neighbor_ranks,
+    mpi_neighbor_interfaces,
+    mpi_neighbor_mortars = init_mpi_neighbor_connectivity(mpi_interfaces,
+                                                          mpi_mortars,
+                                                          mesh)
+    mpi_send_buffers,
+    mpi_recv_buffers,
+    mpi_send_requests,
+    mpi_recv_requests = init_mpi_data_structures(mpi_neighbor_interfaces,
+                                                 mpi_neighbor_mortars,
+                                                 ndims(mesh),
+                                                 nvars,
+                                                 n_nodes,
+                                                 uEltype)
 
     # Determine local and total number of elements
     n_elements_global = Int(mesh.p4est.global_num_quadrants[])
@@ -291,11 +294,6 @@ function init_mpi_cache!(mpi_cache::P4estMPICache, mesh::ParallelP4estMesh,
     first_element_global_id = Int(mesh.p4est.global_first_quadrant[mpi_rank() + 1]) + 1
     @assert n_elements_global==sum(n_elements_by_rank) "error in total number of elements"
 
-    mpi_neighbor_interfaces = VecOfArrays(_mpi_neighbor_interfaces)
-    mpi_neighbor_mortars = VecOfArrays(_mpi_neighbor_mortars)
-    mpi_send_buffers = VecOfArrays(_mpi_send_buffers)
-    mpi_recv_buffers = VecOfArrays(_mpi_recv_buffers)
-
     # TODO reuse existing structures
     @pack! mpi_cache = mpi_neighbor_ranks, mpi_neighbor_interfaces,
                        mpi_neighbor_mortars,
@@ -331,17 +329,19 @@ function init_mpi_neighbor_connectivity(mpi_interfaces, mpi_mortars,
     mortar_ids = collect(1:nmpimortars(mpi_mortars))[p]
 
     # For each neighbor rank, init connectivity data structures
-    mpi_neighbor_interfaces = Vector{Vector{Int}}(undef, length(mpi_neighbor_ranks))
-    mpi_neighbor_mortars = Vector{Vector{Int}}(undef, length(mpi_neighbor_ranks))
+    mpi_neighbor_interfaces = VectorOfArray(Vector{Vector{Int}}(undef,
+                                                                length(mpi_neighbor_ranks)))
+    mpi_neighbor_mortars = VectorOfArray(Vector{Vector{Int}}(undef,
+                                                             length(mpi_neighbor_ranks)))
     for (index, rank) in enumerate(mpi_neighbor_ranks)
-        mpi_neighbor_interfaces[index] = interface_ids[findall(==(rank),
-                                                               neighbor_ranks_interface)]
-        mpi_neighbor_mortars[index] = mortar_ids[findall(x -> (rank in x),
-                                                         neighbor_ranks_mortar)]
+        mpi_neighbor_interfaces.u[index] = interface_ids[findall(==(rank),
+                                                                 neighbor_ranks_interface)]
+        mpi_neighbor_mortars.u[index] = mortar_ids[findall(x -> (rank in x),
+                                                           neighbor_ranks_mortar)]
     end
 
     # Check that all interfaces were counted exactly once
-    @assert mapreduce(length, +, mpi_neighbor_interfaces; init = 0) ==
+    @assert mapreduce(length, +, mpi_neighbor_interfaces.u; init = 0) ==
            nmpiinterfaces(mpi_interfaces)
 
     return mpi_neighbor_ranks, mpi_neighbor_interfaces, mpi_neighbor_mortars
@@ -525,26 +525,26 @@ function exchange_normal_directions!(mpi_mortars, mpi_cache,
     data_size = n_nodes^(n_dims - 1) * n_dims
 
     # Create buffers and requests
-    send_buffers = Vector{Vector{RealT}}(undef, length(mpi_neighbor_mortars))
-    recv_buffers = Vector{Vector{RealT}}(undef, length(mpi_neighbor_mortars))
-    for index in 1:length(mpi_neighbor_mortars)
+    send_buffers = Vector{Vector{RealT}}(undef, length(mpi_neighbor_mortars.u))
+    recv_buffers = Vector{Vector{RealT}}(undef, length(mpi_neighbor_mortars.u))
+    for index in 1:length(mpi_neighbor_mortars.u)
         send_buffers[index] = Vector{RealT}(undef,
-                                            length(mpi_neighbor_mortars[index]) *
+                                            length(mpi_neighbor_mortars.u[index]) *
                                             n_small_elements * data_size)
         send_buffers[index] .= NaN |> RealT
 
         recv_buffers[index] = Vector{RealT}(undef,
-                                            length(mpi_neighbor_mortars[index]) *
+                                            length(mpi_neighbor_mortars.u[index]) *
                                             n_small_elements * data_size)
         recv_buffers[index] .= NaN |> RealT
     end
-    send_requests = Vector{MPI.Request}(undef, length(mpi_neighbor_mortars))
-    recv_requests = Vector{MPI.Request}(undef, length(mpi_neighbor_mortars))
+    send_requests = Vector{MPI.Request}(undef, length(mpi_neighbor_mortars.u))
+    recv_requests = Vector{MPI.Request}(undef, length(mpi_neighbor_mortars.u))
 
     # Fill send buffers
     for rank in 1:length(mpi_neighbor_ranks)
         send_buffer = send_buffers[rank]
-        for (index, mortar) in enumerate(mpi_neighbor_mortars[rank])
+        for (index, mortar) in enumerate(mpi_neighbor_mortars.u[rank])
             index_base = (index - 1) * n_small_elements * data_size
             indices = buffer_mortar_indices(mesh, index_base, data_size)
             for position in mpi_mortars.local_neighbor_positions[mortar]
@@ -571,7 +571,7 @@ function exchange_normal_directions!(mpi_mortars, mpi_cache,
     while data !== nothing
         recv_buffer = recv_buffers[data]
 
-        for (index, mortar) in enumerate(mpi_neighbor_mortars[data])
+        for (index, mortar) in enumerate(mpi_neighbor_mortars.u[data])
             index_base = (index - 1) * n_small_elements * data_size
             indices = buffer_mortar_indices(mesh, index_base, data_size)
             for position in 1:n_small_elements
diff --git a/src/solvers/dgsem_t8code/dg_parallel.jl b/src/solvers/dgsem_t8code/dg_parallel.jl
index 729d6ec0392..e4f6af2591b 100644
--- a/src/solvers/dgsem_t8code/dg_parallel.jl
+++ b/src/solvers/dgsem_t8code/dg_parallel.jl
@@ -116,17 +116,19 @@ function init_mpi_neighbor_connectivity(mpi_mesh_info, mesh::ParallelT8codeMesh)
     mortar_ids = collect(1:nmpimortars(mpi_mortars))[p]
 
     # For each neighbor rank, init connectivity data structures
-    mpi_neighbor_interfaces = Vector{Vector{Int}}(undef, length(mpi_neighbor_ranks))
-    mpi_neighbor_mortars = Vector{Vector{Int}}(undef, length(mpi_neighbor_ranks))
+    mpi_neighbor_interfaces = VectorOfArray(Vector{Vector{Int}}(undef,
+                                                                length(mpi_neighbor_ranks)))
+    mpi_neighbor_mortars = VectorOfArray(Vector{Vector{Int}}(undef,
+                                                             length(mpi_neighbor_ranks)))
     for (index, rank) in enumerate(mpi_neighbor_ranks)
-        mpi_neighbor_interfaces[index] = interface_ids[findall(==(rank),
-                                                               neighbor_ranks_interface)]
-        mpi_neighbor_mortars[index] = mortar_ids[findall(x -> (rank in x),
-                                                         neighbor_ranks_mortar)]
+        mpi_neighbor_interfaces.u[index] = interface_ids[findall(==(rank),
+                                                                 neighbor_ranks_interface)]
+        mpi_neighbor_mortars.u[index] = mortar_ids[findall(x -> (rank in x),
+                                                           neighbor_ranks_mortar)]
     end
 
     # Check that all interfaces were counted exactly once
-    @assert mapreduce(length, +, mpi_neighbor_interfaces; init = 0) ==
+    @assert mapreduce(length, +, mpi_neighbor_interfaces.u; init = 0) ==
            nmpiinterfaces(mpi_interfaces)
 
     return mpi_neighbor_ranks, mpi_neighbor_interfaces, mpi_neighbor_mortars
diff --git a/src/solvers/dgsem_tree/dg_2d_parallel.jl b/src/solvers/dgsem_tree/dg_2d_parallel.jl
index cb522aa3eaa..03274490ee0 100644
--- a/src/solvers/dgsem_tree/dg_2d_parallel.jl
+++ b/src/solvers/dgsem_tree/dg_2d_parallel.jl
@@ -11,10 +11,10 @@
 # TODO: MPI dimension agnostic
 mutable struct MPICache{uEltype <: Real}
     mpi_neighbor_ranks::Vector{Int}
-    mpi_neighbor_interfaces::Vector{Vector{Int}}
-    mpi_neighbor_mortars::Vector{Vector{Int}}
-    mpi_send_buffers::Vector{Vector{uEltype}}
-    mpi_recv_buffers::Vector{Vector{uEltype}}
+    mpi_neighbor_interfaces::VectorOfArray{Int, 2, Vector{Vector{Int}}}
+    mpi_neighbor_mortars::VectorOfArray{Int, 2, Vector{Vector{Int}}}
+    mpi_send_buffers::VectorOfArray{uEltype, 2, Vector{Vector{uEltype}}}
+    mpi_recv_buffers::VectorOfArray{uEltype, 2, Vector{Vector{uEltype}}}
     mpi_send_requests::Vector{MPI.Request}
     mpi_recv_requests::Vector{MPI.Request}
     n_elements_by_rank::OffsetArray{Int, 1, Array{Int, 1}}
@@ -28,10 +28,10 @@ function MPICache(uEltype)
         throw(ArgumentError("MPICache only supports bitstypes, $uEltype is not a bitstype."))
     end
     mpi_neighbor_ranks = Vector{Int}(undef, 0)
-    mpi_neighbor_interfaces = Vector{Vector{Int}}(undef, 0)
-    mpi_neighbor_mortars = Vector{Vector{Int}}(undef, 0)
-    mpi_send_buffers = Vector{Vector{uEltype}}(undef, 0)
-    mpi_recv_buffers = Vector{Vector{uEltype}}(undef, 0)
+    mpi_neighbor_interfaces = VectorOfArray(Vector{Vector{Int}}(undef, 0))
+    mpi_neighbor_mortars = VectorOfArray(Vector{Vector{Int}}(undef, 0))
+    mpi_send_buffers = VectorOfArray(Vector{Vector{uEltype}}(undef, 0))
+    mpi_recv_buffers = VectorOfArray(Vector{Vector{uEltype}}(undef, 0))
     mpi_send_requests = Vector{MPI.Request}(undef, 0)
     mpi_recv_requests = Vector{MPI.Request}(undef, 0)
     n_elements_by_rank = OffsetArray(Vector{Int}(undef, 0), 0:-1)
@@ -49,7 +49,7 @@ end
 
 # TODO: MPI dimension agnostic
 function start_mpi_receive!(mpi_cache::MPICache)
     for (index, rank) in enumerate(mpi_cache.mpi_neighbor_ranks)
-        mpi_cache.mpi_recv_requests[index] = MPI.Irecv!(mpi_cache.mpi_recv_buffers[index],
+        mpi_cache.mpi_recv_requests[index] = MPI.Irecv!(mpi_cache.mpi_recv_buffers.u[index],
                                                         rank, rank, mpi_comm())
     end
 
@@ -61,9 +61,9 @@ function start_mpi_send!(mpi_cache::MPICache, mesh, equations, dg, cache)
     data_size = nvariables(equations) * nnodes(dg)^(ndims(mesh) - 1)
 
     for rank in 1:length(mpi_cache.mpi_neighbor_ranks)
-        send_buffer = mpi_cache.mpi_send_buffers[rank]
+        send_buffer = mpi_cache.mpi_send_buffers.u[rank]
 
-        for (index, interface) in enumerate(mpi_cache.mpi_neighbor_interfaces[rank])
+        for (index, interface) in enumerate(mpi_cache.mpi_neighbor_interfaces.u[rank])
             first = (index - 1) * data_size + 1
             last = (index - 1) * data_size + data_size
@@ -78,12 +78,13 @@ function start_mpi_send!(mpi_cache::MPICache, mesh, equations, dg, cache)
 
         # Each mortar has a total size of 4 * data_size, set everything to NaN first and overwrite the
         # parts where local data exists
-        interfaces_data_size = length(mpi_cache.mpi_neighbor_interfaces[rank]) *
+        interfaces_data_size = length(mpi_cache.mpi_neighbor_interfaces.u[rank]) *
                                data_size
-        mortars_data_size = length(mpi_cache.mpi_neighbor_mortars[rank]) * 4 * data_size
+        mortars_data_size = length(mpi_cache.mpi_neighbor_mortars.u[rank]) * 4 *
+                            data_size
         send_buffer[(interfaces_data_size + 1):(interfaces_data_size + mortars_data_size)] .= NaN
 
-        for (index, mortar) in enumerate(mpi_cache.mpi_neighbor_mortars[rank])
+        for (index, mortar) in enumerate(mpi_cache.mpi_neighbor_mortars.u[rank])
             # First and last indices in the send buffer for mortar data obtained from local element
             # in a given position
             index_base = interfaces_data_size + (index - 1) * 4 * data_size
@@ -145,7 +146,7 @@ function start_mpi_send!(mpi_cache::MPICache, mesh, equations, dg, cache)
 
     # Start sending
     for (index, rank) in enumerate(mpi_cache.mpi_neighbor_ranks)
-        mpi_cache.mpi_send_requests[index] = MPI.Isend(mpi_cache.mpi_send_buffers[index],
+        mpi_cache.mpi_send_requests[index] = MPI.Isend(mpi_cache.mpi_send_buffers.u[index],
                                                        rank, mpi_rank(), mpi_comm())
     end
 
@@ -164,9 +165,9 @@ function finish_mpi_receive!(mpi_cache::MPICache, mesh, equations, dg, cache)
 
     # Start receiving and unpack received data until all communication is finished
     data = MPI.Waitany(mpi_cache.mpi_recv_requests)
     while data !== nothing
-        recv_buffer = mpi_cache.mpi_recv_buffers[data]
+        recv_buffer = mpi_cache.mpi_recv_buffers.u[data]
 
-        for (index, interface) in enumerate(mpi_cache.mpi_neighbor_interfaces[data])
+        for (index, interface) in enumerate(mpi_cache.mpi_neighbor_interfaces.u[data])
             first = (index - 1) * data_size + 1
             last = (index - 1) * data_size + data_size
@@ -177,9 +178,9 @@ function finish_mpi_receive!(mpi_cache::MPICache, mesh, equations, dg, cache)
             end
         end
 
-        interfaces_data_size = length(mpi_cache.mpi_neighbor_interfaces[data]) *
+        interfaces_data_size = length(mpi_cache.mpi_neighbor_interfaces.u[data]) *
                                data_size
-        for (index, mortar) in enumerate(mpi_cache.mpi_neighbor_mortars[data])
+        for (index, mortar) in enumerate(mpi_cache.mpi_neighbor_mortars.u[data])
             # First and last indices in the receive buffer for mortar data obtained from remote element
             # in a given position
             index_base = interfaces_data_size + (index - 1) * 4 * data_size
@@ -431,17 +432,19 @@ function init_mpi_neighbor_connectivity(elements, mpi_interfaces, mpi_mortars,
     mortar_ids = collect(1:nmpimortars(mpi_mortars))[p]
 
     # For each neighbor rank, init connectivity data structures
-    mpi_neighbor_interfaces = Vector{Vector{Int}}(undef, length(mpi_neighbor_ranks))
-    mpi_neighbor_mortars = Vector{Vector{Int}}(undef, length(mpi_neighbor_ranks))
+    mpi_neighbor_interfaces = VectorOfArray(Vector{Vector{Int}}(undef,
+                                                                length(mpi_neighbor_ranks)))
+    mpi_neighbor_mortars = VectorOfArray(Vector{Vector{Int}}(undef,
+                                                             length(mpi_neighbor_ranks)))
     for (index, rank) in enumerate(mpi_neighbor_ranks)
-        mpi_neighbor_interfaces[index] = interface_ids[findall(x -> (x == rank),
-                                                               neighbor_ranks_interface)]
-        mpi_neighbor_mortars[index] = mortar_ids[findall(x -> (rank in x),
-                                                         neighbor_ranks_mortar)]
+        mpi_neighbor_interfaces.u[index] = interface_ids[findall(x -> (x == rank),
+                                                                 neighbor_ranks_interface)]
+        mpi_neighbor_mortars.u[index] = mortar_ids[findall(x -> (rank in x),
+                                                           neighbor_ranks_mortar)]
     end
 
     # Sanity checks that we counted all interfaces exactly once
-    @assert sum(length(v) for v in mpi_neighbor_interfaces) ==
+    @assert sum(length(v) for v in mpi_neighbor_interfaces.u) ==
            nmpiinterfaces(mpi_interfaces)
 
     return mpi_neighbor_ranks, mpi_neighbor_interfaces, mpi_neighbor_mortars
diff --git a/src/solvers/dgsem_tree/dg_parallel.jl b/src/solvers/dgsem_tree/dg_parallel.jl
index 3b33c9c33a5..4dbb4c0c91f 100644
--- a/src/solvers/dgsem_tree/dg_parallel.jl
+++ b/src/solvers/dgsem_tree/dg_parallel.jl
@@ -21,19 +21,21 @@ function init_mpi_data_structures(mpi_neighbor_interfaces, mpi_neighbor_mortars,
                                   nvars, n_nodes, uEltype)
     data_size = nvars * n_nodes^(n_dims - 1)
     n_small_elements = 2^(n_dims - 1)
-    mpi_send_buffers = Vector{Vector{uEltype}}(undef, length(mpi_neighbor_interfaces))
-    mpi_recv_buffers = Vector{Vector{uEltype}}(undef, length(mpi_neighbor_interfaces))
+    mpi_send_buffers = VectorOfArray(Vector{Vector{uEltype}}(undef,
+                                                             length(mpi_neighbor_interfaces)))
+    mpi_recv_buffers = VectorOfArray(Vector{Vector{uEltype}}(undef,
+                                                             length(mpi_neighbor_interfaces)))
     for index in 1:length(mpi_neighbor_interfaces)
-        mpi_send_buffers[index] = Vector{uEltype}(undef,
-                                                  length(mpi_neighbor_interfaces[index]) *
-                                                  data_size +
-                                                  length(mpi_neighbor_mortars[index]) *
-                                                  n_small_elements * 2 * data_size)
-        mpi_recv_buffers[index] = Vector{uEltype}(undef,
-                                                  length(mpi_neighbor_interfaces[index]) *
-                                                  data_size +
-                                                  length(mpi_neighbor_mortars[index]) *
-                                                  n_small_elements * 2 * data_size)
+        mpi_send_buffers.u[index] = Vector{uEltype}(undef,
+                                                    length(mpi_neighbor_interfaces.u[index]) *
+                                                    data_size +
+                                                    length(mpi_neighbor_mortars.u[index]) *
+                                                    n_small_elements * 2 * data_size)
+        mpi_recv_buffers.u[index] = Vector{uEltype}(undef,
+                                                    length(mpi_neighbor_interfaces.u[index]) *
+                                                    data_size +
+                                                    length(mpi_neighbor_mortars.u[index]) *
+                                                    n_small_elements * 2 * data_size)
     end
 
     mpi_send_requests = Vector{MPI.Request}(undef, length(mpi_neighbor_interfaces))
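
Migration note (not part of the diff): every hunk above follows the same pattern, so a minimal, self-contained sketch of the new access convention may help downstream code. It is an illustration only and assumes RecursiveArrayTools.jl is available; the name `u_threaded` and the buffer sizes are placeholders, not Trixi.jl API. It shows why call sites changed from `x_threaded[i]` (the old `VecOfArrays` forwarded `getindex`) to `x_threaded.u[i]`, and how `Base.parent` recovers the plain Vector of arrays, as stated in the NEWS.md entry.

```julia
# Minimal sketch (not from the Trixi.jl sources) of the VectorOfArray access
# pattern adopted in this diff. Assumes RecursiveArrayTools.jl; array sizes and
# variable names are placeholders.
using RecursiveArrayTools

# Per-thread work buffers, analogous to `u_threaded` etc. in the diff.
u_threaded = VectorOfArray([zeros(4, 3, 3) for _ in 1:Threads.nthreads()])

# The wrapped Vector of arrays is stored in the `.u` field, hence the new
# `u_threaded.u[Threads.threadid()]` call sites instead of the old
# `u_threaded[Threads.threadid()]` that `VecOfArrays` supported via `getindex`.
u_buffer = u_threaded.u[Threads.threadid()]
u_buffer .= 1.0  # mutates the wrapped array in place; no copy is made

# `Base.parent` unwraps the original Vector of arrays (see the NEWS.md entry).
buffers = parent(u_threaded)
@assert length(buffers) == Threads.nthreads()
```

The same `.u` access applies to the MPI buffers (`mpi_send_buffers.u[rank]` and friends): wrapping in `VectorOfArray` keeps the per-rank vectors intact while replacing the hand-rolled `VecOfArrays` wrapper with a maintained type, which presumably also covers the GPU/`Adapt` use case the deleted file existed for.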