diff --git a/NEWS.md b/NEWS.md
index b87a369b042..cb012f8f6ea 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -115,7 +115,7 @@ for human readability.
 
 #### Changed
 
-- The numerical solution is wrapped in a `VectorOfArrays` from
+- The numerical solution is wrapped in a `VectorOfArray` from
   [RecursiveArrayTools.jl](https://github.com/SciML/RecursiveArrayTools.jl)
   for `DGMulti` solvers ([#2150]). You can use `Base.parent` to unwrap
   the original data.
diff --git a/Project.toml b/Project.toml
index 27f5bbca28e..f62f0d0c28c 100644
--- a/Project.toml
+++ b/Project.toml
@@ -99,7 +99,7 @@ PrecompileTools = "1.2"
 Preferences = "1.4"
 Printf = "1"
 RecipesBase = "1.3.4"
-RecursiveArrayTools = "3.31.1"
+RecursiveArrayTools = "3.34.1"
 Reexport = "1.2"
 Requires = "1.3"
 SciMLBase = "2.67.0"
diff --git a/src/Trixi.jl b/src/Trixi.jl
index 28d32535be7..0671d5aa8d9 100644
--- a/src/Trixi.jl
+++ b/src/Trixi.jl
@@ -134,7 +134,6 @@ include("basic_types.jl")
 
 # Include all top-level source files
 include("auxiliary/auxiliary.jl")
-include("auxiliary/vector_of_arrays.jl")
 include("auxiliary/mpi.jl")
 include("auxiliary/p4est.jl")
 include("auxiliary/t8code.jl")
diff --git a/src/auxiliary/vector_of_arrays.jl b/src/auxiliary/vector_of_arrays.jl
deleted file mode 100644
index 0fa8dd7f1ec..00000000000
--- a/src/auxiliary/vector_of_arrays.jl
+++ /dev/null
@@ -1,31 +0,0 @@
-# By default, Julia/LLVM does not use fused multiply-add operations (FMAs).
-# Since these FMAs can increase the performance of many numerical algorithms,
-# we need to opt-in explicitly.
-# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details.
-@muladd begin
-#! format: noindent
-
-# Wraps a Vector of Arrays, forwards `getindex` to the underlying Vector.
-# Implements `Adapt.adapt_structure` to allow offloading to the GPU which is
-# not possible for a plain Vector of Arrays.
-struct VecOfArrays{T <: AbstractArray}
-    arrays::Vector{T}
-end
-Base.getindex(v::VecOfArrays, i::Int) = Base.getindex(v.arrays, i)
-Base.IndexStyle(v::VecOfArrays) = Base.IndexStyle(v.arrays)
-Base.size(v::VecOfArrays) = Base.size(v.arrays)
-Base.length(v::VecOfArrays) = Base.length(v.arrays)
-Base.eltype(v::VecOfArrays{T}) where {T} = T
-function Adapt.adapt_structure(to, v::VecOfArrays)
-    return VecOfArrays([Adapt.adapt(to, arr) for arr in v.arrays])
-end
-function Adapt.parent_type(::Type{<:VecOfArrays{T}}) where {T}
-    return T
-end
-function Adapt.unwrap_type(A::Type{<:VecOfArrays})
-    Adapt.unwrap_type(Adapt.parent_type(A))
-end
-function Base.convert(::Type{<:VecOfArrays}, v::Vector{<:AbstractArray})
-    VecOfArrays(v)
-end
-end # @muladd
diff --git a/src/solvers/dgsem_p4est/containers_parallel.jl b/src/solvers/dgsem_p4est/containers_parallel.jl
index 4b6fc813703..bfcd64d9485 100644
--- a/src/solvers/dgsem_p4est/containers_parallel.jl
+++ b/src/solvers/dgsem_p4est/containers_parallel.jl
@@ -225,7 +225,7 @@ function Adapt.adapt_structure(to, mpi_mortars::P4estMPIMortarContainer)
     # TODO: GPU
     # Only parts of this container are adapted, since we currently don't
     # use `local_neighbor_ids`, `local_neighbor_positions`, `normal_directions`
-    # on the GPU. If we do need them we need to redesign this to use the VecOfArrays
+    # on the GPU. If we do need them we need to redesign this to use the VectorOfArray
     # approach.
     _u = adapt(to, mpi_mortars._u)
diff --git a/src/solvers/dgsem_p4est/dg_3d.jl b/src/solvers/dgsem_p4est/dg_3d.jl
index 4c099c9fd3f..9c59549bcb2 100644
--- a/src/solvers/dgsem_p4est/dg_3d.jl
+++ b/src/solvers/dgsem_p4est/dg_3d.jl
@@ -10,21 +10,24 @@ function create_cache(mesh::Union{P4estMesh{3}, T8codeMesh{3}}, equations,
                       mortar_l2::LobattoLegendreMortarL2, uEltype)
     # TODO: Taal compare performance of different types
-    fstar_primary_threaded = [Array{uEltype, 4}(undef, nvariables(equations),
-                                                nnodes(mortar_l2),
-                                                nnodes(mortar_l2), 4)
-                              for _ in 1:Threads.nthreads()] |> VecOfArrays
-    fstar_secondary_threaded = [Array{uEltype, 4}(undef, nvariables(equations),
+    fstar_primary_threaded = VectorOfArray([Array{uEltype, 4}(undef,
+                                                               nvariables(equations),
+                                                               nnodes(mortar_l2),
+                                                               nnodes(mortar_l2), 4)
+                                            for _ in 1:Threads.nthreads()])
+    fstar_secondary_threaded = VectorOfArray([Array{uEltype, 4}(undef,
+                                                                 nvariables(equations),
+                                                                 nnodes(mortar_l2),
+                                                                 nnodes(mortar_l2), 4)
+                                              for _ in 1:Threads.nthreads()])
+    fstar_tmp_threaded = VectorOfArray([Array{uEltype, 3}(undef, nvariables(equations),
+                                                           nnodes(mortar_l2),
+                                                           nnodes(mortar_l2))
+                                        for _ in 1:Threads.nthreads()])
+    u_threaded = VectorOfArray([Array{uEltype, 3}(undef, nvariables(equations),
                                                   nnodes(mortar_l2),
-                                                  nnodes(mortar_l2), 4)
-                                for _ in 1:Threads.nthreads()] |> VecOfArrays
-
-    fstar_tmp_threaded = [Array{uEltype, 3}(undef, nvariables(equations),
-                          nnodes(mortar_l2), nnodes(mortar_l2))
-                          for _ in 1:Threads.nthreads()] |> VecOfArrays
-    u_threaded = [Array{uEltype, 3}(undef, nvariables(equations), nnodes(mortar_l2),
-                  nnodes(mortar_l2))
-                  for _ in 1:Threads.nthreads()] |> VecOfArrays
+                                                   nnodes(mortar_l2))
+                                for _ in 1:Threads.nthreads()])
     (; fstar_primary_threaded, fstar_secondary_threaded, fstar_tmp_threaded,
        u_threaded)
 end
 
@@ -519,9 +522,9 @@ function prolong2mortars!(cache, u,
 
         # Buffer to copy solution values of the large element in the correct orientation
        # before interpolating
-        u_buffer = cache.u_threaded[Threads.threadid()]
+        u_buffer = cache.u_threaded.u[Threads.threadid()]
         # temporary buffer for projections
-        fstar_tmp = fstar_tmp_threaded[Threads.threadid()]
+        fstar_tmp = fstar_tmp_threaded.u[Threads.threadid()]
 
         # Copy solution of large element face to buffer in the
         # correct orientation
@@ -590,9 +593,9 @@ function calc_mortar_flux!(surface_flux_values,
 
     @threaded for mortar in eachmortar(dg, cache)
         # Choose thread-specific pre-allocated container
-        fstar_primary = fstar_primary_threaded[Threads.threadid()]
-        fstar_secondary = fstar_secondary_threaded[Threads.threadid()]
-        fstar_tmp = fstar_tmp_threaded[Threads.threadid()]
+        fstar_primary = fstar_primary_threaded.u[Threads.threadid()]
+        fstar_secondary = fstar_secondary_threaded.u[Threads.threadid()]
+        fstar_tmp = fstar_tmp_threaded.u[Threads.threadid()]
 
         # Get index information on the small elements
         small_indices = node_indices[1, mortar]
@@ -638,7 +641,7 @@ function calc_mortar_flux!(surface_flux_values,
 
         # Buffer to interpolate flux values of the large element to before
         # copying in the correct orientation
-        u_buffer = cache.u_threaded[Threads.threadid()]
+        u_buffer = cache.u_threaded.u[Threads.threadid()]
 
         # in calc_interface_flux!, the interface flux is computed once over each
         # interface using the normal from the "primary" element. The result is then
diff --git a/src/solvers/dgsem_p4est/dg_3d_parabolic.jl b/src/solvers/dgsem_p4est/dg_3d_parabolic.jl
index 1fe22e42fe7..06917ba17e3 100644
--- a/src/solvers/dgsem_p4est/dg_3d_parabolic.jl
+++ b/src/solvers/dgsem_p4est/dg_3d_parabolic.jl
@@ -697,10 +697,10 @@ function prolong2mortars_divergence!(cache, flux_viscous,
 
         # Buffer to copy solution values of the large element in the correct orientation
         # before interpolating
-        u_buffer = cache.u_threaded[Threads.threadid()]
+        u_buffer = cache.u_threaded.u[Threads.threadid()]
 
         # temporary buffer for projections
-        fstar_tmp = fstar_tmp_threaded[Threads.threadid()]
+        fstar_tmp = fstar_tmp_threaded.u[Threads.threadid()]
 
         # Copy solution of large element face to buffer in the
         # correct orientation
@@ -787,8 +787,8 @@ function calc_mortar_flux_divergence!(surface_flux_values,
 
     @threaded for mortar in eachmortar(dg, cache)
         # Choose thread-specific pre-allocated container
-        fstar = fstar_primary_threaded[Threads.threadid()]
-        fstar_tmp = fstar_tmp_threaded[Threads.threadid()]
+        fstar = fstar_primary_threaded.u[Threads.threadid()]
+        fstar_tmp = fstar_tmp_threaded.u[Threads.threadid()]
 
         # Get index information on the small elements
         small_indices = node_indices[1, mortar]
@@ -831,7 +831,7 @@ function calc_mortar_flux_divergence!(surface_flux_values,
 
         # Buffer to interpolate flux values of the large element to before
         # copying in the correct orientation
-        u_buffer = cache.u_threaded[Threads.threadid()]
+        u_buffer = cache.u_threaded.u[Threads.threadid()]
 
         # this reuses the hyperbolic version of `mortar_fluxes_to_elements!`
         mortar_fluxes_to_elements!(surface_flux_values,
diff --git a/src/solvers/dgsem_p4est/dg_3d_parallel.jl b/src/solvers/dgsem_p4est/dg_3d_parallel.jl
index aad54f40afb..07a0a0ba1f6 100644
--- a/src/solvers/dgsem_p4est/dg_3d_parallel.jl
+++ b/src/solvers/dgsem_p4est/dg_3d_parallel.jl
@@ -335,9 +335,9 @@ function prolong2mpimortars!(cache, u,
         if position == 5 # -> large element
             # Buffer to copy solution values of the large element in the correct orientation
             # before interpolating
-            u_buffer = cache.u_threaded[Threads.threadid()]
+            u_buffer = cache.u_threaded.u[Threads.threadid()]
             # temporary buffer for projections
-            fstar_tmp = cache.fstar_tmp_threaded[Threads.threadid()]
+            fstar_tmp = cache.fstar_tmp_threaded.u[Threads.threadid()]
 
             i_large = i_large_start
             j_large = j_large_start
@@ -423,9 +423,9 @@ function calc_mpi_mortar_flux!(surface_flux_values,
 
     @threaded for mortar in eachmpimortar(dg, cache)
         # Choose thread-specific pre-allocated container
-        fstar_primary = fstar_primary_threaded[Threads.threadid()]
-        fstar_secondary = fstar_secondary_threaded[Threads.threadid()]
-        fstar_tmp = fstar_tmp_threaded[Threads.threadid()]
+        fstar_primary = fstar_primary_threaded.u[Threads.threadid()]
+        fstar_secondary = fstar_secondary_threaded.u[Threads.threadid()]
+        fstar_tmp = fstar_tmp_threaded.u[Threads.threadid()]
 
         # Get index information on the small elements
         small_indices = node_indices[1, mortar]
@@ -465,7 +465,7 @@ function calc_mpi_mortar_flux!(surface_flux_values,
 
         # Buffer to interpolate flux values of the large element to before
         # copying in the correct orientation
-        u_buffer = cache.u_threaded[Threads.threadid()]
+        u_buffer = cache.u_threaded.u[Threads.threadid()]
 
         mpi_mortar_fluxes_to_elements!(surface_flux_values,
                                        mesh, equations, mortar_l2, dg, cache,
diff --git a/src/solvers/dgsem_p4est/dg_parallel.jl b/src/solvers/dgsem_p4est/dg_parallel.jl
index 7acddf07b4b..a3a4005ad96 100644
--- a/src/solvers/dgsem_p4est/dg_parallel.jl
+++ b/src/solvers/dgsem_p4est/dg_parallel.jl
@@ -5,13 +5,12 @@
 @muladd begin
 #! format: noindent
 
-mutable struct P4estMPICache{BufferType <: DenseVector,
-                             VecInt <: DenseVector{<:Integer}}
+mutable struct P4estMPICache{uEltype}
     mpi_neighbor_ranks::Vector{Int}
-    mpi_neighbor_interfaces::VecOfArrays{VecInt}
-    mpi_neighbor_mortars::VecOfArrays{VecInt}
-    mpi_send_buffers::VecOfArrays{BufferType}
-    mpi_recv_buffers::VecOfArrays{BufferType}
+    mpi_neighbor_interfaces::VectorOfArray{Int, 2, Vector{Vector{Int}}}
+    mpi_neighbor_mortars::VectorOfArray{Int, 2, Vector{Vector{Int}}}
+    mpi_send_buffers::VectorOfArray{uEltype, 2, Vector{Vector{uEltype}}}
+    mpi_recv_buffers::VectorOfArray{uEltype, 2, Vector{Vector{uEltype}}}
     mpi_send_requests::Vector{MPI.Request}
     mpi_recv_requests::Vector{MPI.Request}
     n_elements_by_rank::OffsetArray{Int, 1, Array{Int, 1}}
@@ -26,23 +25,23 @@ function P4estMPICache(uEltype)
     end
 
     mpi_neighbor_ranks = Vector{Int}(undef, 0)
-    mpi_neighbor_interfaces = Vector{Vector{Int}}(undef, 0) |> VecOfArrays
-    mpi_neighbor_mortars = Vector{Vector{Int}}(undef, 0) |> VecOfArrays
-    mpi_send_buffers = Vector{Vector{uEltype}}(undef, 0) |> VecOfArrays
-    mpi_recv_buffers = Vector{Vector{uEltype}}(undef, 0) |> VecOfArrays
+    mpi_neighbor_interfaces = VectorOfArray(Vector{Vector{Int}}(undef, 0))
+    mpi_neighbor_mortars = VectorOfArray(Vector{Vector{Int}}(undef, 0))
+    mpi_send_buffers = VectorOfArray(Vector{Vector{uEltype}}(undef, 0))
+    mpi_recv_buffers = VectorOfArray(Vector{Vector{uEltype}}(undef, 0))
     mpi_send_requests = Vector{MPI.Request}(undef, 0)
     mpi_recv_requests = Vector{MPI.Request}(undef, 0)
     n_elements_by_rank = OffsetArray(Vector{Int}(undef, 0), 0:-1)
     n_elements_global = 0
     first_element_global_id = 0
 
-    P4estMPICache{Vector{uEltype}, Vector{Int}}(mpi_neighbor_ranks,
-                                                mpi_neighbor_interfaces,
-                                                mpi_neighbor_mortars,
-                                                mpi_send_buffers, mpi_recv_buffers,
-                                                mpi_send_requests, mpi_recv_requests,
-                                                n_elements_by_rank, n_elements_global,
-                                                first_element_global_id)
+    P4estMPICache{uEltype}(mpi_neighbor_ranks,
+                           mpi_neighbor_interfaces,
+                           mpi_neighbor_mortars,
+                           mpi_send_buffers, mpi_recv_buffers,
+                           mpi_send_requests, mpi_recv_requests,
+                           n_elements_by_rank, n_elements_global,
+                           first_element_global_id)
 end
 
 @inline Base.eltype(::P4estMPICache{BufferType}) where {BufferType} = eltype(BufferType)
@@ -65,9 +64,9 @@ function start_mpi_send!(mpi_cache::P4estMPICache, mesh, equations, dg, cache)
     n_small_elements = 2^(ndims(mesh) - 1)
 
     for rank in 1:length(mpi_cache.mpi_neighbor_ranks)
-        send_buffer = mpi_cache.mpi_send_buffers[rank]
+        send_buffer = mpi_cache.mpi_send_buffers.u[rank]
 
-        for (index, interface) in enumerate(mpi_cache.mpi_neighbor_interfaces[rank])
+        for (index, interface) in enumerate(mpi_cache.mpi_neighbor_interfaces.u[rank])
             first = (index - 1) * data_size + 1
             last = (index - 1) * data_size + data_size
             local_side = cache.mpi_interfaces.local_sides[interface]
@@ -77,15 +76,15 @@ function start_mpi_send!(mpi_cache::P4estMPICache, mesh, equations, dg, cache)
 
         # Set send_buffer corresponding to mortar data to NaN and overwrite the parts where local
         # data exists
-        interfaces_data_size = length(mpi_cache.mpi_neighbor_interfaces[rank]) *
+        interfaces_data_size = length(mpi_cache.mpi_neighbor_interfaces.u[rank]) *
                                data_size
-        mortars_data_size = length(mpi_cache.mpi_neighbor_mortars[rank]) *
+        mortars_data_size = length(mpi_cache.mpi_neighbor_mortars.u[rank]) *
                             n_small_elements * 2 * data_size
 
         # `NaN |> eltype(...)` ensures that the NaN's are of the appropriate floating point type
         send_buffer[(interfaces_data_size + 1):(interfaces_data_size + mortars_data_size)] .= NaN |> eltype(mpi_cache)
 
-        for (index, mortar) in enumerate(mpi_cache.mpi_neighbor_mortars[rank])
+        for (index, mortar) in enumerate(mpi_cache.mpi_neighbor_mortars.u[rank])
             index_base = interfaces_data_size +
                          (index - 1) * n_small_elements * 2 * data_size
             indices = buffer_mortar_indices(mesh, index_base, data_size)
@@ -108,7 +107,7 @@ function start_mpi_send!(mpi_cache::P4estMPICache, mesh, equations, dg, cache)
 
     # Start sending
     for (index, rank) in enumerate(mpi_cache.mpi_neighbor_ranks)
-        mpi_cache.mpi_send_requests[index] = MPI.Isend(mpi_cache.mpi_send_buffers[index],
+        mpi_cache.mpi_send_requests[index] = MPI.Isend(mpi_cache.mpi_send_buffers.u[index],
                                                        rank, mpi_rank(), mpi_comm())
     end
 
@@ -117,7 +116,7 @@ end
 
 function start_mpi_receive!(mpi_cache::P4estMPICache)
     for (index, rank) in enumerate(mpi_cache.mpi_neighbor_ranks)
-        mpi_cache.mpi_recv_requests[index] = MPI.Irecv!(mpi_cache.mpi_recv_buffers[index],
+        mpi_cache.mpi_recv_requests[index] = MPI.Irecv!(mpi_cache.mpi_recv_buffers.u[index],
                                                         rank, rank, mpi_comm())
     end
 
@@ -136,9 +135,9 @@ function finish_mpi_receive!(mpi_cache::P4estMPICache, mesh, equations, dg, cach
 
     # Start receiving and unpack received data until all communication is finished
     data = MPI.Waitany(mpi_cache.mpi_recv_requests)
     while data !== nothing
-        recv_buffer = mpi_cache.mpi_recv_buffers[data]
+        recv_buffer = mpi_cache.mpi_recv_buffers.u[data]
 
-        for (index, interface) in enumerate(mpi_cache.mpi_neighbor_interfaces[data])
+        for (index, interface) in enumerate(mpi_cache.mpi_neighbor_interfaces.u[data])
             first = (index - 1) * data_size + 1
             last = (index - 1) * data_size + data_size
@@ -149,9 +148,9 @@ function finish_mpi_receive!(mpi_cache::P4estMPICache, mesh, equations, dg, cach
             end
         end
 
-        interfaces_data_size = length(mpi_cache.mpi_neighbor_interfaces[data]) *
+        interfaces_data_size = length(mpi_cache.mpi_neighbor_interfaces.u[data]) *
                                data_size
-        for (index, mortar) in enumerate(mpi_cache.mpi_neighbor_mortars[data])
+        for (index, mortar) in enumerate(mpi_cache.mpi_neighbor_mortars.u[data])
             index_base = interfaces_data_size +
                          (index - 1) * n_small_elements * 2 * data_size
             indices = buffer_mortar_indices(mesh, index_base, data_size)
@@ -270,16 +269,20 @@ end
 
 function init_mpi_cache!(mpi_cache::P4estMPICache, mesh::ParallelP4estMesh,
                          mpi_interfaces, mpi_mortars, nvars, n_nodes, uEltype)
-    mpi_neighbor_ranks, _mpi_neighbor_interfaces, _mpi_neighbor_mortars = init_mpi_neighbor_connectivity(mpi_interfaces,
-                                                                                                         mpi_mortars,
-                                                                                                         mesh)
-
-    _mpi_send_buffers, _mpi_recv_buffers, mpi_send_requests, mpi_recv_requests = init_mpi_data_structures(_mpi_neighbor_interfaces,
-                                                                                                          _mpi_neighbor_mortars,
-                                                                                                          ndims(mesh),
-                                                                                                          nvars,
-                                                                                                          n_nodes,
-                                                                                                          uEltype)
+    mpi_neighbor_ranks,
+    mpi_neighbor_interfaces,
+    mpi_neighbor_mortars = init_mpi_neighbor_connectivity(mpi_interfaces,
+                                                          mpi_mortars,
+                                                          mesh)
+    mpi_send_buffers,
+    mpi_recv_buffers,
+    mpi_send_requests,
+    mpi_recv_requests = init_mpi_data_structures(mpi_neighbor_interfaces,
+                                                 mpi_neighbor_mortars,
+                                                 ndims(mesh),
+                                                 nvars,
+                                                 n_nodes,
+                                                 uEltype)
 
     # Determine local and total number of elements
     n_elements_global = Int(mesh.p4est.global_num_quadrants[])
@@ -291,11 +294,6 @@ function init_mpi_cache!(mpi_cache::P4estMPICache, mesh::ParallelP4estMesh,
     first_element_global_id = Int(mesh.p4est.global_first_quadrant[mpi_rank() + 1]) + 1
     @assert n_elements_global==sum(n_elements_by_rank) "error in total number of elements"
 
-    mpi_neighbor_interfaces = VecOfArrays(_mpi_neighbor_interfaces)
-    mpi_neighbor_mortars = VecOfArrays(_mpi_neighbor_mortars)
-    mpi_send_buffers = VecOfArrays(_mpi_send_buffers)
-    mpi_recv_buffers = VecOfArrays(_mpi_recv_buffers)
-
     # TODO reuse existing structures
     @pack! mpi_cache = mpi_neighbor_ranks, mpi_neighbor_interfaces,
                        mpi_neighbor_mortars,
@@ -331,17 +329,19 @@ function init_mpi_neighbor_connectivity(mpi_interfaces, mpi_mortars,
     mortar_ids = collect(1:nmpimortars(mpi_mortars))[p]
 
     # For each neighbor rank, init connectivity data structures
-    mpi_neighbor_interfaces = Vector{Vector{Int}}(undef, length(mpi_neighbor_ranks))
-    mpi_neighbor_mortars = Vector{Vector{Int}}(undef, length(mpi_neighbor_ranks))
+    mpi_neighbor_interfaces = VectorOfArray(Vector{Vector{Int}}(undef,
+                                                                length(mpi_neighbor_ranks)))
+    mpi_neighbor_mortars = VectorOfArray(Vector{Vector{Int}}(undef,
+                                                             length(mpi_neighbor_ranks)))
     for (index, rank) in enumerate(mpi_neighbor_ranks)
-        mpi_neighbor_interfaces[index] = interface_ids[findall(==(rank),
-                                                               neighbor_ranks_interface)]
-        mpi_neighbor_mortars[index] = mortar_ids[findall(x -> (rank in x),
-                                                         neighbor_ranks_mortar)]
+        mpi_neighbor_interfaces.u[index] = interface_ids[findall(==(rank),
+                                                                 neighbor_ranks_interface)]
+        mpi_neighbor_mortars.u[index] = mortar_ids[findall(x -> (rank in x),
+                                                           neighbor_ranks_mortar)]
     end
 
     # Check that all interfaces were counted exactly once
-    @assert mapreduce(length, +, mpi_neighbor_interfaces; init = 0) ==
+    @assert mapreduce(length, +, mpi_neighbor_interfaces.u; init = 0) ==
            nmpiinterfaces(mpi_interfaces)
 
     return mpi_neighbor_ranks, mpi_neighbor_interfaces, mpi_neighbor_mortars
@@ -525,26 +525,26 @@ function exchange_normal_directions!(mpi_mortars, mpi_cache,
     data_size = n_nodes^(n_dims - 1) * n_dims
 
     # Create buffers and requests
-    send_buffers = Vector{Vector{RealT}}(undef, length(mpi_neighbor_mortars))
-    recv_buffers = Vector{Vector{RealT}}(undef, length(mpi_neighbor_mortars))
-    for index in 1:length(mpi_neighbor_mortars)
+    send_buffers = Vector{Vector{RealT}}(undef, length(mpi_neighbor_mortars.u))
+    recv_buffers = Vector{Vector{RealT}}(undef, length(mpi_neighbor_mortars.u))
+    for index in 1:length(mpi_neighbor_mortars.u)
         send_buffers[index] = Vector{RealT}(undef,
-                                            length(mpi_neighbor_mortars[index]) *
+                                            length(mpi_neighbor_mortars.u[index]) *
                                             n_small_elements * data_size)
         send_buffers[index] .= NaN |> RealT
 
         recv_buffers[index] = Vector{RealT}(undef,
-                                            length(mpi_neighbor_mortars[index]) *
+                                            length(mpi_neighbor_mortars.u[index]) *
                                             n_small_elements * data_size)
         recv_buffers[index] .= NaN |> RealT
     end
-    send_requests = Vector{MPI.Request}(undef, length(mpi_neighbor_mortars))
-    recv_requests = Vector{MPI.Request}(undef, length(mpi_neighbor_mortars))
+    send_requests = Vector{MPI.Request}(undef, length(mpi_neighbor_mortars.u))
+    recv_requests = Vector{MPI.Request}(undef, length(mpi_neighbor_mortars.u))
 
     # Fill send buffers
     for rank in 1:length(mpi_neighbor_ranks)
         send_buffer = send_buffers[rank]
-        for (index, mortar) in enumerate(mpi_neighbor_mortars[rank])
+        for (index, mortar) in enumerate(mpi_neighbor_mortars.u[rank])
             index_base = (index - 1) * n_small_elements * data_size
             indices = buffer_mortar_indices(mesh, index_base, data_size)
             for position in mpi_mortars.local_neighbor_positions[mortar]
@@ -571,7 +571,7 @@ function exchange_normal_directions!(mpi_mortars, mpi_cache,
     while data !== nothing
         recv_buffer = recv_buffers[data]
 
-        for (index, mortar) in enumerate(mpi_neighbor_mortars[data])
+        for (index, mortar) in enumerate(mpi_neighbor_mortars.u[data])
             index_base = (index - 1) * n_small_elements * data_size
             indices = buffer_mortar_indices(mesh, index_base, data_size)
             for position in 1:n_small_elements
diff --git a/src/solvers/dgsem_t8code/dg_parallel.jl b/src/solvers/dgsem_t8code/dg_parallel.jl
index 729d6ec0392..e4f6af2591b 100644
--- a/src/solvers/dgsem_t8code/dg_parallel.jl
+++ b/src/solvers/dgsem_t8code/dg_parallel.jl
@@ -116,17 +116,19 @@ function init_mpi_neighbor_connectivity(mpi_mesh_info, mesh::ParallelT8codeMesh)
     mortar_ids = collect(1:nmpimortars(mpi_mortars))[p]
 
     # For each neighbor rank, init connectivity data structures
-    mpi_neighbor_interfaces = Vector{Vector{Int}}(undef, length(mpi_neighbor_ranks))
-    mpi_neighbor_mortars = Vector{Vector{Int}}(undef, length(mpi_neighbor_ranks))
+    mpi_neighbor_interfaces = VectorOfArray(Vector{Vector{Int}}(undef,
+                                                                length(mpi_neighbor_ranks)))
+    mpi_neighbor_mortars = VectorOfArray(Vector{Vector{Int}}(undef,
+                                                             length(mpi_neighbor_ranks)))
     for (index, rank) in enumerate(mpi_neighbor_ranks)
-        mpi_neighbor_interfaces[index] = interface_ids[findall(==(rank),
-                                                               neighbor_ranks_interface)]
-        mpi_neighbor_mortars[index] = mortar_ids[findall(x -> (rank in x),
-                                                         neighbor_ranks_mortar)]
+        mpi_neighbor_interfaces.u[index] = interface_ids[findall(==(rank),
+                                                                 neighbor_ranks_interface)]
+        mpi_neighbor_mortars.u[index] = mortar_ids[findall(x -> (rank in x),
+                                                           neighbor_ranks_mortar)]
     end
 
     # Check that all interfaces were counted exactly once
-    @assert mapreduce(length, +, mpi_neighbor_interfaces; init = 0) ==
+    @assert mapreduce(length, +, mpi_neighbor_interfaces.u; init = 0) ==
            nmpiinterfaces(mpi_interfaces)
 
     return mpi_neighbor_ranks, mpi_neighbor_interfaces, mpi_neighbor_mortars
diff --git a/src/solvers/dgsem_tree/dg_2d_parallel.jl b/src/solvers/dgsem_tree/dg_2d_parallel.jl
index cb522aa3eaa..03274490ee0 100644
--- a/src/solvers/dgsem_tree/dg_2d_parallel.jl
+++ b/src/solvers/dgsem_tree/dg_2d_parallel.jl
@@ -11,10 +11,10 @@
 # TODO: MPI dimension agnostic
 mutable struct MPICache{uEltype <: Real}
     mpi_neighbor_ranks::Vector{Int}
-    mpi_neighbor_interfaces::Vector{Vector{Int}}
-    mpi_neighbor_mortars::Vector{Vector{Int}}
-    mpi_send_buffers::Vector{Vector{uEltype}}
-    mpi_recv_buffers::Vector{Vector{uEltype}}
+    mpi_neighbor_interfaces::VectorOfArray{Int, 2, Vector{Vector{Int}}}
+    mpi_neighbor_mortars::VectorOfArray{Int, 2, Vector{Vector{Int}}}
+    mpi_send_buffers::VectorOfArray{uEltype, 2, Vector{Vector{uEltype}}}
+    mpi_recv_buffers::VectorOfArray{uEltype, 2, Vector{Vector{uEltype}}}
     mpi_send_requests::Vector{MPI.Request}
     mpi_recv_requests::Vector{MPI.Request}
     n_elements_by_rank::OffsetArray{Int, 1, Array{Int, 1}}
@@ -28,10 +28,10 @@ function MPICache(uEltype)
         throw(ArgumentError("MPICache only supports bitstypes, $uEltype is not a bitstype."))
     end
     mpi_neighbor_ranks = Vector{Int}(undef, 0)
-    mpi_neighbor_interfaces = Vector{Vector{Int}}(undef, 0)
-    mpi_neighbor_mortars = Vector{Vector{Int}}(undef, 0)
-    mpi_send_buffers = Vector{Vector{uEltype}}(undef, 0)
-    mpi_recv_buffers = Vector{Vector{uEltype}}(undef, 0)
+    mpi_neighbor_interfaces = VectorOfArray(Vector{Vector{Int}}(undef, 0))
+    mpi_neighbor_mortars = VectorOfArray(Vector{Vector{Int}}(undef, 0))
+    mpi_send_buffers = VectorOfArray(Vector{Vector{uEltype}}(undef, 0))
+    mpi_recv_buffers = VectorOfArray(Vector{Vector{uEltype}}(undef, 0))
     mpi_send_requests = Vector{MPI.Request}(undef, 0)
     mpi_recv_requests = Vector{MPI.Request}(undef, 0)
     n_elements_by_rank = OffsetArray(Vector{Int}(undef, 0), 0:-1)
@@ -49,7 +49,7 @@ end
 
 # TODO: MPI dimension agnostic
 function start_mpi_receive!(mpi_cache::MPICache)
     for (index, rank) in enumerate(mpi_cache.mpi_neighbor_ranks)
-        mpi_cache.mpi_recv_requests[index] = MPI.Irecv!(mpi_cache.mpi_recv_buffers[index],
+        mpi_cache.mpi_recv_requests[index] = MPI.Irecv!(mpi_cache.mpi_recv_buffers.u[index],
                                                         rank, rank, mpi_comm())
     end
 
@@ -61,9 +61,9 @@ function start_mpi_send!(mpi_cache::MPICache, mesh, equations, dg, cache)
     data_size = nvariables(equations) * nnodes(dg)^(ndims(mesh) - 1)
 
     for rank in 1:length(mpi_cache.mpi_neighbor_ranks)
-        send_buffer = mpi_cache.mpi_send_buffers[rank]
+        send_buffer = mpi_cache.mpi_send_buffers.u[rank]
 
-        for (index, interface) in enumerate(mpi_cache.mpi_neighbor_interfaces[rank])
+        for (index, interface) in enumerate(mpi_cache.mpi_neighbor_interfaces.u[rank])
             first = (index - 1) * data_size + 1
             last = (index - 1) * data_size + data_size
@@ -78,12 +78,13 @@ function start_mpi_send!(mpi_cache::MPICache, mesh, equations, dg, cache)
 
         # Each mortar has a total size of 4 * data_size, set everything to NaN first and overwrite the
         # parts where local data exists
-        interfaces_data_size = length(mpi_cache.mpi_neighbor_interfaces[rank]) *
+        interfaces_data_size = length(mpi_cache.mpi_neighbor_interfaces.u[rank]) *
                                data_size
-        mortars_data_size = length(mpi_cache.mpi_neighbor_mortars[rank]) * 4 * data_size
+        mortars_data_size = length(mpi_cache.mpi_neighbor_mortars.u[rank]) * 4 *
+                            data_size
         send_buffer[(interfaces_data_size + 1):(interfaces_data_size + mortars_data_size)] .= NaN
 
-        for (index, mortar) in enumerate(mpi_cache.mpi_neighbor_mortars[rank])
+        for (index, mortar) in enumerate(mpi_cache.mpi_neighbor_mortars.u[rank])
             # First and last indices in the send buffer for mortar data obtained from local element
             # in a given position
             index_base = interfaces_data_size + (index - 1) * 4 * data_size
@@ -145,7 +146,7 @@ function start_mpi_send!(mpi_cache::MPICache, mesh, equations, dg, cache)
 
     # Start sending
     for (index, rank) in enumerate(mpi_cache.mpi_neighbor_ranks)
-        mpi_cache.mpi_send_requests[index] = MPI.Isend(mpi_cache.mpi_send_buffers[index],
+        mpi_cache.mpi_send_requests[index] = MPI.Isend(mpi_cache.mpi_send_buffers.u[index],
                                                        rank, mpi_rank(), mpi_comm())
     end
 
@@ -164,9 +165,9 @@ function finish_mpi_receive!(mpi_cache::MPICache, mesh, equations, dg, cache)
 
     # Start receiving and unpack received data until all communication is finished
     data = MPI.Waitany(mpi_cache.mpi_recv_requests)
     while data !== nothing
-        recv_buffer = mpi_cache.mpi_recv_buffers[data]
+        recv_buffer = mpi_cache.mpi_recv_buffers.u[data]
 
-        for (index, interface) in enumerate(mpi_cache.mpi_neighbor_interfaces[data])
+        for (index, interface) in enumerate(mpi_cache.mpi_neighbor_interfaces.u[data])
             first = (index - 1) * data_size + 1
             last = (index - 1) * data_size + data_size
@@ -177,9 +178,9 @@ function finish_mpi_receive!(mpi_cache::MPICache, mesh, equations, dg, cache)
             end
         end
 
-        interfaces_data_size = length(mpi_cache.mpi_neighbor_interfaces[data]) *
+        interfaces_data_size = length(mpi_cache.mpi_neighbor_interfaces.u[data]) *
                                data_size
-        for (index, mortar) in enumerate(mpi_cache.mpi_neighbor_mortars[data])
+        for (index, mortar) in enumerate(mpi_cache.mpi_neighbor_mortars.u[data])
             # First and last indices in the receive buffer for mortar data obtained from remote element
             # in a given position
             index_base = interfaces_data_size + (index - 1) * 4 * data_size
@@ -431,17 +432,19 @@ function init_mpi_neighbor_connectivity(elements, mpi_interfaces, mpi_mortars,
     mortar_ids = collect(1:nmpimortars(mpi_mortars))[p]
 
     # For each neighbor rank, init connectivity data structures
-    mpi_neighbor_interfaces = Vector{Vector{Int}}(undef, length(mpi_neighbor_ranks))
-    mpi_neighbor_mortars = Vector{Vector{Int}}(undef, length(mpi_neighbor_ranks))
+    mpi_neighbor_interfaces = VectorOfArray(Vector{Vector{Int}}(undef,
+                                                                length(mpi_neighbor_ranks)))
+    mpi_neighbor_mortars = VectorOfArray(Vector{Vector{Int}}(undef,
+                                                             length(mpi_neighbor_ranks)))
     for (index, rank) in enumerate(mpi_neighbor_ranks)
-        mpi_neighbor_interfaces[index] = interface_ids[findall(x -> (x == rank),
-                                                               neighbor_ranks_interface)]
-        mpi_neighbor_mortars[index] = mortar_ids[findall(x -> (rank in x),
-                                                         neighbor_ranks_mortar)]
+        mpi_neighbor_interfaces.u[index] = interface_ids[findall(x -> (x == rank),
+                                                                 neighbor_ranks_interface)]
+        mpi_neighbor_mortars.u[index] = mortar_ids[findall(x -> (rank in x),
+                                                           neighbor_ranks_mortar)]
     end
 
     # Sanity checks that we counted all interfaces exactly once
-    @assert sum(length(v) for v in mpi_neighbor_interfaces) ==
+    @assert sum(length(v) for v in mpi_neighbor_interfaces.u) ==
            nmpiinterfaces(mpi_interfaces)
 
     return mpi_neighbor_ranks, mpi_neighbor_interfaces, mpi_neighbor_mortars
diff --git a/src/solvers/dgsem_tree/dg_parallel.jl b/src/solvers/dgsem_tree/dg_parallel.jl
index 3b33c9c33a5..4dbb4c0c91f 100644
--- a/src/solvers/dgsem_tree/dg_parallel.jl
+++ b/src/solvers/dgsem_tree/dg_parallel.jl
@@ -21,19 +21,21 @@ function init_mpi_data_structures(mpi_neighbor_interfaces, mpi_neighbor_mortars,
                                   nvars, n_nodes, uEltype)
     data_size = nvars * n_nodes^(n_dims - 1)
     n_small_elements = 2^(n_dims - 1)
-    mpi_send_buffers = Vector{Vector{uEltype}}(undef, length(mpi_neighbor_interfaces))
-    mpi_recv_buffers = Vector{Vector{uEltype}}(undef, length(mpi_neighbor_interfaces))
+    mpi_send_buffers = VectorOfArray(Vector{Vector{uEltype}}(undef,
+                                                             length(mpi_neighbor_interfaces)))
+    mpi_recv_buffers = VectorOfArray(Vector{Vector{uEltype}}(undef,
+                                                             length(mpi_neighbor_interfaces)))
     for index in 1:length(mpi_neighbor_interfaces)
-        mpi_send_buffers[index] = Vector{uEltype}(undef,
-                                                  length(mpi_neighbor_interfaces[index]) *
-                                                  data_size +
-                                                  length(mpi_neighbor_mortars[index]) *
-                                                  n_small_elements * 2 * data_size)
-        mpi_recv_buffers[index] = Vector{uEltype}(undef,
-                                                  length(mpi_neighbor_interfaces[index]) *
-                                                  data_size +
-                                                  length(mpi_neighbor_mortars[index]) *
-                                                  n_small_elements * 2 * data_size)
+        mpi_send_buffers.u[index] = Vector{uEltype}(undef,
+                                                    length(mpi_neighbor_interfaces.u[index]) *
+                                                    data_size +
+                                                    length(mpi_neighbor_mortars.u[index]) *
+                                                    n_small_elements * 2 * data_size)
+        mpi_recv_buffers.u[index] = Vector{uEltype}(undef,
+                                                    length(mpi_neighbor_interfaces.u[index]) *
+                                                    data_size +
+                                                    length(mpi_neighbor_mortars.u[index]) *
+                                                    n_small_elements * 2 * data_size)
     end
 
     mpi_send_requests = Vector{MPI.Request}(undef, length(mpi_neighbor_interfaces))
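
Migration note (not part of the diff): every hunk above follows the same pattern, so a minimal, self-contained sketch of the new access convention may help downstream code. It is an illustration only and assumes RecursiveArrayTools.jl is available; the name `u_threaded` and the buffer sizes are placeholders, not Trixi.jl API. It shows why call sites changed from `x_threaded[i]` (the old `VecOfArrays` forwarded `getindex`) to `x_threaded.u[i]`, and how `Base.parent` recovers the plain Vector of arrays, as stated in the NEWS.md entry.

```julia
# Minimal sketch (not from the Trixi.jl sources) of the VectorOfArray access
# pattern adopted in this diff. Assumes RecursiveArrayTools.jl; array sizes and
# variable names are placeholders.
using RecursiveArrayTools

# Per-thread work buffers, analogous to `u_threaded` etc. in the diff.
u_threaded = VectorOfArray([zeros(4, 3, 3) for _ in 1:Threads.nthreads()])

# The wrapped Vector of arrays is stored in the `.u` field, hence the new
# `u_threaded.u[Threads.threadid()]` call sites instead of the old
# `u_threaded[Threads.threadid()]` that `VecOfArrays` supported via `getindex`.
u_buffer = u_threaded.u[Threads.threadid()]
u_buffer .= 1.0  # mutates the wrapped array in place; no copy is made

# `Base.parent` unwraps the original Vector of arrays (see the NEWS.md entry).
buffers = parent(u_threaded)
@assert length(buffers) == Threads.nthreads()
```

The same `.u` access applies to the MPI buffers (`mpi_send_buffers.u[rank]` and friends): wrapping in `VectorOfArray` keeps the per-rank vectors intact while replacing the hand-rolled `VecOfArrays` wrapper with a maintained type, which presumably also covers the GPU/`Adapt` use case the deleted file existed for.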