diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 0f8ad475db8..344b8eacc3a 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,3 +1,5 @@ +env: + steps: - label: "CUDA Julia {{matrix.version}}" matrix: @@ -7,12 +9,13 @@ steps: plugins: - JuliaCI/julia#v1: version: "{{matrix.version}}" - command: | - true + - JuliaCI/julia-test#v1: ~ + env: + TRIXI_TEST: "CUDA" agents: queue: "juliagpu" cuda: "*" if: build.message !~ /\[skip ci\]/ timeout_in_minutes: 60 soft_fail: - - exit_status: 3 \ No newline at end of file + - exit_status: 3 diff --git a/.github/workflows/GPUCompat.yml b/.github/workflows/GPUCompat.yml deleted file mode 100644 index 335e1c83c4c..00000000000 --- a/.github/workflows/GPUCompat.yml +++ /dev/null @@ -1,86 +0,0 @@ -name: GPU Package Compatibility - -on: - pull_request: - paths-ignore: - - 'AUTHORS.md' - - 'CITATION.bib' - - 'CONTRIBUTING.md' - - 'LICENSE.md' - - 'NEWS.md' - - 'README.md' - - '.zenodo.json' - - '.github/workflows/benchmark.yml' - - '.github/workflows/CompatHelper.yml' - - '.github/workflows/TagBot.yml' - - 'benchmark/**' - - 'docs/**' - - 'utils/**' - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - test: - if: "!contains(github.event.head_commit.message, 'skip ci')" - name: ${{ matrix.os }} - ${{ matrix.arch }} - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - include: - - version: '1.10' - os: ubuntu-latest - arch: x64 - - version: '1.10' - os: windows-latest - arch: x64 - # CUDA.jl only supports 64-bit Linux and Windows, see https://github.com/JuliaGPU/CUDA.jl?tab=readme-ov-file#requirements - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Set up Julia - uses: julia-actions/setup-julia@v2 - with: - version: ${{ matrix.version }} - arch: ${{ matrix.arch }} - - - name: Display version info - run: julia -e 'using InteractiveUtils; versioninfo(verbose=true)' - - - name: Cache Julia packages - uses: julia-actions/cache@v2 - - - name: Build project - uses: julia-actions/julia-buildpkg@v1 - - # Only CUDA.jl is needed for GPU compatibility test now - - name: Add CUDA.jl to environment - run: | - julia --project=. -e ' - using Pkg; - Pkg.activate(temp=true); - Pkg.develop(PackageSpec(path=pwd())); - Pkg.add("CUDA"); - Pkg.update()' - - # - name: Add Metal.jl to environment - # run: | - # julia --project=. -e ' - # using Pkg; - # Pkg.activate(temp=true); - # Pkg.develop(PackageSpec(path=pwd())); - # Pkg.add("Metal"); - # Pkg.update()' - - # - name: Add AMDGPU.jl to environment - # run: | - # julia --project=. -e ' - # using Pkg; - # Pkg.activate(temp=true); - # Pkg.develop(PackageSpec(path=pwd())); - # Pkg.add("AMDGPU"); - # Pkg.update()' diff --git a/NEWS.md b/NEWS.md index 72f572a74fc..c96b522da94 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,13 @@ Trixi.jl follows the interpretation of used in the Julia ecosystem. Notable changes will be documented in this file for human readability. +## Changes in the v0.12 lifecycle + +#### Added +- Initial support for adapting data-structures between different storage arrays was added. This enables future work to support GPU with Trixi ([#2212]). 
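As a rough illustration of what the NEWS entry above enables (not part of the patch; the keywords are added to `semidiscretize` later in this diff, and GPU array types remain experimental), a semidiscretization could be adapted to GPU storage and single precision roughly like so:

```julia
# Hypothetical usage sketch; requires a CUDA-capable GPU and `using CUDA`.
# Both keywords default to `nothing`, which keeps the previous CPU behavior.
using CUDA
ode = semidiscretize(semi, (0.0, 1.0);
                     storage_type = CuArray,  # adapt container storage to CUDA arrays
                     real_type = Float32)     # convert floating-point data to Float32
```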
+ +#### Deprecated + ## Changes when updating to v0.12 from v0.11.x #### Added diff --git a/Project.toml b/Project.toml index 83bfc35f982..72457d14258 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ version = "0.12.7-DEV" [deps] Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697" +Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" CodeTracking = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2" ConstructionBase = "187b0558-2788-49d3-abe0-74a17ed4e7c9" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" @@ -16,6 +17,7 @@ EllipsisNotation = "da5c29d0-fa7d-589e-88eb-ea29b0a81949" FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LinearMaps = "7a12625a-238d-50fd-b39a-03d52299707e" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" @@ -52,24 +54,31 @@ TrixiBase = "9a0f1c46-06d5-4909-a5a3-ce25d3fa3284" UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [weakdeps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Convex = "f65535da-76fb-5f13-bab9-19810c17039a" ECOS = "e2685f51-7e38-5353-a97d-a921fd2c8199" Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a" NLsolve = "2774e3e8-f4cf-5e23-947b-6d7e65073b56" [extensions] +TrixiAMDGPUExt = "AMDGPU" +TrixiCUDAExt = "CUDA" TrixiConvexECOSExt = ["Convex", "ECOS"] TrixiMakieExt = "Makie" TrixiNLsolveExt = "NLsolve" [compat] +AMDGPU = "1.3.5" Accessors = "0.1.36" +Adapt = "4" +CUDA = "5.8" CodeTracking = "1.0.5" ConstructionBase = "1.5" Convex = "0.16" DataStructures = "0.18.15" DelimitedFiles = "1" -DiffEqBase = "6.154" +DiffEqBase = "6.155.2" DiffEqCallbacks = "2.35, 3, 4" Downloads = "1.6" ECOS = "1.1.2" @@ -77,6 +86,7 @@ EllipsisNotation = "1.0" FillArrays = "1.9" ForwardDiff = "0.10.36, 1" HDF5 = "0.16.10, 0.17" +KernelAbstractions = "0.9.36" LinearAlgebra = "1" LinearMaps = "2.7, 3.0" LoopVectorization = "0.12.171" @@ -94,7 +104,7 @@ Printf = "1" RecipesBase = "1.3.4" RecursiveArrayTools = "3.31.1" Reexport = "1.2" -Requires = "1.1" +Requires = "1.3" SciMLBase = "2.67.0" SimpleUnPack = "1.1" SparseArrays = "1" @@ -104,7 +114,7 @@ Static = "1.1.1" StaticArrayInterface = "1.5.1" StaticArrays = "1.9" StrideArrays = "0.1.29" -StructArrays = "0.6.18, 0.7" +StructArrays = "0.6.20, 0.7" SummationByPartsOperators = "0.5.52" T8code = "0.7.4" TimerOutputs = "0.5.23" diff --git a/docs/Project.toml b/docs/Project.toml index f2623d6c1ba..0b6b0cc7392 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,4 +1,5 @@ [deps] +Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" Changelog = "5217a498-cd5d-4ec6-b8c2-9b85a09b6e3e" Convex = "f65535da-76fb-5f13-bab9-19810c17039a" @@ -16,9 +17,13 @@ OrdinaryDiffEqSSPRK = "669c94d9-1f4b-4b64-b377-1aa079aa2388" OrdinaryDiffEqTsit5 = "b1df2697-797e-41e3-8120-5422d3b24e4a" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +Trixi = "a7f1ee26-1774-49b1-8366-f1abc58fbfcb" Trixi2Vtk = "bc1476a1-1ca6-4cc3-950b-c312b255ff95" TrixiBase = "9a0f1c46-06d5-4909-a5a3-ce25d3fa3284" +[sources] +Trixi = {path = ".."} + [compat] CairoMakie = "0.12, 0.13, 0.14, 0.15" Changelog = "1.1" diff --git a/docs/make.jl b/docs/make.jl index 7111b66ab94..0301f5ba64e 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -163,7 +163,8 @@ makedocs( "Style guide" => "styleguide.md", "Testing" => "testing.md", 
"Performance" => "performance.md", - "Parallelization" => "parallelization.md" + "Parallelization" => "parallelization.md", + "Heterogeneous" => "heterogeneous.md" ], "Troubleshooting and FAQ" => "troubleshooting.md", "Reference" => [ diff --git a/docs/src/heterogeneous.md b/docs/src/heterogeneous.md new file mode 100644 index 00000000000..451945ab6cd --- /dev/null +++ b/docs/src/heterogeneous.md @@ -0,0 +1,163 @@ +# Heterogeneous computing + +Support for heterogeneous computing is currently being worked on. + +## The use of Adapt.jl + +[Adapt.jl](https://github.com/JuliaGPU/Adapt.jl) is a package in the +[JuliaGPU](https://github.com/JuliaGPU) family that allows for +the translation of nested data structures. The primary goal is to allow the substitution of `Array` +at the storage level with a GPU array like `CuArray` from [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl). + +To facilitate this, data structures must be parameterized, so instead of: + +```julia +struct Container <: Trixi.AbstractContainer + data::Array{Float64, 2} +end +``` + +They must be written as: + +```jldoctest adapt; output = false, setup=:(import Trixi) +struct Container{D<:AbstractArray} <: Trixi.AbstractContainer + data::D +end + +# output + +``` + +furthermore, we need to define a function that allows for the conversion of storage +of our types: + +```jldoctest adapt; output = false +using Adapt + +function Adapt.adapt_structure(to, C::Container) + return Container(adapt(to, C.data)) +end + +# output + +``` + +or simply + +```julia +Adapt.@adapt_structure(Container) +``` + +additionally, we must define `Adapt.parent_type`. + +```jldoctest adapt; output = false +function Adapt.parent_type(::Type{<:Container{D}}) where D + return D +end + +# output + +``` + +All together we can use this machinery to perform conversions of a container. + +```jldoctest adapt +julia> C = Container(zeros(3)) +Container{Vector{Float64}}([0.0, 0.0, 0.0]) + +julia> Trixi.storage_type(C) +Array +``` + + +```julia-repl +julia> using CUDA + +julia> GPU_C = adapt(CuArray, C) +Container{CuArray{Float64, 1, CUDA.DeviceMemory}}([0.0, 0.0, 0.0]) + +julia> Trixi.storage_type(C) +CuArray +``` + +## Element-type conversion with `Trixi.trixi_adapt`. + +We can use [`Trixi.trixi_adapt`](@ref) to perform both an element-type and a storage-type adoption: + +```jldoctest adapt +julia> C = Container(zeros(3)) +Container{Vector{Float64}}([0.0, 0.0, 0.0]) + +julia> Trixi.trixi_adapt(Array, Float32, C) +Container{Vector{Float32}}(Float32[0.0, 0.0, 0.0]) +``` + +```julia-repl +julia> Trixi.trixi_adapt(CuArray, Float32, C) +Container{CuArray{Float32, 1, CUDA.DeviceMemory}}(Float32[0.0, 0.0, 0.0]) +``` + +!!! note + `adapt(Array{Float32}, C)` is tempting, but it will do the wrong thing + in the presence of `SVector`s and similar arrays from StaticArrays.jl. + + +## Writing GPU kernels + +Offloading computations to the GPU is done with +[KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl), +allowing for vendor-agnostic GPU code. + +### Example + +Given the following Trixi.jl code, which would typically be called from within `rhs!`: + +```julia +function trixi_rhs_fct(mesh, equations, solver, cache, args) + @threaded for element in eachelement(solver, cache) + # code + end +end +``` + +1. Put the inner code in a new function `rhs_fct_per_element`. Besides the index + `element`, pass all required fields as arguments, but make sure to `@unpack` them from + their structs in advance. + +2. 
Where `trixi_rhs_fct` is called, get the backend, i.e., the hardware we are currently + running on via `trixi_backend(x)`. + This will, e.g., work with `u_ode`. Internally, KernelAbstractions.jl's `get_backend` + will be called, i.e., KernelAbstractions.jl has to know the type of `x`. + + ```julia + backend = trixi_backend(u_ode) + ``` + +3. Add a new argument `backend` to `trixi_rhs_fct` used for dispatch. + When `backend` is `nothing`, the legacy implementation should be used: + ```julia + function trixi_rhs_fct(backend::Nothing, mesh, equations, solver, cache, args) + @unpack unpacked_args = cache + @threaded for element in eachelement(solver, cache) + rhs_fct_per_element(element, unpacked_args, args) + end + end + ``` + +4. When `backend` is a `Backend` (a type defined by KernelAbstractions.jl), write a + KernelAbstractions.jl kernel: + ```julia + function trixi_rhs_fct(backend::Backend, mesh, equations, solver, cache, args) + nelements(solver, cache) == 0 && return nothing # return early when there are no elements + @unpack unpacked_args = cache + kernel! = rhs_fct_kernel!(backend) + kernel!(unpacked_args, args, + ndrange = nelements(solver, cache)) + return nothing + end + + @kernel function rhs_fct_kernel!(unpacked_args, args) + element = @index(Global) + rhs_fct_per_element(element, unpacked_args, args) + end + ``` \ No newline at end of file diff --git a/docs/src/styleguide.md b/docs/src/styleguide.md index 27210f951c0..07f2d90cddc 100644 --- a/docs/src/styleguide.md +++ b/docs/src/styleguide.md @@ -20,6 +20,7 @@ conventions, we apply and enforce automated source code formatting * The main modified argument comes first. For example, if the right-hand side `du` is modified, it should come first. If only the `cache` is modified, e.g., in `prolong2interfaces!` and its siblings, put the `cache` first. + * Some internal functions take a "computational backend" argument, this should always be passed as the first argument. * Otherwise, use the order `mesh, equations, solver, cache`. * If something needs to be specified in more detail for dispatch, put the additional argument before the general one that is specified in more detail. For example, we use `have_nonconservative_terms(equations), equations` diff --git a/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl b/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl new file mode 100644 index 00000000000..6f9e8e56986 --- /dev/null +++ b/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl @@ -0,0 +1,61 @@ +# The same setup as tree_2d_dgsem/elixir_advection_basic.jl +# to verify GPU support and Adapt.jl support. 
+ +using OrdinaryDiffEqLowStorageRK +using Trixi + +############################################################################### +# semidiscretization of the linear advection equation + +advection_velocity = (0.2, -0.7) +equations = LinearScalarAdvectionEquation2D(advection_velocity) + +# Create DG solver with polynomial degree = 3 and (local) Lax-Friedrichs/Rusanov flux as surface flux +solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs) + +coordinates_min = (-1.0, -1.0) # minimum coordinates (min(x), min(y)) +coordinates_max = (1.0, 1.0) # maximum coordinates (max(x), max(y)) + +trees_per_dimension = (8, 8) + +# Create P4estMesh with 8 x 8 trees and 16 x 16 elements +mesh = P4estMesh(trees_per_dimension, polydeg = 3, + coordinates_min = coordinates_min, coordinates_max = coordinates_max, + initial_refinement_level = 1) + +# A semidiscretization collects data structures and functions for the spatial discretization +semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition_convergence_test, + solver) + +############################################################################### +# ODE solvers, callbacks etc. + +# Create ODE problem with time span from 0.0 to 1.0 +ode = semidiscretize(semi, (0.0, 1.0); real_type = nothing, storage_type = nothing) + +# At the beginning of the main loop, the SummaryCallback prints a summary of the simulation setup +# and resets the timers +summary_callback = SummaryCallback() + +# The AnalysisCallback allows to analyse the solution in regular intervals and prints the results +analysis_callback = AnalysisCallback(semi, interval = 100) + +# The SaveSolutionCallback allows to save the solution to a file in regular intervals +save_solution = SaveSolutionCallback(interval = 100, + solution_variables = cons2prim) + +# The StepsizeCallback handles the re-calculation of the maximum Δt after each time step +stepsize_callback = StepsizeCallback(cfl = 1.6) + +# Create a CallbackSet to collect all callbacks such that they can be passed to the ODE solver +callbacks = CallbackSet(summary_callback, stepsize_callback) +# TODO: GPU. 
The `analysis_callback` needs to be updated for GPU support +# analysis_callback, save_solution, stepsize_callback) + +############################################################################### +# run the simulation + +# OrdinaryDiffEq's `solve` method evolves the solution in time and executes the passed callbacks +sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false); + dt = 1e-2, # solve needs some value here but it will be overwritten by the stepsize_callback + ode_default_options()..., callback = callbacks); diff --git a/ext/TrixiAMDGPUExt.jl b/ext/TrixiAMDGPUExt.jl new file mode 100644 index 00000000000..f3f37769fcd --- /dev/null +++ b/ext/TrixiAMDGPUExt.jl @@ -0,0 +1,19 @@ +# Package extension for adding AMDGPU-based features to Trixi.jl +module TrixiAMDGPUExt + +import AMDGPU: ROCArray +import Trixi + +function Trixi.storage_type(::Type{<:ROCArray}) + return ROCArray +end + +function Trixi.unsafe_wrap_or_alloc(to::Type{<:ROCArray}, vector, size) + if length(vector) == 0 + return similar(vector, size) + else + return unsafe_wrap(to, pointer(vector), size, lock = false) + end +end + +end diff --git a/ext/TrixiCUDAExt.jl b/ext/TrixiCUDAExt.jl new file mode 100644 index 00000000000..681d2f53a1e --- /dev/null +++ b/ext/TrixiCUDAExt.jl @@ -0,0 +1,11 @@ +# Package extension for adding CUDA-based features to Trixi.jl +module TrixiCUDAExt + +import CUDA: CuArray +import Trixi + +function Trixi.storage_type(::Type{<:CuArray}) + return CuArray +end + +end diff --git a/src/Trixi.jl b/src/Trixi.jl index a707437655e..dda4bc021f5 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -20,6 +20,7 @@ const _PREFERENCE_SQRT = @load_preference("sqrt", "sqrt_Trixi_NaN") const _PREFERENCE_LOG = @load_preference("log", "log_Trixi_NaN") const _PREFERENCE_POLYESTER = @load_preference("polyester", true) const _PREFERENCE_LOOPVECTORIZATION = @load_preference("loop_vectorization", true) +const _PREFERENCE_USE_NATIVE_THREADING = @load_preference("native_threading", true) # Include other packages that are used in Trixi.jl # (standard library packages first, other packages next, all of them sorted alphabetically) @@ -50,6 +51,7 @@ import SciMLBase: get_du, get_tmp_cache, u_modified!, using DelimitedFiles: readdlm using Downloads: Downloads +using Adapt: Adapt, adapt using CodeTracking: CodeTracking using ConstructionBase: ConstructionBase using DiffEqBase: DiffEqBase, get_tstops, get_tstops_array @@ -58,6 +60,7 @@ using DiffEqCallbacks: PeriodicCallback, PeriodicCallbackAffect using FillArrays: Ones, Zeros using ForwardDiff: ForwardDiff using HDF5: HDF5, h5open, attributes, create_dataset, datatype, dataspace +using KernelAbstractions: KernelAbstractions, @index, @kernel, get_backend, Backend using LinearMaps: LinearMap if _PREFERENCE_LOOPVECTORIZATION using LoopVectorization: LoopVectorization, @turbo, indices @@ -132,6 +135,7 @@ include("basic_types.jl") # Include all top-level source files include("auxiliary/auxiliary.jl") +include("auxiliary/vector_of_arrays.jl") include("auxiliary/mpi.jl") include("auxiliary/p4est.jl") include("auxiliary/t8code.jl") diff --git a/src/auxiliary/containers.jl b/src/auxiliary/containers.jl index 90650f6abcf..aeeddcbe185 100644 --- a/src/auxiliary/containers.jl +++ b/src/auxiliary/containers.jl @@ -314,4 +314,135 @@ end function raw_copy!(c::AbstractContainer, from::Int, destination::Int) raw_copy!(c, c, from, from, destination) end + +# Trixi storage types must implement these two Adapt.jl methods +function Adapt.adapt_structure(to, c::AbstractContainer) + 
error("Interface: Must implement Adapt.adapt_structure(to, ::$(typeof(c)))") +end + +function Adapt.parent_type(C::Type{<:AbstractContainer}) + error("Interface: Must implement Adapt.parent_type(::Type{$C}") +end + +function Adapt.unwrap_type(C::Type{<:AbstractContainer}) + return Adapt.unwrap_type(Adapt.parent_type(C)) +end + +# TODO: Upstream to Adapt +""" + storage_type(x) + +Return the storage type of `x`, which is a concrete array type, such as `Array`, `CuArray`, or `ROCArray`. +""" +function storage_type(x) + return storage_type(typeof(x)) +end + +function storage_type(T::Type) + error("Interface: Must implement storage_type(::Type{$T}") +end + +function storage_type(::Type{<:Array}) + Array +end + +function storage_type(C::Type{<:AbstractContainer}) + return storage_type(Adapt.unwrap_type(C)) +end + +# backend handling +""" + trixi_backend(x) + +Return the computational backend for `x`, which is either a KernelAbstractions backend or `nothing`. +If the backend is `nothing`, the default multi-threaded CPU backend is used. +""" +function trixi_backend(x) + # TODO: https://github.com/trixi-framework/Trixi.jl/pull/2417 + if (_PREFERENCE_POLYESTER && LoopVectorization.check_args(x)) || + (_PREFERENCE_USE_NATIVE_THREADING && get_backend(x) isa KernelAbstractions.CPU) + return nothing + end + return get_backend(x) +end + +# TODO: After https://github.com/SciML/RecursiveArrayTools.jl/pull/455 we need to investigate the right way to handle StaticArray as uEltype for MultiDG. +function trixi_backend(x::VectorOfArray) + u = parent(x) + # FIXME(vchuravy): This is a workaround because KA.get_backend is ambivalent of where a SArray is residing. + if eltype(u) <: StaticArrays.StaticArray + return nothing + end + if length(u) == 0 + error("VectorOfArray is empty, cannot determine backend.") + end + # Use the backend of the first element in the parent array + return get_backend(u[1]) +end + +# For some storage backends like CUDA.jl, empty arrays do seem to simply be +# null pointers which can cause `unsafe_wrap` to fail when calling +# Adapt.adapt (ArgumentError, see +# https://github.com/JuliaGPU/CUDA.jl/blob/v5.4.2/src/array.jl#L212-L229). +# To circumvent this, on length zero arrays this allocates +# a separate empty array instead of wrapping. +# However, since zero length arrays are not used in calculations, +# it should be okay if the underlying storage vectors and wrapped arrays +# are not the same as long as they are properly wrapped when `resize!`d etc. +function unsafe_wrap_or_alloc(to, vector, size) + if length(vector) == 0 + return similar(vector, size) + else + return unsafe_wrap(to, pointer(vector), size) + end +end + +struct TrixiAdaptor{Storage, RealT} end + +""" + trixi_adapt(Storage, RealT, x) + +Adapt `x` to the storage type `Storage` and real type `RealT`. +""" +function trixi_adapt(Storage, RealT, x) + adapt(TrixiAdaptor{Storage, RealT}(), x) +end + +# Custom rules +# 1. handling of StaticArrays +function Adapt.adapt_storage(::TrixiAdaptor{<:Any, RealT}, + x::StaticArrays.StaticArray) where {RealT} + StaticArrays.similar_type(x, RealT)(x) +end + +# 2. 
Handling of Arrays +function Adapt.adapt_storage(::TrixiAdaptor{Storage, RealT}, + x::AbstractArray{T}) where {Storage, RealT, + T <: AbstractFloat} + adapt(Storage{RealT}, x) +end + +function Adapt.adapt_storage(::TrixiAdaptor{Storage, RealT}, + x::AbstractArray{T}) where {Storage, RealT, + T <: StaticArrays.StaticArray} + adapt(Storage{StaticArrays.similar_type(T, RealT)}, x) +end + +# Our threaded cache contains MArray, it is unlikely that we would want to adapt those +function Adapt.adapt_storage(::TrixiAdaptor{Storage, RealT}, + x::Array{T}) where {Storage, RealT, + T <: StaticArrays.MArray} + adapt(Array{StaticArrays.similar_type(T, RealT)}, x) +end + +function Adapt.adapt_storage(::TrixiAdaptor{Storage, RealT}, + x::AbstractArray) where {Storage, RealT} + adapt(Storage, x) +end + +# 3. TODO: Should we have a fallback? But that would imply implementing things for NamedTuple again + +function unsafe_wrap_or_alloc(::TrixiAdaptor{Storage}, vec, size) where {Storage} + return unsafe_wrap_or_alloc(Storage, vec, size) +end end # @muladd diff --git a/src/auxiliary/vector_of_arrays.jl b/src/auxiliary/vector_of_arrays.jl new file mode 100644 index 00000000000..0fa8dd7f1ec --- /dev/null +++ b/src/auxiliary/vector_of_arrays.jl @@ -0,0 +1,31 @@ +# By default, Julia/LLVM does not use fused multiply-add operations (FMAs). +# Since these FMAs can increase the performance of many numerical algorithms, +# we need to opt-in explicitly. +# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details. +@muladd begin +#! format: noindent + +# Wraps a Vector of Arrays, forwards `getindex` to the underlying Vector. +# Implements `Adapt.adapt_structure` to allow offloading to the GPU which is +# not possible for a plain Vector of Arrays. +struct VecOfArrays{T <: AbstractArray} + arrays::Vector{T} +end +Base.getindex(v::VecOfArrays, i::Int) = Base.getindex(v.arrays, i) +Base.IndexStyle(v::VecOfArrays) = Base.IndexStyle(v.arrays) +Base.size(v::VecOfArrays) = Base.size(v.arrays) +Base.length(v::VecOfArrays) = Base.length(v.arrays) +Base.eltype(v::VecOfArrays{T}) where {T} = T +function Adapt.adapt_structure(to, v::VecOfArrays) + return VecOfArrays([Adapt.adapt(to, arr) for arr in v.arrays]) +end +function Adapt.parent_type(::Type{<:VecOfArrays{T}}) where {T} + return T +end +function Adapt.unwrap_type(A::Type{<:VecOfArrays}) + Adapt.unwrap_type(Adapt.parent_type(A)) +end +function Base.convert(::Type{<:VecOfArrays}, v::Vector{<:AbstractArray}) + VecOfArrays(v) +end +end # @muladd diff --git a/src/semidiscretization/semidiscretization.jl b/src/semidiscretization/semidiscretization.jl index cc3900d42da..3f8823ea17c 100644 --- a/src/semidiscretization/semidiscretization.jl +++ b/src/semidiscretization/semidiscretization.jl @@ -82,9 +82,15 @@ end Wrap the semidiscretization `semi` as an ODE problem in the time interval `tspan` that can be passed to `solve` from the [SciML ecosystem](https://diffeq.sciml.ai/latest/). + +The optional keyword arguments `storage_type` and `real_type` configure the underlying computational +datastructures. `storage_type` changes the fundamental array type being used, allowing the +experimental use of `CuArray` or other GPU array types. `real_type` changes the computational data type being used. """ function semidiscretize(semi::AbstractSemidiscretization, tspan; - reset_threads = true) + reset_threads = true, + storage_type = nothing, + real_type = nothing) # Optionally reset Polyester.jl threads. 
See # https://github.com/trixi-framework/Trixi.jl/issues/1583 # https://github.com/JuliaSIMD/Polyester.jl/issues/30 @@ -92,6 +98,19 @@ function semidiscretize(semi::AbstractSemidiscretization, tspan; Polyester.reset_threads!() end + if !(storage_type === nothing && real_type === nothing) + if storage_type === nothing + storage_type = Array + end + if real_type === nothing + real_type = real(semi) + end + semi = trixi_adapt(storage_type, real_type, semi) + if eltype(tspan) !== real_type + tspan = convert.(real_type, tspan) + end + end + u0_ode = compute_coefficients(first(tspan), semi) # TODO: MPI, do we want to synchronize loading and print debug statements, e.g. using # mpi_isparallel() && MPI.Barrier(mpi_comm()) @@ -155,9 +174,10 @@ end Same as [`compute_coefficients`](@ref) but stores the result in `u_ode`. """ function compute_coefficients!(u_ode, func, t, semi::AbstractSemidiscretization) + backend = trixi_backend(u_ode) u = wrap_array(u_ode, semi) # Call `compute_coefficients` defined by the solver - compute_coefficients!(u, func, t, mesh_equations_solver_cache(semi)...) + compute_coefficients!(backend, u, func, t, mesh_equations_solver_cache(semi)...) end """ diff --git a/src/semidiscretization/semidiscretization_hyperbolic.jl b/src/semidiscretization/semidiscretization_hyperbolic.jl index 7496a345661..2a563c02229 100644 --- a/src/semidiscretization/semidiscretization_hyperbolic.jl +++ b/src/semidiscretization/semidiscretization_hyperbolic.jl @@ -27,25 +27,6 @@ mutable struct SemidiscretizationHyperbolic{Mesh, Equations, InitialCondition, solver::Solver cache::Cache performance_counter::PerformanceCounter - - function SemidiscretizationHyperbolic{Mesh, Equations, InitialCondition, - BoundaryConditions, SourceTerms, Solver, - Cache}(mesh::Mesh, equations::Equations, - initial_condition::InitialCondition, - boundary_conditions::BoundaryConditions, - source_terms::SourceTerms, - solver::Solver, - cache::Cache) where {Mesh, Equations, - InitialCondition, - BoundaryConditions, - SourceTerms, - Solver, - Cache} - performance_counter = PerformanceCounter() - - new(mesh, equations, initial_condition, boundary_conditions, source_terms, - solver, cache, performance_counter) - end end """ @@ -71,6 +52,8 @@ function SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver check_periodicity_mesh_boundary_conditions(mesh, _boundary_conditions) + performance_counter = PerformanceCounter() + SemidiscretizationHyperbolic{typeof(mesh), typeof(equations), typeof(initial_condition), typeof(_boundary_conditions), typeof(source_terms), @@ -78,9 +61,13 @@ function SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver initial_condition, _boundary_conditions, source_terms, solver, - cache) + cache, + performance_counter) end +# @eval due to @muladd +@eval Adapt.@adapt_structure(SemidiscretizationHyperbolic) + # Create a new semidiscretization but change some parameters compared to the input. # `Base.similar` follows a related concept but would require us to `copy` the `mesh`, # which would impact the performance. 
Instead, `SciMLBase.remake` has exactly the diff --git a/src/solvers/dg.jl b/src/solvers/dg.jl index ad211b3c003..3ed10ec2d46 100644 --- a/src/solvers/dg.jl +++ b/src/solvers/dg.jl @@ -415,6 +415,9 @@ struct DG{Basis, Mortar, SurfaceIntegral, VolumeIntegral} volume_integral::VolumeIntegral end +# @eval due to @muladd +@eval Adapt.@adapt_structure(DG) + function Base.show(io::IO, dg::DG) @nospecialize dg # reduce precompilation time @@ -639,8 +642,11 @@ include("fdsbp_unstructured/fdsbp.jl") function allocate_coefficients(mesh::AbstractMesh, equations, dg::DG, cache) # We must allocate a `Vector` in order to be able to `resize!` it (AMR). # cf. wrap_array - zeros(eltype(cache.elements), - nvariables(equations) * nnodes(dg)^ndims(mesh) * nelements(dg, cache)) + u_ode = similar(cache.elements.node_coordinates, eltype(cache.elements), + nvariables(equations) * nnodes(dg)^ndims(mesh) * + nelements(dg, cache)) + fill!(u_ode, zero(eltype(u_ode))) + return u_ode end @inline function wrap_array(u_ode::AbstractVector, mesh::AbstractMesh, equations, @@ -683,7 +689,8 @@ end # (nvariables(equations), ntuple(_ -> nnodes(dg), ndims(mesh))..., nelements(dg, cache))) else # The following version is reasonably fast and allows us to `resize!(u_ode, ...)`. - unsafe_wrap(Array{eltype(u_ode), ndims(mesh) + 2}, pointer(u_ode), + ArrayType = Trixi.storage_type(u_ode) + unsafe_wrap(ArrayType{eltype(u_ode), ndims(mesh) + 2}, pointer(u_ode), (nvariables(equations), ntuple(_ -> nnodes(dg), ndims(mesh))..., nelements(dg, cache))) end @@ -732,8 +739,8 @@ end nelements(dg, cache))) end -function compute_coefficients!(u, func, t, mesh::AbstractMesh{1}, equations, dg::DG, - cache) +function compute_coefficients!(backend::Nothing, u, func, t, mesh::AbstractMesh{1}, + equations, dg::DG, cache) @threaded for element in eachelement(dg, cache) for i in eachnode(dg) x_node = get_node_coords(cache.elements.node_coordinates, equations, dg, i, @@ -753,20 +760,43 @@ function compute_coefficients!(u, func, t, mesh::AbstractMesh{1}, equations, dg: end end -function compute_coefficients!(u, func, t, mesh::AbstractMesh{2}, equations, dg::DG, - cache) +function compute_coefficients!(backend::Nothing, u, func, t, mesh::AbstractMesh{2}, + equations, dg::DG, cache) + @unpack node_coordinates = cache.elements @threaded for element in eachelement(dg, cache) - for j in eachnode(dg), i in eachnode(dg) - x_node = get_node_coords(cache.elements.node_coordinates, equations, dg, i, - j, element) - u_node = func(x_node, t, equations) - set_node_vars!(u, u_node, equations, dg, i, j, element) - end + compute_coefficients_element!(u, func, t, equations, dg, node_coordinates, + element) + end +end + +function compute_coefficients!(backend::Backend, u, func, t, mesh::AbstractMesh{2}, + equations, dg::DG, cache) + nelements(dg, cache) == 0 && return nothing + @unpack node_coordinates = cache.elements + kernel! 
= compute_coefficients_kernel!(backend) + kernel!(u, func, t, equations, dg, node_coordinates, + ndrange = nelements(dg, cache)) + return nothing +end + +@kernel function compute_coefficients_kernel!(u, func, t, equations, + dg::DG, node_coordinates) + element = @index(Global) + compute_coefficients_element!(u, func, t, equations, dg, node_coordinates, element) +end + +function compute_coefficients_element!(u, func, t, equations, dg::DG, + node_coordinates, element) + for j in eachnode(dg), i in eachnode(dg) + x_node = get_node_coords(node_coordinates, equations, dg, i, + j, element) + u_node = func(x_node, t, equations) + set_node_vars!(u, u_node, equations, dg, i, j, element) end end -function compute_coefficients!(u, func, t, mesh::AbstractMesh{3}, equations, dg::DG, - cache) +function compute_coefficients!(backend::Nothing, u, func, t, mesh::AbstractMesh{3}, + equations, dg::DG, cache) @threaded for element in eachelement(dg, cache) for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) x_node = get_node_coords(cache.elements.node_coordinates, equations, dg, i, diff --git a/src/solvers/dgmulti/dg.jl b/src/solvers/dgmulti/dg.jl index b2a547ac64d..f5a995cba3f 100644 --- a/src/solvers/dgmulti/dg.jl +++ b/src/solvers/dgmulti/dg.jl @@ -209,7 +209,7 @@ function create_cache(mesh::DGMultiMesh{NDIMS}, equations, dg::DGMultiWeakForm, local_values_threaded, flux_threaded, rotated_flux_threaded) end -function compute_coefficients!(u, initial_condition, t, +function compute_coefficients!(::Nothing, u, initial_condition, t, mesh::DGMultiMesh, equations, dg::DGMulti, cache) md = mesh.md rd = dg.basis diff --git a/src/solvers/dgsem/basis_lobatto_legendre.jl b/src/solvers/dgsem/basis_lobatto_legendre.jl index 777348aa8ce..9647f172e20 100644 --- a/src/solvers/dgsem/basis_lobatto_legendre.jl +++ b/src/solvers/dgsem/basis_lobatto_legendre.jl @@ -34,6 +34,32 @@ struct LobattoLegendreBasis{RealT <: Real, NNODES, # negative adjoint wrt the SBP dot product end +function Adapt.adapt_structure(to, basis::LobattoLegendreBasis) + inverse_vandermonde_legendre = adapt(to, basis.inverse_vandermonde_legendre) + RealT = eltype(inverse_vandermonde_legendre) + + nodes = SVector{<:Any, RealT}(basis.nodes) + weights = SVector{<:Any, RealT}(basis.weights) + inverse_weights = SVector{<:Any, RealT}(basis.inverse_weights) + boundary_interpolation = adapt(to, basis.boundary_interpolation) + derivative_matrix = adapt(to, basis.derivative_matrix) + derivative_split = adapt(to, basis.derivative_split) + derivative_split_transpose = adapt(to, basis.derivative_split_transpose) + derivative_dhat = adapt(to, basis.derivative_dhat) + return LobattoLegendreBasis{RealT, nnodes(basis), typeof(nodes), + typeof(inverse_vandermonde_legendre), + typeof(boundary_interpolation), + typeof(derivative_matrix)}(nodes, + weights, + inverse_weights, + inverse_vandermonde_legendre, + boundary_interpolation, + derivative_matrix, + derivative_split, + derivative_split_transpose, + derivative_dhat) +end + function LobattoLegendreBasis(RealT, polydeg::Integer) nnodes_ = polydeg + 1 @@ -155,6 +181,17 @@ struct LobattoLegendreMortarL2{RealT <: Real, NNODES, reverse_lower::ReverseMatrix end +function Adapt.adapt_structure(to, mortar::LobattoLegendreMortarL2) + forward_upper = adapt(to, mortar.forward_upper) + forward_lower = adapt(to, mortar.forward_lower) + reverse_upper = adapt(to, mortar.reverse_upper) + reverse_lower = adapt(to, mortar.reverse_lower) + return LobattoLegendreMortarL2{eltype(forward_upper), nnodes(mortar), + typeof(forward_upper), + 
typeof(reverse_upper)}(forward_upper, forward_lower, + reverse_upper, reverse_lower) +end + function MortarL2(basis::LobattoLegendreBasis) RealT = real(basis) nnodes_ = nnodes(basis) diff --git a/src/solvers/dgsem_p4est/containers.jl b/src/solvers/dgsem_p4est/containers.jl index a070db6b701..59942bc49a7 100644 --- a/src/solvers/dgsem_p4est/containers.jl +++ b/src/solvers/dgsem_p4est/containers.jl @@ -6,25 +6,31 @@ #! format: noindent mutable struct P4estElementContainer{NDIMS, RealT <: Real, uEltype <: Real, NDIMSP1, - NDIMSP2, NDIMSP3} <: AbstractContainer + NDIMSP2, NDIMSP3, + ArrayNDIMSP1 <: DenseArray{RealT, NDIMSP1}, + ArrayNDIMSP2 <: DenseArray{RealT, NDIMSP2}, + ArrayNDIMSP3 <: DenseArray{RealT, NDIMSP3}, + VectorRealT <: DenseVector{RealT}, + VectoruEltype <: DenseVector{uEltype}} <: + AbstractContainer # Physical coordinates at each node - node_coordinates::Array{RealT, NDIMSP2} # [orientation, node_i, node_j, node_k, element] + node_coordinates::ArrayNDIMSP2 # [orientation, node_i, node_j, node_k, element] # Jacobian matrix of the transformation # [jacobian_i, jacobian_j, node_i, node_j, node_k, element] where jacobian_i is the first index of the Jacobian matrix,... - jacobian_matrix::Array{RealT, NDIMSP3} + jacobian_matrix::ArrayNDIMSP3 # Contravariant vectors, scaled by J, in Kopriva's blue book called Ja^i_n (i index, n dimension) - contravariant_vectors::Array{RealT, NDIMSP3} # [dimension, index, node_i, node_j, node_k, element] + contravariant_vectors::ArrayNDIMSP3 # [dimension, index, node_i, node_j, node_k, element] # 1/J where J is the Jacobian determinant (determinant of Jacobian matrix) - inverse_jacobian::Array{RealT, NDIMSP1} # [node_i, node_j, node_k, element] + inverse_jacobian::ArrayNDIMSP1 # [node_i, node_j, node_k, element] # Buffer for calculated surface flux - surface_flux_values::Array{uEltype, NDIMSP2} # [variable, i, j, direction, element] + surface_flux_values::ArrayNDIMSP2 # [variable, i, j, direction, element] # internal `resize!`able storage - _node_coordinates::Vector{RealT} - _jacobian_matrix::Vector{RealT} - _contravariant_vectors::Vector{RealT} - _inverse_jacobian::Vector{RealT} - _surface_flux_values::Vector{uEltype} + _node_coordinates::VectorRealT + _jacobian_matrix::VectorRealT + _contravariant_vectors::VectorRealT + _inverse_jacobian::VectorRealT + _surface_flux_values::VectoruEltype end @inline function nelements(elements::P4estElementContainer) @@ -36,7 +42,7 @@ end RealT, uEltype } - uEltype + return uEltype end # Only one-dimensional `Array`s are `resize!`able in Julia. 
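For reviewers of the container changes that follow: Trixi.jl containers keep flat, `resize!`-able 1-D vectors (`_u`, `_node_coordinates`, ...) and expose multi-dimensional fields that merely wrap the same memory, since only 1-D `Array`s can be `resize!`d. The sketch below (CPU-only, illustrative names, not part of the patch) shows that pattern; the `unsafe_wrap_or_alloc` helper introduced in `src/auxiliary/containers.jl` generalizes the `unsafe_wrap` call so that zero-length GPU vectors fall back to a fresh allocation instead of wrapping a null pointer.

```julia
# Minimal sketch of the flat-storage + wrapped-view pattern used by the containers.
_u = zeros(Float64, 3 * 4)                    # flat, resize!-able backing storage
u  = unsafe_wrap(Array, pointer(_u), (3, 4))  # multi-dimensional view of the same memory

resize!(_u, 3 * 6)                            # e.g., AMR created additional elements
u = unsafe_wrap(Array, pointer(_u), (3, 6))   # re-wrap: resize! may have moved the buffer
```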
@@ -51,31 +57,41 @@ function Base.resize!(elements::P4estElementContainer, capacity) n_dims = ndims(elements) n_nodes = size(elements.node_coordinates, 2) n_variables = size(elements.surface_flux_values, 1) + ArrayType = storage_type(elements) resize!(_node_coordinates, n_dims * n_nodes^n_dims * capacity) - elements.node_coordinates = unsafe_wrap(Array, pointer(_node_coordinates), - (n_dims, ntuple(_ -> n_nodes, n_dims)..., - capacity)) + elements.node_coordinates = unsafe_wrap_or_alloc(ArrayType, + _node_coordinates, + (n_dims, + ntuple(_ -> n_nodes, n_dims)..., + capacity)) resize!(_jacobian_matrix, n_dims^2 * n_nodes^n_dims * capacity) - elements.jacobian_matrix = unsafe_wrap(Array, pointer(_jacobian_matrix), - (n_dims, n_dims, - ntuple(_ -> n_nodes, n_dims)..., capacity)) + elements.jacobian_matrix = unsafe_wrap_or_alloc(ArrayType, + _jacobian_matrix, + (n_dims, n_dims, + ntuple(_ -> n_nodes, n_dims)..., + capacity)) resize!(_contravariant_vectors, length(_jacobian_matrix)) - elements.contravariant_vectors = unsafe_wrap(Array, pointer(_contravariant_vectors), - size(elements.jacobian_matrix)) + elements.contravariant_vectors = unsafe_wrap_or_alloc(ArrayType, + _contravariant_vectors, + size(elements.jacobian_matrix)) resize!(_inverse_jacobian, n_nodes^n_dims * capacity) - elements.inverse_jacobian = unsafe_wrap(Array, pointer(_inverse_jacobian), - (ntuple(_ -> n_nodes, n_dims)..., capacity)) + elements.inverse_jacobian = unsafe_wrap_or_alloc(ArrayType, + _inverse_jacobian, + (ntuple(_ -> n_nodes, n_dims)..., + capacity)) resize!(_surface_flux_values, n_variables * n_nodes^(n_dims - 1) * (n_dims * 2) * capacity) - elements.surface_flux_values = unsafe_wrap(Array, pointer(_surface_flux_values), - (n_variables, - ntuple(_ -> n_nodes, n_dims - 1)..., - n_dims * 2, capacity)) + elements.surface_flux_values = unsafe_wrap_or_alloc(ArrayType, + _surface_flux_values, + (n_variables, + ntuple(_ -> n_nodes, + n_dims - 1)..., + n_dims * 2, capacity)) return nothing end @@ -117,33 +133,104 @@ function init_elements(mesh::Union{P4estMesh{NDIMS, NDIMS, RealT}, NDIMS * 2, nelements)) elements = P4estElementContainer{NDIMS, RealT, uEltype, NDIMS + 1, NDIMS + 2, - NDIMS + 3}(node_coordinates, jacobian_matrix, - contravariant_vectors, - inverse_jacobian, surface_flux_values, - _node_coordinates, _jacobian_matrix, - _contravariant_vectors, - _inverse_jacobian, _surface_flux_values) + NDIMS + 3, Array{RealT, NDIMS + 1}, + Array{RealT, NDIMS + 2}, Array{RealT, NDIMS + 3}, + Vector{RealT}, Vector{uEltype}}(node_coordinates, + jacobian_matrix, + contravariant_vectors, + inverse_jacobian, + surface_flux_values, + _node_coordinates, + _jacobian_matrix, + _contravariant_vectors, + _inverse_jacobian, + _surface_flux_values) init_elements!(elements, mesh, basis) return elements end -mutable struct P4estInterfaceContainer{NDIMS, uEltype <: Real, NDIMSP2} <: +function Adapt.parent_type(::Type{<:P4estElementContainer{<:Any, <:Any, <:Any, <:Any, + <:Any, <:Any, ArrayT}}) where {ArrayT} + ArrayT +end + +# Manual adapt_structure since we have aliasing memory +function Adapt.adapt_structure(to, + elements::P4estElementContainer{NDIMS}) where {NDIMS} + # Adapt underlying storage + _node_coordinates = adapt(to, elements._node_coordinates) + _jacobian_matrix = adapt(to, elements._jacobian_matrix) + _contravariant_vectors = adapt(to, elements._contravariant_vectors) + _inverse_jacobian = adapt(to, elements._inverse_jacobian) + _surface_flux_values = adapt(to, elements._surface_flux_values) + + RealT = 
eltype(_inverse_jacobian) + uEltype = eltype(_surface_flux_values) + + # Wrap arrays again + node_coordinates = unsafe_wrap_or_alloc(to, _node_coordinates, + size(elements.node_coordinates)) + jacobian_matrix = unsafe_wrap_or_alloc(to, _jacobian_matrix, + size(elements.jacobian_matrix)) + contravariant_vectors = unsafe_wrap_or_alloc(to, _contravariant_vectors, + size(jacobian_matrix)) + inverse_jacobian = unsafe_wrap_or_alloc(to, _inverse_jacobian, + size(elements.inverse_jacobian)) + surface_flux_values = unsafe_wrap_or_alloc(to, _surface_flux_values, + size(elements.surface_flux_values)) + + new_type_params = (NDIMS, + RealT, + uEltype, + NDIMS + 1, + NDIMS + 2, + NDIMS + 3, + typeof(inverse_jacobian), # ArrayNDIMSP1 + typeof(node_coordinates), # ArrayNDIMSP2 + typeof(jacobian_matrix), # ArrayNDIMSP3 + typeof(_node_coordinates), # VectorRealT + typeof(_surface_flux_values)) # VectoruEltype + return P4estElementContainer{new_type_params...}(node_coordinates, + jacobian_matrix, + contravariant_vectors, + inverse_jacobian, + surface_flux_values, + _node_coordinates, + _jacobian_matrix, + _contravariant_vectors, + _inverse_jacobian, + _surface_flux_values) +end + +mutable struct P4estInterfaceContainer{NDIMS, uEltype <: Real, NDIMSP2, + uArray <: DenseArray{uEltype, NDIMSP2}, + IdsMatrix <: DenseMatrix{Int}, + IndicesMatrix <: + DenseMatrix{NTuple{NDIMS, Symbol}}, + uVector <: DenseVector{uEltype}, + IdsVector <: DenseVector{Int}, + IndicesVector <: + DenseVector{NTuple{NDIMS, Symbol}}} <: AbstractContainer - u::Array{uEltype, NDIMSP2} # [primary/secondary, variable, i, j, interface] - neighbor_ids::Matrix{Int} # [primary/secondary, interface] - node_indices::Matrix{NTuple{NDIMS, Symbol}} # [primary/secondary, interface] + u::uArray # [primary/secondary, variable, i, j, interface] + neighbor_ids::IdsMatrix # [primary/secondary, interface] + node_indices::IndicesMatrix # [primary/secondary, interface] # internal `resize!`able storage - _u::Vector{uEltype} - _neighbor_ids::Vector{Int} - _node_indices::Vector{NTuple{NDIMS, Symbol}} + _u::uVector + _neighbor_ids::IdsVector + _node_indices::IndicesVector end @inline function ninterfaces(interfaces::P4estInterfaceContainer) size(interfaces.neighbor_ids, 2) end @inline Base.ndims(::P4estInterfaceContainer{NDIMS}) where {NDIMS} = NDIMS +@inline function Base.eltype(::P4estInterfaceContainer{NDIMS, uEltype}) where {NDIMS, + uEltype} + return uEltype +end # See explanation of Base.resize! 
for the element container function Base.resize!(interfaces::P4estInterfaceContainer, capacity) @@ -152,17 +239,20 @@ function Base.resize!(interfaces::P4estInterfaceContainer, capacity) n_dims = ndims(interfaces) n_nodes = size(interfaces.u, 3) n_variables = size(interfaces.u, 2) + ArrayType = storage_type(interfaces) resize!(_u, 2 * n_variables * n_nodes^(n_dims - 1) * capacity) - interfaces.u = unsafe_wrap(Array, pointer(_u), + interfaces.u = unsafe_wrap(ArrayType, pointer(_u), (2, n_variables, ntuple(_ -> n_nodes, n_dims - 1)..., capacity)) resize!(_neighbor_ids, 2 * capacity) - interfaces.neighbor_ids = unsafe_wrap(Array, pointer(_neighbor_ids), (2, capacity)) + interfaces.neighbor_ids = unsafe_wrap(ArrayType, pointer(_neighbor_ids), + (2, capacity)) resize!(_node_indices, 2 * capacity) - interfaces.node_indices = unsafe_wrap(Array, pointer(_node_indices), (2, capacity)) + interfaces.node_indices = unsafe_wrap(ArrayType, pointer(_node_indices), + (2, capacity)) return nothing end @@ -189,10 +279,15 @@ function init_interfaces(mesh::Union{P4estMesh, P4estMeshView, T8codeMesh}, equa _node_indices = Vector{NTuple{NDIMS, Symbol}}(undef, 2 * n_interfaces) node_indices = unsafe_wrap(Array, pointer(_node_indices), (2, n_interfaces)) - interfaces = P4estInterfaceContainer{NDIMS, uEltype, NDIMS + 2}(u, neighbor_ids, - node_indices, - _u, _neighbor_ids, - _node_indices) + interfaces = P4estInterfaceContainer{NDIMS, uEltype, NDIMS + 2, + typeof(u), typeof(neighbor_ids), + typeof(node_indices), typeof(_u), + typeof(_neighbor_ids), typeof(_node_indices)}(u, + neighbor_ids, + node_indices, + _u, + _neighbor_ids, + _node_indices) init_interfaces!(interfaces, mesh) @@ -205,21 +300,58 @@ function init_interfaces!(interfaces, mesh::Union{P4estMesh, P4estMeshView}) return interfaces end -mutable struct P4estBoundaryContainer{NDIMS, uEltype <: Real, NDIMSP1} <: +function Adapt.parent_type(::Type{<:P4estInterfaceContainer{<:Any, <:Any, <:Any, + ArrayT}}) where {ArrayT} + ArrayT +end + +# Manual adapt_structure since we have aliasing memory +function Adapt.adapt_structure(to, interfaces::P4estInterfaceContainer) + # Adapt underlying storage + _u = adapt(to, interfaces._u) + _neighbor_ids = adapt(to, interfaces._neighbor_ids) + _node_indices = adapt(to, interfaces._node_indices) + # Wrap arrays again + u = unsafe_wrap_or_alloc(to, _u, size(interfaces.u)) + neighbor_ids = unsafe_wrap_or_alloc(to, _neighbor_ids, + size(interfaces.neighbor_ids)) + node_indices = unsafe_wrap_or_alloc(to, _node_indices, + size(interfaces.node_indices)) + + NDIMS = ndims(interfaces) + new_type_params = (NDIMS, + eltype(_u), + NDIMS + 2, + typeof(u), typeof(neighbor_ids), typeof(node_indices), + typeof(_u), typeof(_neighbor_ids), typeof(_node_indices)) + return P4estInterfaceContainer{new_type_params...}(u, neighbor_ids, node_indices, + _u, _neighbor_ids, _node_indices) +end + +mutable struct P4estBoundaryContainer{NDIMS, uEltype <: Real, NDIMSP1, + uArray <: DenseArray{uEltype, NDIMSP1}, + IdsVector <: DenseVector{Int}, + IndicesVector <: + DenseVector{NTuple{NDIMS, Symbol}}, + uVector <: DenseVector{uEltype}} <: AbstractContainer - u::Array{uEltype, NDIMSP1} # [variables, i, j, boundary] - neighbor_ids::Vector{Int} # [boundary] - node_indices::Vector{NTuple{NDIMS, Symbol}} # [boundary] - name::Vector{Symbol} # [boundary] + u::uArray # [variables, i, j, boundary] + neighbor_ids::IdsVector # [boundary] + node_indices::IndicesVector # [boundary] + name::Vector{Symbol} # [boundary] # internal `resize!`able storage - 
_u::Vector{uEltype} + _u::uVector end @inline function nboundaries(boundaries::P4estBoundaryContainer) length(boundaries.neighbor_ids) end @inline Base.ndims(::P4estBoundaryContainer{NDIMS}) where {NDIMS} = NDIMS +@inline function Base.eltype(::P4estBoundaryContainer{NDIMS, uEltype}) where {NDIMS, + uEltype} + uEltype +end # See explanation of Base.resize! for the element container function Base.resize!(boundaries::P4estBoundaryContainer, capacity) @@ -228,9 +360,10 @@ function Base.resize!(boundaries::P4estBoundaryContainer, capacity) n_dims = ndims(boundaries) n_nodes = size(boundaries.u, 2) n_variables = size(boundaries.u, 1) + ArrayType = storage_type(boundaries) resize!(_u, n_variables * n_nodes^(n_dims - 1) * capacity) - boundaries.u = unsafe_wrap(Array, pointer(_u), + boundaries.u = unsafe_wrap(ArrayType, pointer(_u), (n_variables, ntuple(_ -> n_nodes, n_dims - 1)..., capacity)) @@ -263,9 +396,11 @@ function init_boundaries(mesh::Union{P4estMesh, P4estMeshView, T8codeMesh}, equa node_indices = Vector{NTuple{NDIMS, Symbol}}(undef, n_boundaries) names = Vector{Symbol}(undef, n_boundaries) - boundaries = P4estBoundaryContainer{NDIMS, uEltype, NDIMS + 1}(u, neighbor_ids, - node_indices, names, - _u) + boundaries = P4estBoundaryContainer{NDIMS, uEltype, NDIMS + 1, typeof(u), + typeof(neighbor_ids), typeof(node_indices), + typeof(_u)}(u, neighbor_ids, + node_indices, names, + _u) if n_boundaries > 0 init_boundaries!(boundaries, mesh) @@ -312,6 +447,25 @@ function init_boundaries_iter_face_inner(info_pw, boundaries, boundary_id, mesh) return nothing end +function Adapt.parent_type(::Type{<:P4estBoundaryContainer{<:Any, <:Any, <:Any, ArrayT}}) where {ArrayT} + ArrayT +end + +# Manual adapt_structure since we have aliasing memory +function Adapt.adapt_structure(to, boundaries::P4estBoundaryContainer) + _u = adapt(to, boundaries._u) + u = unsafe_wrap_or_alloc(to, _u, size(boundaries.u)) + neighbor_ids = adapt(to, boundaries.neighbor_ids) + node_indices = adapt(to, boundaries.node_indices) + name = boundaries.name + + NDIMS = ndims(boundaries) + return P4estBoundaryContainer{NDIMS, eltype(_u), NDIMS + 1, typeof(u), + typeof(neighbor_ids), typeof(node_indices), + typeof(_u)}(u, neighbor_ids, node_indices, + name, _u) +end + # Container data structure (structure-of-arrays style) for DG L2 mortars # # The positions used in `neighbor_ids` are 1:3 (in 2D) or 1:5 (in 3D), where 1:2 (in 2D) @@ -337,20 +491,32 @@ end # │ └─────────────┴─────────────┘ └───────────────────────────┘ # │ # ⋅────> ξ -mutable struct P4estMortarContainer{NDIMS, uEltype <: Real, NDIMSP1, NDIMSP3} <: +mutable struct P4estMortarContainer{NDIMS, uEltype <: Real, NDIMSP1, NDIMSP3, + uArray <: DenseArray{uEltype, NDIMSP3}, + IdsMatrix <: DenseMatrix{Int}, + IndicesMatrix <: + DenseMatrix{NTuple{NDIMS, Symbol}}, + uVector <: DenseVector{uEltype}, + IdsVector <: DenseVector{Int}, + IndicesVector <: + DenseVector{NTuple{NDIMS, Symbol}}} <: AbstractContainer - u::Array{uEltype, NDIMSP3} # [small/large side, variable, position, i, j, mortar] - neighbor_ids::Matrix{Int} # [position, mortar] - node_indices::Matrix{NTuple{NDIMS, Symbol}} # [small/large, mortar] + u::uArray # [small/large side, variable, position, i, j, mortar] + neighbor_ids::IdsMatrix # [position, mortar] + node_indices::IndicesMatrix # [small/large, mortar] # internal `resize!`able storage - _u::Vector{uEltype} - _neighbor_ids::Vector{Int} - _node_indices::Vector{NTuple{NDIMS, Symbol}} + _u::uVector + _neighbor_ids::IdsVector + _node_indices::IndicesVector end @inline 
nmortars(mortars::P4estMortarContainer) = size(mortars.neighbor_ids, 2) @inline Base.ndims(::P4estMortarContainer{NDIMS}) where {NDIMS} = NDIMS +@inline function Base.eltype(::P4estMortarContainer{NDIMS, uEltype}) where {NDIMS, + uEltype} + uEltype +end # See explanation of Base.resize! for the element container function Base.resize!(mortars::P4estMortarContainer, capacity) @@ -359,18 +525,19 @@ function Base.resize!(mortars::P4estMortarContainer, capacity) n_dims = ndims(mortars) n_nodes = size(mortars.u, 4) n_variables = size(mortars.u, 2) + ArrayType = storage_type(mortars) resize!(_u, 2 * n_variables * 2^(n_dims - 1) * n_nodes^(n_dims - 1) * capacity) - mortars.u = unsafe_wrap(Array, pointer(_u), + mortars.u = unsafe_wrap(ArrayType, pointer(_u), (2, n_variables, 2^(n_dims - 1), ntuple(_ -> n_nodes, n_dims - 1)..., capacity)) resize!(_neighbor_ids, (2^(n_dims - 1) + 1) * capacity) - mortars.neighbor_ids = unsafe_wrap(Array, pointer(_neighbor_ids), + mortars.neighbor_ids = unsafe_wrap(ArrayType, pointer(_neighbor_ids), (2^(n_dims - 1) + 1, capacity)) resize!(_node_indices, 2 * capacity) - mortars.node_indices = unsafe_wrap(Array, pointer(_node_indices), (2, capacity)) + mortars.node_indices = unsafe_wrap(ArrayType, pointer(_node_indices), (2, capacity)) return nothing end @@ -398,12 +565,15 @@ function init_mortars(mesh::Union{P4estMesh, P4estMeshView, T8codeMesh}, equatio _node_indices = Vector{NTuple{NDIMS, Symbol}}(undef, 2 * n_mortars) node_indices = unsafe_wrap(Array, pointer(_node_indices), (2, n_mortars)) - mortars = P4estMortarContainer{NDIMS, uEltype, NDIMS + 1, NDIMS + 3}(u, - neighbor_ids, - node_indices, - _u, - _neighbor_ids, - _node_indices) + mortars = P4estMortarContainer{NDIMS, uEltype, NDIMS + 1, NDIMS + 3, typeof(u), + typeof(neighbor_ids), typeof(node_indices), + typeof(_u), typeof(_neighbor_ids), + typeof(_node_indices)}(u, + neighbor_ids, + node_indices, + _u, + _neighbor_ids, + _node_indices) if n_mortars > 0 init_mortars!(mortars, mesh) @@ -418,6 +588,34 @@ function init_mortars!(mortars, mesh::Union{P4estMesh, P4estMeshView}) return mortars end +function Adapt.parent_type(::Type{<:P4estMortarContainer{<:Any, <:Any, <:Any, <:Any, + ArrayT}}) where {ArrayT} + ArrayT +end + +# Manual adapt_structure since we have aliasing memory +function Adapt.adapt_structure(to, mortars::P4estMortarContainer) + # Adapt underlying storage + _u = adapt(to, mortars._u) + _neighbor_ids = adapt(to, mortars._neighbor_ids) + _node_indices = adapt(to, mortars._node_indices) + + # Wrap arrays again + u = unsafe_wrap_or_alloc(to, _u, size(mortars.u)) + neighbor_ids = unsafe_wrap_or_alloc(to, _neighbor_ids, size(mortars.neighbor_ids)) + node_indices = unsafe_wrap_or_alloc(to, _node_indices, size(mortars.node_indices)) + + NDIMS = ndims(mortars) + new_type_params = (NDIMS, + eltype(_u), + NDIMS + 1, + NDIMS + 3, + typeof(u), typeof(neighbor_ids), typeof(node_indices), + typeof(_u), typeof(_neighbor_ids), typeof(_node_indices)) + return P4estMortarContainer{new_type_params...}(u, neighbor_ids, node_indices, + _u, _neighbor_ids, _node_indices) +end + function reinitialize_containers!(mesh::P4estMesh, equations, dg::DGSEM, cache) # Re-initialize elements container @unpack elements = cache diff --git a/src/solvers/dgsem_p4est/containers_parallel.jl b/src/solvers/dgsem_p4est/containers_parallel.jl index 676b37efff3..4b6fc813703 100644 --- a/src/solvers/dgsem_p4est/containers_parallel.jl +++ b/src/solvers/dgsem_p4est/containers_parallel.jl @@ -5,15 +5,19 @@ @muladd begin #! 
format: noindent -mutable struct P4estMPIInterfaceContainer{NDIMS, uEltype <: Real, NDIMSP2} <: +mutable struct P4estMPIInterfaceContainer{NDIMS, uEltype <: Real, NDIMSP2, + uArray <: DenseArray{uEltype, NDIMSP2}, + VecInt <: DenseVector{Int}, + IndicesVector <: + DenseVector{NTuple{NDIMS, Symbol}}, + uVector <: DenseVector{uEltype}} <: AbstractContainer - u::Array{uEltype, NDIMSP2} # [primary/secondary, variable, i, j, interface] - local_neighbor_ids::Vector{Int} # [interface] - node_indices::Vector{NTuple{NDIMS, Symbol}} # [interface] - local_sides::Vector{Int} # [interface] - + u::uArray # [primary/secondary, variable, i, j, interface] + local_neighbor_ids::VecInt # [interface] + node_indices::IndicesVector # [interface] + local_sides::VecInt # [interface] # internal `resize!`able storage - _u::Vector{uEltype} + _u::uVector end @inline function nmpiinterfaces(interfaces::P4estMPIInterfaceContainer) @@ -27,9 +31,10 @@ function Base.resize!(mpi_interfaces::P4estMPIInterfaceContainer, capacity) n_dims = ndims(mpi_interfaces) n_nodes = size(mpi_interfaces.u, 3) n_variables = size(mpi_interfaces.u, 2) + ArrayType = storage_type(mpi_interfaces) resize!(_u, 2 * n_variables * n_nodes^(n_dims - 1) * capacity) - mpi_interfaces.u = unsafe_wrap(Array, pointer(_u), + mpi_interfaces.u = unsafe_wrap(ArrayType, pointer(_u), (2, n_variables, ntuple(_ -> n_nodes, n_dims - 1)..., capacity)) @@ -64,11 +69,13 @@ function init_mpi_interfaces(mesh::Union{ParallelP4estMesh, ParallelT8codeMesh}, local_sides = Vector{Int}(undef, n_mpi_interfaces) - mpi_interfaces = P4estMPIInterfaceContainer{NDIMS, uEltype, NDIMS + 2}(u, - local_neighbor_ids, - node_indices, - local_sides, - _u) + mpi_interfaces = P4estMPIInterfaceContainer{NDIMS, uEltype, NDIMS + 2, + typeof(u), typeof(local_neighbor_ids), + typeof(node_indices), typeof(_u)}(u, + local_neighbor_ids, + node_indices, + local_sides, + _u) init_mpi_interfaces!(mpi_interfaces, mesh) @@ -81,6 +88,32 @@ function init_mpi_interfaces!(mpi_interfaces, mesh::ParallelP4estMesh) return mpi_interfaces end +function Adapt.parent_type(::Type{<:Trixi.P4estMPIInterfaceContainer{<:Any, <:Any, + <:Any, A}}) where {A} + return A +end + +# Manual adapt_structure since we have aliasing memory +function Adapt.adapt_structure(to, mpi_interfaces::P4estMPIInterfaceContainer) + # Adapt Vectors and underlying storage + _u = adapt(to, mpi_interfaces._u) + local_neighbor_ids = adapt(to, mpi_interfaces.local_neighbor_ids) + node_indices = adapt(to, mpi_interfaces.node_indices) + local_sides = adapt(to, mpi_interfaces.local_sides) + + # Wrap array again + u = unsafe_wrap_or_alloc(to, _u, size(mpi_interfaces.u)) + + NDIMS = ndims(mpi_interfaces) + return P4estMPIInterfaceContainer{NDIMS, eltype(u), + NDIMS + 2, + typeof(u), typeof(local_neighbor_ids), + typeof(node_indices), typeof(_u)}(u, + local_neighbor_ids, + node_indices, + local_sides, _u) +end + # Container data structure (structure-of-arrays style) for DG L2 mortars # # Similar to `P4estMortarContainer`. The field `neighbor_ids` has been split up into @@ -88,14 +121,17 @@ end # available elements belonging to a particular MPI mortar. Furthermore, `normal_directions` holds # the normal vectors on the surface of the small elements for each mortar. 
mutable struct P4estMPIMortarContainer{NDIMS, uEltype <: Real, RealT <: Real, NDIMSP1, - NDIMSP2, NDIMSP3} <: AbstractContainer - u::Array{uEltype, NDIMSP3} # [small/large side, variable, position, i, j, mortar] - local_neighbor_ids::Vector{Vector{Int}} # [mortar][ids] - local_neighbor_positions::Vector{Vector{Int}} # [mortar][positions] - node_indices::Matrix{NTuple{NDIMS, Symbol}} # [small/large, mortar] - normal_directions::Array{RealT, NDIMSP2} # [dimension, i, j, position, mortar] + NDIMSP2, NDIMSP3, + uArray <: DenseArray{uEltype, NDIMSP3}, + uVector <: DenseVector{uEltype}} <: + AbstractContainer + u::uArray # [small/large side, variable, position, i, j, mortar] + local_neighbor_ids::Vector{Vector{Int}} # [mortar][ids] + local_neighbor_positions::Vector{Vector{Int}} # [mortar][positions] + node_indices::Matrix{NTuple{NDIMS, Symbol}} # [small/large, mortar] + normal_directions::Array{RealT, NDIMSP2} # [dimension, i, j, position, mortar] # internal `resize!`able storage - _u::Vector{uEltype} + _u::uVector _node_indices::Vector{NTuple{NDIMS, Symbol}} _normal_directions::Vector{RealT} end @@ -164,11 +200,12 @@ function init_mpi_mortars(mesh::Union{ParallelP4estMesh, ParallelT8codeMesh}, eq 2^(NDIMS - 1), n_mpi_mortars)) mpi_mortars = P4estMPIMortarContainer{NDIMS, uEltype, RealT, NDIMS + 1, NDIMS + 2, - NDIMS + 3}(u, local_neighbor_ids, - local_neighbor_positions, - node_indices, normal_directions, - _u, _node_indices, - _normal_directions) + NDIMS + 3, typeof(u), + typeof(_u)}(u, local_neighbor_ids, + local_neighbor_positions, + node_indices, normal_directions, + _u, _node_indices, + _normal_directions) if n_mpi_mortars > 0 init_mpi_mortars!(mpi_mortars, mesh, basis, elements) @@ -184,6 +221,35 @@ function init_mpi_mortars!(mpi_mortars, mesh::ParallelP4estMesh, basis, elements return mpi_mortars end +function Adapt.adapt_structure(to, mpi_mortars::P4estMPIMortarContainer) + # TODO: GPU + # Only parts of this container are adapted, since we currently don't + # use `local_neighbor_ids`, `local_neighbor_positions`, `normal_directions` + # on the GPU. If we do need them we need to redesign this to use the VecOfArrays + # approach. + + _u = adapt(to, mpi_mortars._u) + _node_indices = mpi_mortars._node_indices + _normal_directions = mpi_mortars._normal_directions + + u = unsafe_wrap_or_alloc(to, _u, size(mpi_mortars.u)) + local_neighbor_ids = mpi_mortars.local_neighbor_ids + local_neighbor_positions = mpi_mortars.local_neighbor_positions + node_indices = mpi_mortars.node_indices + normal_directions = mpi_mortars.normal_directions + + NDIMS = ndims(mpi_mortars) + return P4estMPIMortarContainer{NDIMS, eltype(_u), + eltype(_normal_directions), + NDIMS + 1, NDIMS + 2, NDIMS + 3, + typeof(u), typeof(_u)}(u, local_neighbor_ids, + local_neighbor_positions, + node_indices, + normal_directions, _u, + _node_indices, + _normal_directions) +end + # Overload init! function for regular interfaces, regular mortars and boundaries since they must # call the appropriate init_surfaces! 
function for parallel p4est meshes function init_interfaces!(interfaces, mesh::ParallelP4estMesh) diff --git a/src/solvers/dgsem_p4est/dg_3d.jl b/src/solvers/dgsem_p4est/dg_3d.jl index e59f502c86c..4c099c9fd3f 100644 --- a/src/solvers/dgsem_p4est/dg_3d.jl +++ b/src/solvers/dgsem_p4est/dg_3d.jl @@ -13,18 +13,18 @@ function create_cache(mesh::Union{P4estMesh{3}, T8codeMesh{3}}, equations, fstar_primary_threaded = [Array{uEltype, 4}(undef, nvariables(equations), nnodes(mortar_l2), nnodes(mortar_l2), 4) - for _ in 1:Threads.nthreads()] + for _ in 1:Threads.nthreads()] |> VecOfArrays fstar_secondary_threaded = [Array{uEltype, 4}(undef, nvariables(equations), nnodes(mortar_l2), nnodes(mortar_l2), 4) - for _ in 1:Threads.nthreads()] + for _ in 1:Threads.nthreads()] |> VecOfArrays fstar_tmp_threaded = [Array{uEltype, 3}(undef, nvariables(equations), nnodes(mortar_l2), nnodes(mortar_l2)) - for _ in 1:Threads.nthreads()] + for _ in 1:Threads.nthreads()] |> VecOfArrays u_threaded = [Array{uEltype, 3}(undef, nvariables(equations), nnodes(mortar_l2), nnodes(mortar_l2)) - for _ in 1:Threads.nthreads()] + for _ in 1:Threads.nthreads()] |> VecOfArrays (; fstar_primary_threaded, fstar_secondary_threaded, fstar_tmp_threaded, u_threaded) end diff --git a/src/solvers/dgsem_p4est/dg_parallel.jl b/src/solvers/dgsem_p4est/dg_parallel.jl index 2cc201dd1f0..7acddf07b4b 100644 --- a/src/solvers/dgsem_p4est/dg_parallel.jl +++ b/src/solvers/dgsem_p4est/dg_parallel.jl @@ -5,12 +5,13 @@ @muladd begin #! format: noindent -mutable struct P4estMPICache{uEltype} +mutable struct P4estMPICache{BufferType <: DenseVector, + VecInt <: DenseVector{<:Integer}} mpi_neighbor_ranks::Vector{Int} - mpi_neighbor_interfaces::Vector{Vector{Int}} - mpi_neighbor_mortars::Vector{Vector{Int}} - mpi_send_buffers::Vector{Vector{uEltype}} - mpi_recv_buffers::Vector{Vector{uEltype}} + mpi_neighbor_interfaces::VecOfArrays{VecInt} + mpi_neighbor_mortars::VecOfArrays{VecInt} + mpi_send_buffers::VecOfArrays{BufferType} + mpi_recv_buffers::VecOfArrays{BufferType} mpi_send_requests::Vector{MPI.Request} mpi_recv_requests::Vector{MPI.Request} n_elements_by_rank::OffsetArray{Int, 1, Array{Int, 1}} @@ -25,25 +26,29 @@ function P4estMPICache(uEltype) end mpi_neighbor_ranks = Vector{Int}(undef, 0) - mpi_neighbor_interfaces = Vector{Vector{Int}}(undef, 0) - mpi_neighbor_mortars = Vector{Vector{Int}}(undef, 0) - mpi_send_buffers = Vector{Vector{uEltype}}(undef, 0) - mpi_recv_buffers = Vector{Vector{uEltype}}(undef, 0) + mpi_neighbor_interfaces = Vector{Vector{Int}}(undef, 0) |> VecOfArrays + mpi_neighbor_mortars = Vector{Vector{Int}}(undef, 0) |> VecOfArrays + mpi_send_buffers = Vector{Vector{uEltype}}(undef, 0) |> VecOfArrays + mpi_recv_buffers = Vector{Vector{uEltype}}(undef, 0) |> VecOfArrays mpi_send_requests = Vector{MPI.Request}(undef, 0) mpi_recv_requests = Vector{MPI.Request}(undef, 0) n_elements_by_rank = OffsetArray(Vector{Int}(undef, 0), 0:-1) n_elements_global = 0 first_element_global_id = 0 - P4estMPICache{uEltype}(mpi_neighbor_ranks, mpi_neighbor_interfaces, - mpi_neighbor_mortars, - mpi_send_buffers, mpi_recv_buffers, - mpi_send_requests, mpi_recv_requests, - n_elements_by_rank, n_elements_global, - first_element_global_id) + P4estMPICache{Vector{uEltype}, Vector{Int}}(mpi_neighbor_ranks, + mpi_neighbor_interfaces, + mpi_neighbor_mortars, + mpi_send_buffers, mpi_recv_buffers, + mpi_send_requests, mpi_recv_requests, + n_elements_by_rank, n_elements_global, + first_element_global_id) end -@inline Base.eltype(::P4estMPICache{uEltype}) where 
{uEltype} = uEltype +@inline Base.eltype(::P4estMPICache{BufferType}) where {BufferType} = eltype(BufferType) + +# @eval due to @muladd +@eval Adapt.@adapt_structure(P4estMPICache) ## # Note that the code in `start_mpi_send`/`finish_mpi_receive!` is sensitive to inference on (at least) Julia 1.10. @@ -265,16 +270,16 @@ end function init_mpi_cache!(mpi_cache::P4estMPICache, mesh::ParallelP4estMesh, mpi_interfaces, mpi_mortars, nvars, n_nodes, uEltype) - mpi_neighbor_ranks, mpi_neighbor_interfaces, mpi_neighbor_mortars = init_mpi_neighbor_connectivity(mpi_interfaces, - mpi_mortars, - mesh) + mpi_neighbor_ranks, _mpi_neighbor_interfaces, _mpi_neighbor_mortars = init_mpi_neighbor_connectivity(mpi_interfaces, + mpi_mortars, + mesh) - mpi_send_buffers, mpi_recv_buffers, mpi_send_requests, mpi_recv_requests = init_mpi_data_structures(mpi_neighbor_interfaces, - mpi_neighbor_mortars, - ndims(mesh), - nvars, - n_nodes, - uEltype) + _mpi_send_buffers, _mpi_recv_buffers, mpi_send_requests, mpi_recv_requests = init_mpi_data_structures(_mpi_neighbor_interfaces, + _mpi_neighbor_mortars, + ndims(mesh), + nvars, + n_nodes, + uEltype) # Determine local and total number of elements n_elements_global = Int(mesh.p4est.global_num_quadrants[]) @@ -286,6 +291,11 @@ function init_mpi_cache!(mpi_cache::P4estMPICache, mesh::ParallelP4estMesh, first_element_global_id = Int(mesh.p4est.global_first_quadrant[mpi_rank() + 1]) + 1 @assert n_elements_global==sum(n_elements_by_rank) "error in total number of elements" + mpi_neighbor_interfaces = VecOfArrays(_mpi_neighbor_interfaces) + mpi_neighbor_mortars = VecOfArrays(_mpi_neighbor_mortars) + mpi_send_buffers = VecOfArrays(_mpi_send_buffers) + mpi_recv_buffers = VecOfArrays(_mpi_recv_buffers) + # TODO reuse existing structures @pack! mpi_cache = mpi_neighbor_ranks, mpi_neighbor_interfaces, mpi_neighbor_mortars, diff --git a/src/solvers/dgsem_unstructured/sort_boundary_conditions.jl b/src/solvers/dgsem_unstructured/sort_boundary_conditions.jl index 0cb3bd7f409..d6cf6e1ce6d 100644 --- a/src/solvers/dgsem_unstructured/sort_boundary_conditions.jl +++ b/src/solvers/dgsem_unstructured/sort_boundary_conditions.jl @@ -13,9 +13,10 @@ It stores a set of global indices for each boundary condition type and name to e during the call to `calc_boundary_flux!`. The original dictionary form of the boundary conditions set by the user in the elixir file is also stored for printing. """ -mutable struct UnstructuredSortedBoundaryTypes{N, BCs <: NTuple{N, Any}} +mutable struct UnstructuredSortedBoundaryTypes{N, BCs <: NTuple{N, Any}, + Vec <: AbstractVector{<:Integer}} boundary_condition_types::BCs # specific boundary condition type(s), e.g. 
BoundaryConditionDirichlet
-    boundary_indices::NTuple{N, Vector{Int}} # integer vectors containing global boundary indices
+    boundary_indices::NTuple{N, Vec} # integer vectors containing global boundary indices
     boundary_dictionary::Dict{Symbol, Any} # boundary conditions as set by the user in the elixir file
     boundary_symbol_indices::Dict{Symbol, Vector{Int}} # integer vectors containing global boundary indices per boundary identifier
 end
@@ -33,10 +34,11 @@ function UnstructuredSortedBoundaryTypes(boundary_conditions::Dict, cache)
     boundary_symbol_indices = Dict{Symbol, Vector{Int}}()
 
     container = UnstructuredSortedBoundaryTypes{n_boundary_types,
-                                                typeof(boundary_condition_types)}(boundary_condition_types,
-                                                                                  boundary_indices,
-                                                                                  boundary_conditions,
-                                                                                  boundary_symbol_indices)
+                                                typeof(boundary_condition_types),
+                                                Vector{Int}}(boundary_condition_types,
+                                                             boundary_indices,
+                                                             boundary_conditions,
+                                                             boundary_symbol_indices)
 
     initialize!(container, cache)
 end
@@ -119,4 +121,7 @@ function initialize!(boundary_types_container::UnstructuredSortedBoundaryTypes{N
 
     return boundary_types_container
 end
+
+# @eval due to @muladd
+@eval Adapt.@adapt_structure(UnstructuredSortedBoundaryTypes)
 end # @muladd
diff --git a/test/Project.toml b/test/Project.toml
index 3559f8cb6e2..d2d835746e1 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,8 +1,11 @@
 [deps]
 ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
+Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
 Convex = "f65535da-76fb-5f13-bab9-19810c17039a"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
 DoubleFloats = "497a8b3b-efae-58df-a0af-a86822472b78"
 Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
@@ -29,10 +32,13 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 TrixiTest = "0a316866-cbd0-4425-8bcb-08103b2c1f26"
 
 [compat]
+Adapt = "4"
 ADTypes = "1.11"
+AMDGPU = "1.3.5"
 Aqua = "0.8"
 CairoMakie = "0.12, 0.13, 0.14, 0.15"
 Convex = "0.16"
+CUDA = "5.8"
 DelimitedFiles = "1"
 DoubleFloats = "1.4.0"
 Downloads = "1"
diff --git a/test/runtests.jl b/test/runtests.jl
index db2c2e9dd88..d32eac318bb 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -109,4 +109,22 @@ const TRIXI_NTHREADS = clamp(Sys.CPU_THREADS, 2, 3)
     @time if TRIXI_TEST == "all" || TRIXI_TEST == "paper_self_gravitating_gas_dynamics"
         include("test_paper_self_gravitating_gas_dynamics.jl")
     end
+
+    @time if TRIXI_TEST == "all" || TRIXI_TEST == "CUDA"
+        import CUDA
+        if CUDA.functional()
+            include("test_cuda.jl")
+        else
+            @warn "Unable to run CUDA tests on this machine"
+        end
+    end
+
+    @time if TRIXI_TEST == "all" || TRIXI_TEST == "AMDGPU"
+        import AMDGPU
+        if AMDGPU.functional()
+            include("test_amdgpu.jl")
+        else
+            @warn "Unable to run AMDGPU tests on this machine"
+        end
+    end
 end
diff --git a/test/test_amdgpu.jl b/test/test_amdgpu.jl
new file mode 100644
index 00000000000..6bbc9e41bec
--- /dev/null
+++ b/test/test_amdgpu.jl
@@ -0,0 +1,79 @@
+module TestAMDGPU
+
+using Test
+using Trixi
+
+include("test_trixi.jl")
+
+# Start with a clean environment: remove Trixi.jl output directory if it exists
+outdir = "out"
+isdir(outdir) && rm(outdir, recursive = true)
+
+EXAMPLES_DIR = joinpath(examples_dir(), "p4est_2d_dgsem")
+
+@trixi_testset "elixir_advection_basic_gpu.jl native" begin
+    @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"),
+                        # Expected errors are exactly the same as with TreeMesh!
+                        l2=8.311947673061856e-6,
+                        linf=6.627000273229378e-5,)
+    # Ensure that we do not have excessive memory allocations
+    # (e.g., from type instabilities)
+    let
+        t = sol.t[end]
+        u_ode = sol.u[end]
+        du_ode = similar(u_ode)
+        @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000
+    end
+    @test real(ode.p.solver) == Float64
+    @test real(ode.p.solver.basis) == Float64
+    @test real(ode.p.solver.mortar) == Float64
+    # TODO: remake ignores the mesh itself as well
+    @test real(ode.p.mesh) == Float64
+
+    @test ode.u0 isa Array
+    @test ode.p.solver.basis.derivative_matrix isa Array
+
+    @test Trixi.storage_type(ode.p.cache.elements) === Array
+    @test Trixi.storage_type(ode.p.cache.interfaces) === Array
+    @test Trixi.storage_type(ode.p.cache.boundaries) === Array
+    @test Trixi.storage_type(ode.p.cache.mortars) === Array
+end
+
+@trixi_testset "elixir_advection_basic_gpu.jl Float32 / AMDGPU" begin
+    # Using AMDGPU inside the testset since otherwise the bindings are hidden by the anonymous modules
+    using AMDGPU
+    @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"),
+                        # Expected errors are exactly the same as with TreeMesh!
+                        l2=nothing, # TODO: GPU. [Float32(8.311947673061856e-6)],
+                        linf=nothing, # TODO: GPU. [Float32(6.627000273229378e-5)],
+                        RealT=Float32,
+                        real_type=Float32,
+                        storage_type=ROCArray,
+                        sol=nothing,) # TODO: GPU. Remove this once we can run the simulation on the GPU
+    # # Ensure that we do not have excessive memory allocations
+    # # (e.g., from type instabilities)
+    # let
+    #     t = sol.t[end]
+    #     u_ode = sol.u[end]
+    #     du_ode = similar(u_ode)
+    #     @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000
+    # end
+    @test real(ode.p.solver) == Float32
+    @test real(ode.p.solver.basis) == Float32
+    @test real(ode.p.solver.mortar) == Float32
+    # TODO: remake ignores the mesh itself as well
+    @test real(ode.p.mesh) == Float64
+
+    @test ode.u0 isa ROCArray
+    @test ode.p.solver.basis.derivative_matrix isa ROCArray
+
+    @test Trixi.storage_type(ode.p.cache.elements) === ROCArray
+    @test Trixi.storage_type(ode.p.cache.interfaces) === ROCArray
+    @test Trixi.storage_type(ode.p.cache.boundaries) === ROCArray
+    @test Trixi.storage_type(ode.p.cache.mortars) === ROCArray
+end
+
+# Clean up afterwards: delete Trixi.jl output directory
+@test_nowarn isdir(outdir) && rm(outdir, recursive = true)
+
+end # module
diff --git a/test/test_aqua.jl b/test/test_aqua.jl
index 9b3f2d67903..154088995ca 100644
--- a/test/test_aqua.jl
+++ b/test/test_aqua.jl
@@ -10,6 +10,7 @@ include("test_trixi.jl")
 @timed_testset "Aqua.jl" begin
     Aqua.test_all(Trixi,
                   ambiguities = false,
+                  unbound_args = false, # FIXME: UnstructuredSortedBoundaryTypes
                   # exceptions necessary for adding a new method `StartUpDG.estimate_h`
                   # in src/solvers/dgmulti/sbp.jl
                   piracies = (treat_as_own = [Trixi.StartUpDG.RefElemData,
diff --git a/test/test_cuda.jl b/test/test_cuda.jl
new file mode 100644
index 00000000000..4380ab0e111
--- /dev/null
+++ b/test/test_cuda.jl
@@ -0,0 +1,79 @@
+module TestCUDA
+
+using Test
+using Trixi
+
+include("test_trixi.jl")
+
+# Start with a clean environment: remove Trixi.jl output directory if it exists
+outdir = "out"
+isdir(outdir) && rm(outdir, recursive = true)
+
+EXAMPLES_DIR = joinpath(examples_dir(), "p4est_2d_dgsem")
+
+@trixi_testset "elixir_advection_basic_gpu.jl native" begin
+    @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"),
+                        # Expected errors are exactly the same as with TreeMesh!
+                        l2=8.311947673061856e-6,
+                        linf=6.627000273229378e-5,)
+    # Ensure that we do not have excessive memory allocations
+    # (e.g., from type instabilities)
+    let
+        t = sol.t[end]
+        u_ode = sol.u[end]
+        du_ode = similar(u_ode)
+        @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000
+    end
+    @test real(ode.p.solver) == Float64
+    @test real(ode.p.solver.basis) == Float64
+    @test real(ode.p.solver.mortar) == Float64
+    # TODO: remake ignores the mesh itself as well
+    @test real(ode.p.mesh) == Float64
+
+    @test ode.u0 isa Array
+    @test ode.p.solver.basis.derivative_matrix isa Array
+
+    @test Trixi.storage_type(ode.p.cache.elements) === Array
+    @test Trixi.storage_type(ode.p.cache.interfaces) === Array
+    @test Trixi.storage_type(ode.p.cache.boundaries) === Array
+    @test Trixi.storage_type(ode.p.cache.mortars) === Array
+end
+
+@trixi_testset "elixir_advection_basic_gpu.jl Float32 / CUDA" begin
+    # Using CUDA inside the testset since otherwise the bindings are hidden by the anonymous modules
+    using CUDA
+    @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"),
+                        # Expected errors are exactly the same as with TreeMesh!
+                        l2=nothing, # TODO: GPU. [Float32(8.311947673061856e-6)],
+                        linf=nothing, # TODO: GPU. [Float32(6.627000273229378e-5)],
+                        RealT=Float32,
+                        real_type=Float32,
+                        storage_type=CuArray,
+                        sol=nothing,) # TODO: GPU. Remove this once we can run the simulation on the GPU
+    # # Ensure that we do not have excessive memory allocations
+    # # (e.g., from type instabilities)
+    # let
+    #     t = sol.t[end]
+    #     u_ode = sol.u[end]
+    #     du_ode = similar(u_ode)
+    #     @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000
+    # end
+    @test real(ode.p.solver) == Float32
+    @test real(ode.p.solver.basis) == Float32
+    @test real(ode.p.solver.mortar) == Float32
+    # TODO: remake ignores the mesh itself as well
+    @test real(ode.p.mesh) == Float64
+
+    @test ode.u0 isa CuArray
+    @test ode.p.solver.basis.derivative_matrix isa CuArray
+
+    @test Trixi.storage_type(ode.p.cache.elements) === CuArray
+    @test Trixi.storage_type(ode.p.cache.interfaces) === CuArray
+    @test Trixi.storage_type(ode.p.cache.boundaries) === CuArray
+    @test Trixi.storage_type(ode.p.cache.mortars) === CuArray
+end
+
+# Clean up afterwards: delete Trixi.jl output directory
+@test_nowarn isdir(outdir) && rm(outdir, recursive = true)
+
+end # module
diff --git a/test/test_p4est_2d.jl b/test/test_p4est_2d.jl
index 8f903a849d2..04ebba71050 100644
--- a/test/test_p4est_2d.jl
+++ b/test/test_p4est_2d.jl
@@ -27,6 +27,34 @@ isdir(outdir) && rm(outdir, recursive = true)
         du_ode = similar(u_ode)
         @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000
     end
+    semi32 = Trixi.trixi_adapt(Array, Float32, semi)
+    @test real(semi32.solver) == Float32
+    @test real(semi32.solver.basis) == Float32
+    @test real(semi32.solver.mortar) == Float32
+    # TODO: remake ignores the mesh itself as well
+    @test real(semi32.mesh) == Float64
+end
+
+@trixi_testset "elixir_advection_basic.jl (Float32)" begin
+    @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"),
+                        # Expected errors are exactly the same as with TreeMesh!
+                        l2=[Float32(8.311947673061856e-6)],
+                        linf=[Float32(6.627000273229378e-5)],
+                        RealT=Float32,
+                        real_type=Float32)
+    # Ensure that we do not have excessive memory allocations
+    # (e.g., from type instabilities)
+    let
+        t = sol.t[end]
+        u_ode = sol.u[end]
+        du_ode = similar(u_ode)
+        @test_broken (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000
+    end
+    @test real(ode.p.solver) == Float32
+    @test real(ode.p.solver.basis) == Float32
+    @test real(ode.p.solver.mortar) == Float32
+    # TODO: remake ignores the mesh itself as well
+    @test real(ode.p.mesh) == Float64
 end
 
 @trixi_testset "elixir_advection_nonconforming_flag.jl" begin
diff --git a/test/test_unstructured_2d.jl b/test/test_unstructured_2d.jl
index d16bc96fb83..758e42b7da1 100644
--- a/test/test_unstructured_2d.jl
+++ b/test/test_unstructured_2d.jl
@@ -2,6 +2,7 @@ module TestExamplesUnstructuredMesh2D
 
 using Test
 using Trixi
+using Adapt
 
 include("test_trixi.jl")
 
@@ -32,6 +33,12 @@ isdir(outdir) && rm(outdir, recursive = true)
         du_ode = similar(u_ode)
         @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000
     end
+    semi32 = Trixi.trixi_adapt(Array, Float32, semi)
+    @test real(semi32.solver) == Float32
+    @test real(semi32.solver.basis) == Float32
+    @test real(semi32.solver.mortar) == Float32
+    # TODO: remake ignores the mesh as well
+    @test real(semi32.mesh) == Float64
 end
 
 @trixi_testset "elixir_euler_free_stream.jl" begin