diff --git a/examples/Project.toml b/examples/Project.toml new file mode 100644 index 00000000000..97780005dc5 --- /dev/null +++ b/examples/Project.toml @@ -0,0 +1,5 @@ +[deps] +OrdinaryDiffEqLowStorageRK = "b0944070-b475-4768-8dec-fb6eb410534d" +OrdinaryDiffEqSSPRK = "669c94d9-1f4b-4b64-b377-1aa079aa2388" +Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" +Trixi = "a7f1ee26-1774-49b1-8366-f1abc58fbfcb" diff --git a/examples/p4est_2d_dgsem/Project.toml b/examples/p4est_2d_dgsem/Project.toml new file mode 100644 index 00000000000..6b8f4dc58c1 --- /dev/null +++ b/examples/p4est_2d_dgsem/Project.toml @@ -0,0 +1,9 @@ +[deps] +AbbreviatedStackTraces = "ac637c84-cc71-43bf-9c33-c1b4316be3d4" +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +Infiltrator = "5903a43b-9cc3-4c30-8d17-598619ec4e9b" +OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed" +Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" +Revise = "295af30f-e4ad-537b-8983-00126c2a3abe" +Trixi = "a7f1ee26-1774-49b1-8366-f1abc58fbfcb" +Trixi2Vtk = "bc1476a1-1ca6-4cc3-950b-c312b255ff95" diff --git a/examples/p4est_2d_dgsem/elixir_advection_coupled.jl b/examples/p4est_2d_dgsem/elixir_advection_coupled.jl new file mode 100644 index 00000000000..9762eb96fd7 --- /dev/null +++ b/examples/p4est_2d_dgsem/elixir_advection_coupled.jl @@ -0,0 +1,94 @@ +using OrdinaryDiffEqSSPRK, OrdinaryDiffEqLowStorageRK +using Trixi + +############################################################################### +# Simplest coupled setup consisting of two non-trivial mesh views. + +advection_velocity = (0.2, -0.7) +equations = LinearScalarAdvectionEquation2D(advection_velocity) + +# Create DG solver with polynomial degree = 3 and (local) Lax-Friedrichs/Rusanov flux as surface flux +solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs) + +coordinates_min = (-1.0, -1.0) # minimum coordinates (min(x), min(y)) +coordinates_max = (1.0, 1.0) # maximum coordinates (max(x), max(y)) + +trees_per_dimension = (8, 8) + +# Create parent P4estMesh with 8 x 8 trees and 8 x 8 elements +# Since we couple through the boundaries, the periodicity does not matter here, +# but it is to trigger parts of the code for the test. +parent_mesh = P4estMesh(trees_per_dimension, polydeg = 3, + coordinates_min = coordinates_min, + coordinates_max = coordinates_max, + initial_refinement_level = 0, + periodicity = false) + +# Define the mesh views consisting of a small square in the center +# and a square ring around it. +cell_ids1 = vcat((1:18), (23:26), (31:34), (39:42), (47:64)) +mesh1 = P4estMeshView(parent_mesh, cell_ids1) +cell_ids2 = vcat((19:22), (27:30), (35:38), (43:46)) +mesh2 = P4estMeshView(parent_mesh, cell_ids2) + +# Define a trivial coupling function. +coupling_function = (x, u, equations_other, equations_own) -> u + +# Define a coupling function for each combination of interfaces. +coupling_functions = Array{Function}(undef, 2, 2) +coupling_functions[1, 1] = (x, u, equations_other, equations_own) -> u +coupling_functions[1, 2] = (x, u, equations_other, equations_own) -> u +coupling_functions[2, 1] = (x, u, equations_other, equations_own) -> u +coupling_functions[2, 2] = (x, u, equations_other, equations_own) -> u + +# The mesh is coupled across the physical boundaries, which makes this setup +# effectively double periodic. 
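+# Entry `coupling_functions[i, j]` converts the boundary state that system `i` receives
+# from system `j` (cf. the converter call in `BoundaryConditionCoupledP4est` below); here
+# all entries are the identity since both mesh views solve the same advection equation.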
+boundary_conditions = Dict(:x_neg => BoundaryConditionCoupledP4est(coupling_functions), + :y_neg => BoundaryConditionCoupledP4est(coupling_functions), + :y_pos => BoundaryConditionCoupledP4est(coupling_functions), + :x_pos => BoundaryConditionCoupledP4est(coupling_functions)) + +semi1 = SemidiscretizationHyperbolic(mesh1, equations, initial_condition_convergence_test, + solver, + boundary_conditions = boundary_conditions) +semi2 = SemidiscretizationHyperbolic(mesh2, equations, initial_condition_convergence_test, + solver, + boundary_conditions = boundary_conditions) + +# Create a semidiscretization that bundles semi1 and semi2 +semi = SemidiscretizationCoupledP4est(semi1, semi2) + +############################################################################### +# ODE solvers, callbacks etc. + +# Create ODE problem with time span from 0.0 to 2.0 +ode = semidiscretize(semi, (0.0, 2.0)) + +# At the beginning of the main loop, the SummaryCallback prints a summary of the simulation setup +# and resets the timers +summary_callback = SummaryCallback() + +# The AnalysisCallback allows to analyse the solution in regular intervals and prints the results +analysis_callback1 = AnalysisCallback(semi1, interval = 100) +analysis_callback2 = AnalysisCallback(semi2, interval = 100) +analysis_callback = AnalysisCallbackCoupledP4est(semi, analysis_callback1, + analysis_callback2) + +# The SaveSolutionCallback allows to save the solution to a file in regular intervals +save_solution = SaveSolutionCallback(interval = 100, + solution_variables = cons2prim) + +# The StepsizeCallback handles the re-calculation of the maximum Δt after each time step +stepsize_callback = StepsizeCallback(cfl = 1.6) + +# Create a CallbackSet to collect all callbacks such that they can be passed to the ODE solver +callbacks = CallbackSet(summary_callback, analysis_callback, save_solution, + stepsize_callback) + +############################################################################### +# run the simulation + +# OrdinaryDiffEq's `solve` method evolves the solution in time and executes the passed callbacks +sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false); + dt = 1.0, # solve needs some value here but it will be overwritten by the stepsize_callback + ode_default_options()..., callback = callbacks); diff --git a/examples/p4est_2d_dgsem/elixir_euler_mhd_coupled.jl b/examples/p4est_2d_dgsem/elixir_euler_mhd_coupled.jl new file mode 100644 index 00000000000..a69ded602d0 --- /dev/null +++ b/examples/p4est_2d_dgsem/elixir_euler_mhd_coupled.jl @@ -0,0 +1,126 @@ +using OrdinaryDiffEqSSPRK, OrdinaryDiffEqLowStorageRK +using Trixi + +############################################################################### +# One Euler system is coupled to one MHD system. + +# Pressure wave, same for the Euler system. +function initial_condition_mhd(x, t, equations::IdealGlmMhdEquations2D) + rho = ((1.0 + 0.01 * sin(x[1] * 2 * pi))) + v1 = ((0.01 * sin((x[1] - 1 / 2) * 2 * pi))) + v2 = 0.0 + v3 = 0.0 + p = rho^equations.gamma + B1 = 0.0 + B2 = 0.0 + B3 = 0.0 + psi = 0.0 + + return prim2cons(SVector(rho, v1, v2, v3, p, B1, B2, B3, psi), equations) +end + +# Pressure wave, same as for the MHD system. +function initial_condition_euler(x, t, equations::CompressibleEulerEquations2D) + rho = ((1.0 + 0.01 * sin(x[1] * 2 * pi))) + v1 = ((0.01 * sin((x[1] - 1 / 2) * 2 * pi))) + v2 = 0.0 + p = rho .^ equations.gamma + + return prim2cons(SVector(rho, v1, v2, p), equations) +end + +# Define the parent mesh. 
+coordinates_min = (-2.0, -2.0) # minimum coordinates (min(x), min(y))
+coordinates_max = (2.0, 2.0) # maximum coordinates (max(x), max(y))
+trees_per_dimension = (8, 8)
+# Here we set the periodicity to false for the coupling.
+# Since we couple through the physical boundaries, the system is effectively periodic.
+parent_mesh = P4estMesh(trees_per_dimension, polydeg = 3,
+                        coordinates_min = coordinates_min,
+                        coordinates_max = coordinates_max,
+                        initial_refinement_level = 0, periodicity = (false, false))
+
+equations1 = IdealGlmMhdEquations2D(5 / 3)
+equations2 = CompressibleEulerEquations2D(5 / 3)
+
+# Define the coupling function between every possible pair of systems.
+coupling_functions = Array{Function}(undef, 2, 2)
+coupling_functions[1, 1] = (x, u, equations_other, equations_own) -> u
+coupling_functions[1, 2] = (x, u, equations_other, equations_own) -> SVector(u[1], u[2],
+                                                                             u[3], 0.0,
+                                                                             u[4], 0.0, 0.0,
+                                                                             0.0, 0.0)
+coupling_functions[2, 1] = (x, u, equations_other, equations_own) -> SVector(u[1], u[2],
+                                                                             u[3], u[5])
+coupling_functions[2, 2] = (x, u, equations_other, equations_own) -> u
+
+# semi 1: MHD.
+cell_ids1 = vcat(Vector(1:8), Vector(32:64))
+mesh1 = P4estMeshView(parent_mesh, cell_ids1)
+volume_flux = (flux_hindenlang_gassner, flux_nonconservative_powell)
+solver1 = DGSEM(polydeg = 3,
+                surface_flux = (flux_lax_friedrichs, flux_nonconservative_powell),
+                volume_integral = VolumeIntegralFluxDifferencing(volume_flux))
+boundary_conditions1 = Dict(:x_neg => BoundaryConditionCoupledP4est(coupling_functions),
+                            :y_neg => BoundaryConditionCoupledP4est(coupling_functions),
+                            :y_pos => BoundaryConditionCoupledP4est(coupling_functions),
+                            :x_pos => BoundaryConditionCoupledP4est(coupling_functions))
+semi1 = SemidiscretizationHyperbolic(mesh1, equations1, initial_condition_mhd, solver1,
+                                     boundary_conditions = boundary_conditions1)
+
+# semi 2: Euler.
+cell_ids2 = Vector(9:31)
+mesh2 = P4estMeshView(parent_mesh, cell_ids2)
+solver2 = DGSEM(polydeg = 3, surface_flux = flux_hll,
+                volume_integral = VolumeIntegralWeakForm())
+boundary_conditions2 = Dict(:x_neg => BoundaryConditionCoupledP4est(coupling_functions),
+                            :y_neg => BoundaryConditionCoupledP4est(coupling_functions),
+                            :y_pos => BoundaryConditionCoupledP4est(coupling_functions),
+                            :x_pos => BoundaryConditionCoupledP4est(coupling_functions))
+semi2 = SemidiscretizationHyperbolic(mesh2, equations2, initial_condition_euler, solver2,
+                                     boundary_conditions = boundary_conditions2)
+
+# Create a semidiscretization that bundles semi1 and semi2
+semi = SemidiscretizationCoupledP4est(semi1, semi2)
+
+###############################################################################
+# ODE solvers, callbacks etc.
+
+ode = semidiscretize(semi, (0.0, 1.0));
+
+# At the beginning of the main loop, the SummaryCallback prints a summary of the simulation setup
+# and resets the timers
+summary_callback = SummaryCallback()
+
+# The AnalysisCallback allows to analyse the solution in regular intervals and prints the results
+analysis_callback1 = AnalysisCallback(semi1, interval = 100)
+analysis_callback2 = AnalysisCallback(semi2, interval = 100)
+analysis_callback = AnalysisCallbackCoupledP4est(semi, analysis_callback1,
+                                                 analysis_callback2)
+
+# The SaveSolutionCallback allows to save the solution to a file in regular intervals
+save_solution = SaveSolutionCallback(interval = 100,
+                                     solution_variables = cons2prim)
+
+# The StepsizeCallback handles the re-calculation of the maximum Δt after each time step
+stepsize_callback = StepsizeCallback(cfl = 0.8)
+
+glm_speed_callback = GlmSpeedCallback(glm_scale = 0.5, cfl = 0.8,
+                                      semi_indices = Vector([1]))
+
+# Create a CallbackSet to collect all callbacks such that they can be passed to the ODE solver
+callbacks = CallbackSet(summary_callback,
+                        # analysis_callback,
+                        save_solution,
+                        stepsize_callback,
+                        glm_speed_callback)
+
+###############################################################################
+# run the simulation
+
+# OrdinaryDiffEq's `solve` method evolves the solution in time and executes the passed callbacks
+sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false),
+            dt = 0.0001, # solve needs some value here but it will be overwritten by the stepsize_callback
+            save_everystep = false, callback = callbacks);
+
+# Print the timer summary
+summary_callback()
diff --git a/examples/structured_2d_dgsem/Project.toml b/examples/structured_2d_dgsem/Project.toml
new file mode 100644
index 00000000000..e887d5f6494
--- /dev/null
+++ b/examples/structured_2d_dgsem/Project.toml
@@ -0,0 +1,11 @@
+[deps]
+AbbreviatedStackTraces = "ac637c84-cc71-43bf-9c33-c1b4316be3d4"
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+Infiltrator = "5903a43b-9cc3-4c30-8d17-598619ec4e9b"
+OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
+OrdinaryDiffEqLowStorageRK = "b0944070-b475-4768-8dec-fb6eb410534d"
+OrdinaryDiffEqSSPRK = "669c94d9-1f4b-4b64-b377-1aa079aa2388"
+Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
+Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
+Trixi = "a7f1ee26-1774-49b1-8366-f1abc58fbfcb"
+Trixi2Vtk = "bc1476a1-1ca6-4cc3-950b-c312b255ff95"
diff --git a/examples/structured_2d_dgsem/test.jl b/examples/structured_2d_dgsem/test.jl
new file mode 100644
index 00000000000..c52662d1f0a
--- /dev/null
+++ b/examples/structured_2d_dgsem/test.jl
@@ -0,0 +1,81 @@
+# #Ignore this
+# using Pkg
+# Pkg.activate("..")
+
+using Trixi
+using OrdinaryDiffEqSSPRK, OrdinaryDiffEqLowStorageRK
+using Plots
+
+equations = ShallowWaterEquations2D(gravity_constant = 9.81)
+# Define the initial condition
+
+function initial_condition_test(x, t, equations::ShallowWaterEquations2D)
+    x1, x2 = x
+    v1 = 0.0
+    v2 = 0.0
+    b = 0.0
+    H = 1.0
+    return prim2cons(SVector(H, v1, v2, b), equations)
+end
+
+initial_condition = initial_condition_test
+
+# Define the Dirichlet boundary condition
+boundary_conditions = BoundaryConditionDirichlet(initial_condition_test)
+
+# Define the problem solver
+
+volume_flux = (flux_wintermeyer_etal, flux_nonconservative_wintermeyer_etal)
+solver = DGSEM(polydeg = 3,
+               surface_flux = (flux_lax_friedrichs, flux_nonconservative_fjordholm_etal),
+               volume_integral = VolumeIntegralFluxDifferencing(volume_flux))
+
+xl = 0.0;
+yl = 0.0;
+xr = 3.0;
+yr = 3.0;
+coordinates_min = (xl, yl)
+coordinates_max = (xr, yr)
+cells_per_dimension = (30, 30)
+
+# Define the mesh
+
+# If we use a tree mesh, the result is correct.
+#mesh = TreeMesh(coordinates_min, coordinates_max, initial_refinement_level = 5, n_cells_max = 10_000, periodicity = false)
+
+# If we use a structured mesh, it is wrong.
+mesh = StructuredMesh(cells_per_dimension, coordinates_min, coordinates_max,
+                      periodicity = false)
+
+# Create the semidiscretization object
+semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver,
+                                    boundary_conditions = boundary_conditions)
+
+###############################################################################
+# ODE solver
+
+tspan = (0.0, 2.0)
+ode = semidiscretize(semi, tspan)
+
+###############################################################################
+# Callbacks
+
+summary_callback = SummaryCallback()
+
+analysis_interval = 1000
+analysis_callback = AnalysisCallback(semi, interval = analysis_interval,
+                                     extra_analysis_integrals = (lake_at_rest_error,))
+
+alive_callback = AliveCallback(analysis_interval = analysis_interval)
+
+stepsize_callback = StepsizeCallback(cfl = 0.5)
+
+callbacks = CallbackSet(summary_callback, analysis_callback, alive_callback,
+                        stepsize_callback)
+
+###############################################################################
+# run the simulation
+
+sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false);
+            dt = 1.0, # solve needs some value here but it will be overwritten by the stepsize_callback
+            ode_default_options()..., callback = callbacks);
diff --git a/examples/structured_3d_dgsem/Project.toml b/examples/structured_3d_dgsem/Project.toml
new file mode 100644
index 00000000000..b6d1bdb2962
--- /dev/null
+++ b/examples/structured_3d_dgsem/Project.toml
@@ -0,0 +1,10 @@
+[deps]
+AbbreviatedStackTraces = "ac637c84-cc71-43bf-9c33-c1b4316be3d4"
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+Infiltrator = "5903a43b-9cc3-4c30-8d17-598619ec4e9b"
+OrdinaryDiffEqLowStorageRK = "b0944070-b475-4768-8dec-fb6eb410534d"
+OrdinaryDiffEqSSPRK = "669c94d9-1f4b-4b64-b377-1aa079aa2388"
+Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
+Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
+Trixi = "a7f1ee26-1774-49b1-8366-f1abc58fbfcb"
+Trixi2Vtk = "bc1476a1-1ca6-4cc3-950b-c312b255ff95"
diff --git a/examples/t8code_2d_dgsem/Project.toml b/examples/t8code_2d_dgsem/Project.toml
new file mode 100644
index 00000000000..6b8f4dc58c1
--- /dev/null
+++ b/examples/t8code_2d_dgsem/Project.toml
@@ -0,0 +1,9 @@
+[deps]
+AbbreviatedStackTraces = "ac637c84-cc71-43bf-9c33-c1b4316be3d4"
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+Infiltrator = "5903a43b-9cc3-4c30-8d17-598619ec4e9b"
+OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
+Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
+Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
+Trixi = "a7f1ee26-1774-49b1-8366-f1abc58fbfcb"
+Trixi2Vtk = "bc1476a1-1ca6-4cc3-950b-c312b255ff95"
diff --git a/src/Trixi.jl b/src/Trixi.jl
index 6b596bd74fb..96ec136df3e 100644
--- a/src/Trixi.jl
+++ b/src/Trixi.jl
@@ -147,6 +147,7 @@ include("semidiscretization/semidiscretization_hyperbolic.jl")
 include("semidiscretization/semidiscretization_hyperbolic_parabolic.jl")
 include("semidiscretization/semidiscretization_euler_acoustics.jl")
 include("semidiscretization/semidiscretization_coupled.jl")
+include("semidiscretization/semidiscretization_coupled_p4est.jl")
 include("time_integration/time_integration.jl")
 include("callbacks_step/callbacks_step.jl")
include("callbacks_stage/callbacks_stage.jl") @@ -229,7 +230,7 @@ export boundary_condition_do_nothing, BoundaryConditionNavierStokesWall, NoSlip, Slip, Adiabatic, Isothermal, - BoundaryConditionCoupled + BoundaryConditionCoupled, BoundaryConditionCoupledP4est export initial_condition_convergence_test, source_terms_convergence_test, source_terms_lorentz, source_terms_collision_ion_electron, @@ -291,14 +292,14 @@ export SemidiscretizationEulerGravity, ParametersEulerGravity, timestep_gravity_erk53_3Sstar!, timestep_gravity_carpenter_kennedy_erk54_2N! -export SemidiscretizationCoupled +export SemidiscretizationCoupled, SemidiscretizationCoupledP4est export SummaryCallback, SteadyStateCallback, AnalysisCallback, AliveCallback, SaveRestartCallback, SaveSolutionCallback, TimeSeriesCallback, VisualizationCallback, AveragingCallback, AMRCallback, StepsizeCallback, GlmSpeedCallback, LBMCollisionCallback, EulerAcousticsCouplingCallback, - TrivialCallback, AnalysisCallbackCoupled, + TrivialCallback, AnalysisCallbackCoupled, AnalysisCallbackCoupledP4est, AnalysisSurfaceIntegral, DragCoefficientPressure2D, LiftCoefficientPressure2D, DragCoefficientShearStress2D, LiftCoefficientShearStress2D, DragCoefficientPressure3D, LiftCoefficientPressure3D diff --git a/src/callbacks_step/analysis.jl b/src/callbacks_step/analysis.jl index 127f077be10..8228dd90a10 100644 --- a/src/callbacks_step/analysis.jl +++ b/src/callbacks_step/analysis.jl @@ -118,8 +118,10 @@ function AnalysisCallback(mesh, equations::AbstractEquations, solver, cache; # (total #steps) (#accepted steps) # We need to check the number of accepted steps since callbacks are not # activated after a rejected step. - condition = (u, t, integrator) -> interval > 0 && - (integrator.stats.naccept % interval == 0 || isfinished(integrator)) + condition = (u, t, + integrator) -> interval > 0 && + (integrator.stats.naccept % interval == 0 || + isfinished(integrator)) analyzer = SolutionAnalyzer(solver; kwargs...) cache_analysis = create_cache_analysis(analyzer, mesh, equations, solver, cache, @@ -156,7 +158,8 @@ function initialize!(cb::DiscreteCallback{Condition, Affect!}, u_ode, du_ode, t, analysis_callback = cb.affect! 
analysis_callback.initial_state_integrals = initial_state_integrals - @unpack save_analysis, output_directory, analysis_filename, analysis_errors, analysis_integrals = analysis_callback + @unpack save_analysis, output_directory, analysis_filename, analysis_errors, + analysis_integrals = analysis_callback if save_analysis && mpi_isroot() mkpath(output_directory) @@ -387,8 +390,9 @@ function (analysis_callback::AnalysisCallback)(io, du, u, u_ode, t, semi) if :l2_error in analysis_errors || :linf_error in analysis_errors # Calculate L2/Linf errors - l2_error, linf_error = calc_error_norms(u_ode, t, analyzer, semi, - cache_analysis) + l2_error, + linf_error = calc_error_norms(u_ode, t, analyzer, semi, + cache_analysis) if mpi_isroot() # L2 error @@ -454,8 +458,9 @@ function (analysis_callback::AnalysisCallback)(io, du, u, u_ode, t, semi) # L2/L∞ errors of the primitive variables if :l2_error_primitive in analysis_errors || :linf_error_primitive in analysis_errors - l2_error_prim, linf_error_prim = calc_error_norms(cons2prim, u_ode, t, analyzer, - semi, cache_analysis) + l2_error_prim, + linf_error_prim = calc_error_norms(cons2prim, u_ode, t, analyzer, + semi, cache_analysis) if mpi_isroot() print(" Variable: ") @@ -607,8 +612,9 @@ function (cb::DiscreteCallback{Condition, Affect!})(sol) where {Condition, @unpack analyzer = analysis_callback cache_analysis = analysis_callback.cache - l2_error, linf_error = calc_error_norms(sol.u[end], sol.t[end], analyzer, semi, - cache_analysis) + l2_error, + linf_error = calc_error_norms(sol.u[end], sol.t[end], analyzer, semi, + cache_analysis) (; l2 = l2_error, linf = linf_error) end @@ -703,8 +709,7 @@ end # have `VariableViscous` available. function analyze(quantity::AnalysisSurfaceIntegral{Variable}, du, u, t, - semi::SemidiscretizationHyperbolicParabolic) where { - Variable <: + semi::SemidiscretizationHyperbolicParabolic) where {Variable <: VariableViscous} mesh, equations, solver, cache = mesh_equations_solver_cache(semi) equations_parabolic = semi.equations_parabolic diff --git a/src/callbacks_step/analysis_dg2d.jl b/src/callbacks_step/analysis_dg2d.jl index fa18c5af63a..4de27ea66f6 100644 --- a/src/callbacks_step/analysis_dg2d.jl +++ b/src/callbacks_step/analysis_dg2d.jl @@ -147,7 +147,8 @@ function calc_error_norms(func, u, t, analyzer, initial_condition, dg::DGSEM, cache, cache_analysis) @unpack vandermonde, weights = analyzer @unpack node_coordinates, inverse_jacobian = cache.elements - @unpack u_local, u_tmp1, x_local, x_tmp1, jacobian_local, jacobian_tmp1 = cache_analysis + @unpack u_local, u_tmp1, x_local, x_tmp1, jacobian_local, + jacobian_tmp1 = cache_analysis # Set up data structures l2_error = zero(func(get_node_vars(u, equations, dg, 1, 1, 1), equations)) @@ -213,6 +214,7 @@ end function integrate_via_indices(func::Func, u, mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, UnstructuredMesh2D, P4estMesh{2}, + P4estMeshView{2}, T8codeMesh{2}}, equations, dg::DGSEM, cache, args...; normalize = true) where {Func} @@ -316,8 +318,10 @@ function analyze(::Val{:l2_divb}, du, u, t, dg, cache, derivative_matrix divb = zero(eltype(u)) # Get the contravariant vectors Ja^1 and Ja^2 - Ja11, Ja12 = get_contravariant_vector(1, contravariant_vectors, i, j, element) - Ja21, Ja22 = get_contravariant_vector(2, contravariant_vectors, i, j, element) + Ja11, + Ja12 = get_contravariant_vector(1, contravariant_vectors, i, j, element) + Ja21, + Ja22 = get_contravariant_vector(2, contravariant_vectors, i, j, element) # Compute the transformed divergence for k in 
eachnode(dg) u_kj = get_node_vars(u, equations, dg, k, j, element) @@ -377,10 +381,12 @@ function analyze(::Val{:linf_divb}, du, u, t, for j in eachnode(dg), i in eachnode(dg) divb = zero(eltype(u)) # Get the contravariant vectors Ja^1 and Ja^2 - Ja11, Ja12 = get_contravariant_vector(1, contravariant_vectors, - i, j, element) - Ja21, Ja22 = get_contravariant_vector(2, contravariant_vectors, - i, j, element) + Ja11, + Ja12 = get_contravariant_vector(1, contravariant_vectors, + i, j, element) + Ja21, + Ja22 = get_contravariant_vector(2, contravariant_vectors, + i, j, element) # Compute the transformed divergence for k in eachnode(dg) u_kj = get_node_vars(u, equations, dg, k, j, element) diff --git a/src/callbacks_step/glm_speed.jl b/src/callbacks_step/glm_speed.jl index 3c4089b1652..64a7d3fb5dd 100644 --- a/src/callbacks_step/glm_speed.jl +++ b/src/callbacks_step/glm_speed.jl @@ -66,7 +66,7 @@ function GlmSpeedCallback(; glm_scale = 0.5, cfl, semi_indices = Int[]) cfl_function, semi_indices) - DiscreteCallback(glm_speed_callback, glm_speed_callback, # the first one is the condition, the second the affect! + DiscreteCallback(glm_speed_callback, glm_speed_callback, # the first one is the condition, the second the effect! save_positions = (false, false), initialize = initialize!) end diff --git a/src/callbacks_step/save_solution.jl b/src/callbacks_step/save_solution.jl index ac40bc42de0..180c1bcc36c 100644 --- a/src/callbacks_step/save_solution.jl +++ b/src/callbacks_step/save_solution.jl @@ -49,6 +49,7 @@ mutable struct SaveSolutionCallback{IntervalType, SolutionVariablesType} output_directory::String solution_variables::SolutionVariablesType node_variables::Dict{Symbol, Any} + iter_offset::Int end function Base.show(io::IO, cb::DiscreteCallback{<:Any, <:SaveSolutionCallback}) @@ -119,7 +120,8 @@ function SaveSolutionCallback(; interval::Integer = 0, save_final_solution = true, output_directory = "out", solution_variables = cons2prim, - extra_node_variables = ()) + extra_node_variables = (), + iter_offset = 0) if !isnothing(dt) && interval > 0 throw(ArgumentError("You can either set the number of steps between output (using `interval`) or the time between outputs (using `dt`) but not both simultaneously")) end @@ -135,7 +137,7 @@ function SaveSolutionCallback(; interval::Integer = 0, solution_callback = SaveSolutionCallback(interval_or_dt, save_initial_solution, save_final_solution, output_directory, solution_variables, - node_variables) + node_variables, iter_offset) # Expected most frequent behavior comes first if isnothing(dt) @@ -164,7 +166,9 @@ function initialize_save_cb!(solution_callback::SaveSolutionCallback, u, t, inte mpi_isroot() && mkpath(solution_callback.output_directory) semi = integrator.p - @trixi_timeit timer() "I/O" save_mesh(semi, solution_callback.output_directory) + @trixi_timeit timer() "I/O" save_mesh(semi, solution_callback.output_directory, + integrator.iter + + solution_callback.iter_offset) if solution_callback.save_initial_solution solution_callback(integrator) @@ -237,7 +241,8 @@ function (solution_callback::SaveSolutionCallback)(integrator) # Call high-level functions that dispatch on semidiscretization type @trixi_timeit timer() "save mesh" save_mesh(semi, solution_callback.output_directory, - iter) + iter + + solution_callback.iter_offset) save_solution_file(semi, u_ode, solution_callback, integrator) end @@ -271,7 +276,10 @@ end @trixi_timeit timer() "get node variables" get_node_variables!(solution_callback.node_variables, u_ode, semi) - @trixi_timeit 
timer() "save solution" save_solution_file(u_ode, t, dt, iter, semi, + @trixi_timeit timer() "save solution" save_solution_file(u_ode, t, dt, + iter + + solution_callback.iter_offset, + semi, solution_callback, element_variables, solution_callback.node_variables, diff --git a/src/callbacks_step/stepsize_dg2d.jl b/src/callbacks_step/stepsize_dg2d.jl index d0bd3d49f32..9648ae6842c 100644 --- a/src/callbacks_step/stepsize_dg2d.jl +++ b/src/callbacks_step/stepsize_dg2d.jl @@ -84,7 +84,7 @@ end function max_dt(u, t, mesh::Union{StructuredMesh{2}, UnstructuredMesh2D, P4estMesh{2}, - T8codeMesh{2}, StructuredMeshView{2}}, + P4estMeshView{2}, T8codeMesh{2}, StructuredMeshView{2}}, constant_speed::False, equations, dg::DG, cache) # to avoid a division by zero if the speed vanishes everywhere, # e.g. for steady-state linear advection diff --git a/src/meshes/p4est_mesh_view.jl b/src/meshes/p4est_mesh_view.jl index 38493618a69..9219637b39e 100644 --- a/src/meshes/p4est_mesh_view.jl +++ b/src/meshes/p4est_mesh_view.jl @@ -36,6 +36,7 @@ end @inline Base.ndims(::P4estMeshView{NDIMS}) where {NDIMS} = NDIMS @inline Base.real(::P4estMeshView{NDIMS, NDIMS_AMBIENT, RealT}) where {NDIMS, NDIMS_AMBIENT, RealT} = RealT +@inline ncells(mesh::P4estMeshView) = length(mesh.cell_ids) function extract_p4est_mesh_view(elements_parent, interfaces_parent, @@ -60,14 +61,23 @@ function extract_p4est_mesh_view(elements_parent, mesh.cell_ids] @views elements.surface_flux_values .= elements_parent.surface_flux_values[.., mesh.cell_ids] - # Extract interfaces that belong to mesh view + # Extract interfaces that belong to mesh view. interfaces = extract_interfaces(mesh, interfaces_parent) - return elements, interfaces, boundaries_parent, mortars_parent + # Extract boundaries of this mesh view. + boundaries = extract_boundaries(mesh, boundaries_parent, interfaces_parent, + interfaces) + + # Get the global elements ids of the neighbors. + neighbor_ids_global = extract_neighbor_ids_global(mesh, boundaries_parent, + interfaces_parent, + boundaries) + + return elements, interfaces, boundaries, mortars_parent, neighbor_ids_global end # Remove all interfaces that have a tuple of neighbor_ids where at least one is -# not part of this meshview, i.e. mesh.cell_ids, and return the new interface container +# not part of this mesh view, i.e. mesh.cell_ids, and return the new interface container. function extract_interfaces(mesh::P4estMeshView, interfaces_parent) # Identify interfaces that need to be retained mask = BitArray(undef, ninterfaces(interfaces_parent)) @@ -99,24 +109,177 @@ function extract_interfaces(mesh::P4estMeshView, interfaces_parent) return interfaces end +# Remove all boundaries that are not part of this p4est mesh view and add new boundaries +# that were interfaces of the parent mesh. +function extract_boundaries(mesh::P4estMeshView, boundaries_parent, interfaces_parent, + interfaces) + # Remove all boundaries that are not part of this p4est mesh view. + boundaries = deepcopy(boundaries_parent) + mask = BitArray(undef, nboundaries(boundaries_parent)) + for boundary in 1:size(boundaries_parent.neighbor_ids)[1] + mask[boundary] = boundaries_parent.neighbor_ids[boundary] in mesh.cell_ids + end + boundaries.neighbor_ids = global_element_id_to_local(boundaries_parent.neighbor_ids[mask], + mesh) + boundaries.name = boundaries_parent.name[mask] + boundaries.node_indices = boundaries_parent.node_indices[mask] + + # Add new boundaries that were interfaces of the parent mesh. 
+ for interface in 1:size(interfaces_parent.neighbor_ids)[2] + if ((interfaces_parent.neighbor_ids[1, interface] in mesh.cell_ids) ⊻ + (interfaces_parent.neighbor_ids[2, interface] in mesh.cell_ids)) + # Determine which of the ids is part of the mesh view. + if interfaces_parent.neighbor_ids[1, interface] in mesh.cell_ids + neighbor_id = interfaces_parent.neighbor_ids[1, interface] + view_idx = 1 + else + neighbor_id = interfaces_parent.neighbor_ids[2, interface] + view_idx = 2 + end + + push!(boundaries.neighbor_ids, + global_element_id_to_local(neighbor_id, mesh)) + if interfaces_parent.node_indices[view_idx, interface] == (:end, :i_forward) + push!(boundaries.name, :x_pos) + elseif interfaces_parent.node_indices[view_idx, interface] == + (:begin, :i_forward) + push!(boundaries.name, :x_neg) + elseif interfaces_parent.node_indices[view_idx, interface] == + (:i_forward, :end) + push!(boundaries.name, :y_pos) + else + push!(boundaries.name, :y_neg) + end + + push!(boundaries.node_indices, + interfaces_parent.node_indices[view_idx, interface]) + end + end + + # Create the boundary vector for u, which will be populated later. + boundaries.u = zeros(typeof(boundaries_parent.u).parameters[1], + (size(boundaries_parent.u)[1], size(boundaries_parent.u)[2], + size(boundaries.node_indices)[end])) + + return boundaries +end + +# Extract the ids of the neighboring elements using the global indexing of the parent mesh. +function extract_neighbor_ids_global(mesh::P4estMeshView, boundaries_parent, + interfaces_parent, + boundaries) + # Determine the global indices of the boundaring elements. + neighbor_ids_global = zero.(boundaries.neighbor_ids) + for (idx, id) in enumerate(boundaries.neighbor_ids) + global_id = mesh.cell_ids[id] + # Find this id in the parent's interfaces. + for interface in eachindex(interfaces_parent.neighbor_ids[1, :]) + if global_id == interfaces_parent.neighbor_ids[1, interface] || + global_id == interfaces_parent.neighbor_ids[2, interface] + if global_id == interfaces_parent.neighbor_ids[1, interface] + matching_boundary = 1 + else + matching_boundary = 2 + end + # Check if interfaces with this id have the right name/node_indices. + if boundaries.name[idx] == + node_indices_to_name(interfaces_parent.node_indices[matching_boundary, + interface]) + if global_id == interfaces_parent.neighbor_ids[1, interface] + neighbor_ids_global[idx] = interfaces_parent.neighbor_ids[2, + interface] + else + neighbor_ids_global[idx] = interfaces_parent.neighbor_ids[1, + interface] + end + end + end + end + + # Find this id in the parent's boundaries. + parent_xneg_element_ids = boundaries_parent.neighbor_ids[boundaries_parent.name .== :x_neg] + parent_xpos_element_ids = boundaries_parent.neighbor_ids[boundaries_parent.name .== :x_pos] + parent_yneg_element_ids = boundaries_parent.neighbor_ids[boundaries_parent.name .== :y_neg] + parent_ypos_element_ids = boundaries_parent.neighbor_ids[boundaries_parent.name .== :y_pos] + for (parent_idx, boundary) in enumerate(boundaries_parent.neighbor_ids) + if global_id == boundary + # Check if boundaries with this id have the right name/node_indices. + if boundaries.name[idx] == boundaries_parent.name[parent_idx] + # Make the coupling periodic. 
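+                    # For an element on a physical boundary of the parent mesh, take the
+                    # element at the corresponding position on the opposite boundary
+                    # (x_neg <-> x_pos, y_neg <-> y_pos) as its coupling neighbor.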
+ if boundaries_parent.name[parent_idx] == :x_neg + neighbor_ids_global[idx] = parent_xpos_element_ids[findfirst(parent_xneg_element_ids .== + boundary)] + end + if boundaries_parent.name[parent_idx] == :x_pos + neighbor_ids_global[idx] = parent_xneg_element_ids[findfirst(parent_xpos_element_ids .== + boundary)] + end + if boundaries_parent.name[parent_idx] == :y_neg + neighbor_ids_global[idx] = parent_ypos_element_ids[findfirst(parent_yneg_element_ids .== + boundary)] + end + if boundaries_parent.name[parent_idx] == :y_pos + neighbor_ids_global[idx] = parent_yneg_element_ids[findfirst(parent_ypos_element_ids .== + boundary)] + end + end + end + end + end + + return neighbor_ids_global +end + +# Translate the interface indices into boundary names. +function node_indices_to_name(node_index) + if node_index == (:end, :i_forward) + return :x_pos + elseif node_index == (:begin, :i_forward) + return :x_neg + elseif node_index == (:i_forward, :end) + return :y_pos + elseif node_index == (:i_forward, :begin) + return :y_neg + else + error("Unknown node index: $node_index") + end +end + +# Convert a global cell id to a local cell id in the mesh view. +function global_element_id_to_local(id::Int, mesh::P4estMeshView) + # Return -1 if the id is not part of the mesh view. + local_id = -1 + # Find the index of the cell id in the mesh view + local_id = findfirst(==(id), mesh.cell_ids) + + return local_id +end + +# Convert an array of global cell ids to a local cell id in the mesh view. +function global_element_id_to_local(id::AbstractArray, mesh::P4estMeshView) + # Find the index of the cell id in the mesh view + local_id = zeros(Int, length(id)) + for i in eachindex(id) + local_id[i] = global_element_id_to_local(id[i], mesh) + end + + return local_id +end + # Does not save the mesh itself to an HDF5 file. Instead saves important attributes # of the mesh, like its size and the type of boundary mapping function. # Then, within Trixi2Vtk, the P4estMeshView and its node coordinates are reconstructured from # these attributes for plotting purposes # | Warning: This overwrites any existing mesh file, either for a mesh view or parent mesh. -function save_mesh_file(mesh::P4estMeshView, output_directory, timestep, - mpi_parallel::False) +function save_mesh_file(mesh::P4estMeshView, output_directory; system = "", + timestep = 0) # Create output directory (if it does not exist) mkpath(output_directory) # Determine file name based on existence of meaningful time step - if timestep > 0 - filename = joinpath(output_directory, @sprintf("mesh_%09d.h5", timestep)) - p4est_filename = @sprintf("p4est_data_%09d", timestep) - else - filename = joinpath(output_directory, "mesh.h5") - p4est_filename = "p4est_data" - end + filename = joinpath(output_directory, + @sprintf("mesh_%s_%09d.h5", system, timestep)) + p4est_filename = @sprintf("p4est_%s_data_%09d", system, timestep) p4est_file = joinpath(output_directory, p4est_filename) diff --git a/src/semidiscretization/semidiscretization_coupled_p4est.jl b/src/semidiscretization/semidiscretization_coupled_p4est.jl new file mode 100644 index 00000000000..f4a222b9900 --- /dev/null +++ b/src/semidiscretization/semidiscretization_coupled_p4est.jl @@ -0,0 +1,594 @@ +# By default, Julia/LLVM does not use fused multiply-add operations (FMAs). +# Since these FMAs can increase the performance of many numerical algorithms, +# we need to opt-in explicitly. +# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details. +@muladd begin +#! 
format: noindent + +""" + SemidiscretizationCoupledP4est + +Specialized semidiscretization routines for coupled problems using P4est mesh views. +This is analogous to the implementation for structured meshes. +[`semidiscretize`](@ref) will return an `ODEProblem` that synchronizes time steps between the semidiscretizations. +Each call of `rhs!` will call `rhs!` for each semidiscretization individually. +The semidiscretizations can be coupled by gluing meshes together using [`BoundaryConditionCoupled`](@ref). + +!!! warning "Experimental code" + This is an experimental feature and can change any time. +""" +mutable struct SemidiscretizationCoupledP4est{Semis, Indices, EquationList} <: + AbstractSemidiscretization + semis::Semis + u_indices::Indices # u_ode[u_indices[i]] is the part of u_ode corresponding to semis[i] + performance_counter::PerformanceCounter + global_element_ids::Vector{Int} + local_element_ids::Vector{Int} + element_offset::Vector{Int} + mesh_ids::Vector{Int} +end + +""" + SemidiscretizationCoupledP4est(semis...) + +Create a coupled semidiscretization that consists of the semidiscretizations passed as arguments. +""" +function SemidiscretizationCoupledP4est(semis...) + @assert all(semi -> ndims(semi) == ndims(semis[1]), semis) "All semidiscretizations must have the same dimension!" + + # Number of coefficients for each semidiscretization + n_coefficients = zeros(Int, length(semis)) + for i in 1:length(semis) + _, equations, _, _ = mesh_equations_solver_cache(semis[i]) + n_coefficients[i] = ndofs(semis[i]) * nvariables(equations) + end + + # Compute range of coefficients associated with each semidiscretization + u_indices = Vector{UnitRange{Int}}(undef, length(semis)) + for i in 1:length(semis) + offset = sum(n_coefficients[1:(i - 1)]) + 1 + u_indices[i] = range(offset, length = n_coefficients[i]) + end + + # Create correspondence between global (to the parent mesh) cell IDs and local (to the mesh view) cell IDs. 
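+    # `local_element_ids[g]` stores the index of global cell `g` within the mesh view that
+    # contains it (it stays 0 for cells that belong to no view), and `mesh_ids[g]` records
+    # which semidiscretization owns that cell.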
+ global_element_ids = 1:size(semis[1].mesh.parent.tree_node_coordinates)[end] + local_element_ids = zeros(Int, size(global_element_ids)) + element_offset = ones(Int, length(semis)) + mesh_ids = zeros(Int, size(global_element_ids)) + for i in 1:length(semis) + local_element_ids[semis[i].mesh.cell_ids] = global_element_id_to_local(global_element_ids[semis[i].mesh.cell_ids], + semis[i].mesh) + mesh_ids[semis[i].mesh.cell_ids] .= i + end + + performance_counter = PerformanceCounter() + + SemidiscretizationCoupledP4est{typeof(semis), typeof(u_indices), + typeof(performance_counter)}(semis, u_indices, + performance_counter, + global_element_ids, + local_element_ids, + element_offset, + mesh_ids) +end + +function Base.show(io::IO, semi::SemidiscretizationCoupledP4est) + @nospecialize semi # reduce precompilation time + + print(io, "SemidiscretizationCoupledP4est($(semi.semis))") +end + +function Base.show(io::IO, ::MIME"text/plain", semi::SemidiscretizationCoupledP4est) + @nospecialize semi # reduce precompilation time + + if get(io, :compact, false) + show(io, semi) + else + summary_header(io, "SemidiscretizationCoupledP4est") + summary_line(io, "#spatial dimensions", ndims(semi.semis[1])) + summary_line(io, "#systems", nsystems(semi)) + for i in eachsystem(semi) + summary_line(io, "system", i) + mesh, equations, solver, _ = mesh_equations_solver_cache(semi.semis[i]) + summary_line(increment_indent(io), "mesh", mesh |> typeof |> nameof) + summary_line(increment_indent(io), "equations", + equations |> typeof |> nameof) + summary_line(increment_indent(io), "initial condition", + semi.semis[i].initial_condition) + # no boundary conditions since that could be too much + summary_line(increment_indent(io), "source terms", + semi.semis[i].source_terms) + summary_line(increment_indent(io), "solver", solver |> typeof |> nameof) + end + summary_line(io, "total #DOFs per field", ndofsglobal(semi)) + summary_footer(io) + end +end + +function print_summary_semidiscretization(io::IO, semi::SemidiscretizationCoupledP4est) + show(io, MIME"text/plain"(), semi) + println(io, "\n") + for i in eachsystem(semi) + mesh, equations, solver, _ = mesh_equations_solver_cache(semi.semis[i]) + summary_header(io, "System #$i") + + summary_line(io, "mesh", mesh |> typeof |> nameof) + show(increment_indent(io), MIME"text/plain"(), mesh) + + summary_line(io, "equations", equations |> typeof |> nameof) + show(increment_indent(io), MIME"text/plain"(), equations) + + summary_line(io, "solver", solver |> typeof |> nameof) + show(increment_indent(io), MIME"text/plain"(), solver) + + summary_footer(io) + println(io, "\n") + end +end + +@inline Base.ndims(semi::SemidiscretizationCoupledP4est) = ndims(semi.semis[1]) + +@inline nsystems(semi::SemidiscretizationCoupledP4est) = length(semi.semis) + +@inline eachsystem(semi::SemidiscretizationCoupledP4est) = Base.OneTo(nsystems(semi)) + +@inline Base.real(semi::SemidiscretizationCoupledP4est) = promote_type(real.(semi.semis)...) + +@inline function Base.eltype(semi::SemidiscretizationCoupledP4est) + promote_type(eltype.(semi.semis)...) +end + +@inline function ndofs(semi::SemidiscretizationCoupledP4est) + sum(ndofs, semi.semis) +end + +""" + ndofsglobal(semi::SemidiscretizationCoupledP4est) + +Return the global number of degrees of freedom associated with each scalar variable across all MPI ranks, and summed up over all coupled systems. +This is the same as [`ndofs`](@ref) for simulations running in serial or +parallelized via threads. 
It will in general be different for simulations
+running in parallel with MPI.
+"""
+@inline function ndofsglobal(semi::SemidiscretizationCoupledP4est)
+    sum(ndofsglobal, semi.semis)
+end
+
+function compute_coefficients(t, semi::SemidiscretizationCoupledP4est)
+    @unpack u_indices = semi
+
+    u_ode = Vector{real(semi)}(undef, u_indices[end][end])
+
+    for i in eachsystem(semi)
+        # Call `compute_coefficients` in `src/semidiscretization/semidiscretization.jl`
+        u_ode[u_indices[i]] .= compute_coefficients(t, semi.semis[i])
+    end
+
+    return u_ode
+end
+
+@inline function get_system_u_ode(u_ode, index, semi::SemidiscretizationCoupledP4est)
+    return @view u_ode[semi.u_indices[index]]
+end
+
+function rhs!(du_ode, u_ode, semi::SemidiscretizationCoupledP4est, t)
+    time_start = time_ns()
+
+    n_nodes = length(semi.semis[1].mesh.parent.nodes)
+
+    # Determine how many ndofs*nvariables we have in the global solutions array.
+    global ndofs_nvars_global = 0
+    foreach_enumerate(semi.semis) do (i, semi_)
+        global ndofs_nvars_global
+        ndofs_nvars_global += nvariables(semi_.equations) * length(semi_.mesh.cell_ids)
+    end
+
+    # Determine the element index offset for the global solutions array.
+    for i in 2:nsystems(semi)
+        semi.element_offset[i] = semi.element_offset[i - 1] +
+                                 n_nodes^2 * nvariables(semi.semis[i - 1]) *
+                                 length(semi.semis[i - 1].mesh.cell_ids)
+    end
+
+    # Create the global solution vector.
+    u_global = Vector{real(semi)}(undef, ndofs_nvars_global * n_nodes^2)
+
+    # Extract the global solution vector from the local solutions.
+    foreach_enumerate(semi.semis) do (i, semi_)
+        u_loc = get_system_u_ode(u_ode, i, semi)
+        u_loc_reshape = reshape(u_loc,
+                                (nvariables(semi_.equations),
+                                 n_nodes, n_nodes,
+                                 Int(length(u_loc) /
+                                     (n_nodes^2 * nvariables(semi_.equations)))))
+        for i_node in 1:n_nodes, j_node in 1:n_nodes,
+            element in 1:Int(ndofs(semi) / n_nodes^2)
+
+            if element in semi_.mesh.cell_ids
+                for var in 1:nvariables(semi_.equations)
+                    u_global[semi.element_offset[i] +
+                             (var - 1) +
+                             nvariables(semi_.equations) * (i_node - 1) +
+                             nvariables(semi_.equations) * n_nodes * (j_node - 1) +
+                             nvariables(semi_.equations) * n_nodes^2 * (global_element_id_to_local(element, semi_.mesh) - 1)] = u_loc_reshape[var,
+                                                                                                                                              i_node,
+                                                                                                                                              j_node,
+                                                                                                                                              global_element_id_to_local(element,
+                                                                                                                                                                         semi_.mesh)]
+                end
+            end
+        end
+    end
+
+    # Call rhs! for each semidiscretization
+    foreach_enumerate(semi.semis) do (i, semi_)
+        u_loc = get_system_u_ode(u_ode, i, semi)
+        du_loc = get_system_u_ode(du_ode, i, semi)
+        rhs!(du_loc, u_loc, u_global, semi, semi_, t)
+    end
+
+    runtime = time_ns() - time_start
+    put!(semi.performance_counter, runtime)
+
+    return nothing
+end
+
+################################################################################
+### AnalysisCallback
+################################################################################
+
+"""
+    AnalysisCallbackCoupledP4est(semi, callbacks...)
+
+Combine multiple analysis callbacks for coupled simulations with a
+[`SemidiscretizationCoupledP4est`](@ref). For each coupled system, an individual
+[`AnalysisCallback`](@ref) **must** be created and passed to the `AnalysisCallbackCoupledP4est` **in
+order**, i.e., in the same sequence as the individual semidiscretizations are stored in the
+`SemidiscretizationCoupledP4est`.
+
+!!! warning "Experimental code"
+    This is an experimental feature and can change any time.
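+
+For example, with two coupled systems `semi1` and `semi2` bundled into `semi`, the
+callbacks can be combined as in `examples/p4est_2d_dgsem/elixir_advection_coupled.jl`:
+```julia
+analysis_callback1 = AnalysisCallback(semi1, interval = 100)
+analysis_callback2 = AnalysisCallback(semi2, interval = 100)
+analysis_callback = AnalysisCallbackCoupledP4est(semi, analysis_callback1,
+                                                 analysis_callback2)
+```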
+""" +struct AnalysisCallbackCoupledP4est{CB} + callbacks::CB +end + +# Convenience constructor for the coupled callback that gets called directly from the elixirs +function AnalysisCallbackCoupledP4est(semi_coupled, callbacks...) + if length(callbacks) != nsystems(semi_coupled) + error("an AnalysisCallbackCoupledP4est requires one AnalysisCallback for each semidiscretization") + end + + analysis_callback_coupled = AnalysisCallbackCoupledP4est{typeof(callbacks)}(callbacks) + + # This callback is triggered if any of its subsidiary callbacks' condition is triggered + condition = (u, t, integrator) -> any(callbacks) do callback + callback.condition(u, t, integrator) + end + + DiscreteCallback(condition, analysis_callback_coupled, + save_positions = (false, false), + initialize = initialize!) +end + +# This method gets called during initialization from OrdinaryDiffEq's `solve(...)` +function initialize!(cb_coupled::DiscreteCallback{Condition, Affect!}, u_ode_coupled, t, + integrator) where {Condition, + Affect! <: AnalysisCallbackCoupledP4est} + analysis_callback_coupled = cb_coupled.affect! + semi_coupled = integrator.p + du_ode_coupled = first(get_tmp_cache(integrator)) + + # Loop over coupled systems' callbacks and initialize them individually + # for i in eachsystem(semi_coupled) + # cb = analysis_callback_coupled.callbacks[i] + # semi = semi_coupled.semis[i] + # u_ode = get_system_u_ode(u_ode_coupled, i, semi_coupled) + # du_ode = get_system_u_ode(du_ode_coupled, i, semi_coupled) + # initialize!(cb, u_ode, du_ode, t, integrator, semi) + # end +end + +# This method gets called from OrdinaryDiffEq's `solve(...)` +function (analysis_callback_coupled::AnalysisCallbackCoupledP4est)(integrator) + semi_coupled = integrator.p + u_ode_coupled = integrator.u + du_ode_coupled = first(get_tmp_cache(integrator)) + + # Loop over coupled systems' callbacks and call them individually + # Due to changes in the rhs! calulcations this currently does not work. + # Taal: Get a proper coupled analysis callback working. + # for i in eachsystem(semi_coupled) + # @unpack condition = analysis_callback_coupled.callbacks[i] + # analysis_callback = analysis_callback_coupled.callbacks[i].affect! + # u_ode = get_system_u_ode(u_ode_coupled, i, semi_coupled) + + # # Check condition and skip callback if it is not yet its turn + # if !condition(u_ode, integrator.t, integrator) + # continue + # end + + # semi = semi_coupled.semis[i] + # du_ode = get_system_u_ode(du_ode_coupled, i, semi_coupled) + # analysis_callback(u_ode, du_ode, integrator, semi) + # end +end + +# used for error checks and EOC analysis +function (cb::DiscreteCallback{Condition, Affect!})(sol) where {Condition, + Affect! <: + AnalysisCallbackCoupledP4est + } + semi_coupled = sol.prob.p + u_ode_coupled = sol.u[end] + @unpack callbacks = cb.affect! + + uEltype = real(semi_coupled) + error_indices = Array([ + 1, + 1 .+ + cumsum(nvariables(semi_coupled.semis[i].equations) + for i in eachindex(semi_coupled.semis))[begin:end]... + ]) + length_error_array = sum(nvariables(semi_coupled.semis[i].equations) + for i in eachindex(semi_coupled.semis)) + l2_error_collection = uEltype[] + linf_error_collection = uEltype[] + + for i in eachsystem(semi_coupled) + # analysis_callback = callbacks[i].affect! + # @unpack analyzer = analysis_callback + # cache_analysis = analysis_callback.cache + + # semi = semi_coupled.semis[i] + # u_ode = get_system_u_ode(u_ode_coupled, i, semi_coupled) + + # Due to changes in the rhs! calulcations this currently does not work. 
+ # Taal: Get a proper coupled analysis callback working. + # l2_error, linf_error = calc_error_norms(u_ode, sol.t[end], analyzer, semi, + # cache_analysis) + l2_error = zeros(nvariables(semi_coupled.semis[i].equations)) + linf_error = zeros(nvariables(semi_coupled.semis[i].equations)) + append!(l2_error_collection, l2_error) + append!(linf_error_collection, linf_error) + end + + (; l2 = l2_error_collection, linf = linf_error_collection) +end + +################################################################################ +### SaveSolutionCallback +################################################################################ + +# Save mesh for a coupled semidiscretization, which contains multiple meshes internally +function save_mesh(semi::SemidiscretizationCoupledP4est, output_directory, timestep = 0) + for i in eachsystem(semi) + mesh, _, _, _ = mesh_equations_solver_cache(semi.semis[i]) + + if mesh.unsaved_changes + mesh.current_filename = save_mesh_file(mesh, output_directory; + system = string(i), + timestep = timestep) + mesh.unsaved_changes = false + end + end +end + +@inline function save_solution_file(semi::SemidiscretizationCoupledP4est, u_ode, + solution_callback, + integrator) + @unpack semis = semi + + for i in eachsystem(semi) + u_ode_slice = get_system_u_ode(u_ode, i, semi) + save_solution_file(semis[i], u_ode_slice, solution_callback, integrator, + system = i) + end +end + +################################################################################ +### StepsizeCallback +################################################################################ + +# In case of coupled system, use minimum timestep over all systems +# Case for constant `cfl_number`. +function calculate_dt(u_ode, t, cfl_advective, cfl_diffusive, + semi::SemidiscretizationCoupledP4est) + dt = minimum(eachsystem(semi)) do i + u_ode_slice = get_system_u_ode(u_ode, i, semi) + calculate_dt(u_ode_slice, t, cfl_advective, cfl_diffusive, semi.semis[i]) + end + + return dt +end + +function update_cleaning_speed!(semi_coupled::SemidiscretizationCoupledP4est, + glm_speed_callback, dt, t) + @unpack glm_scale, cfl, semi_indices = glm_speed_callback + + if length(semi_indices) == 0 + throw("Since you have more than one semidiscretization you need to specify the 'semi_indices' for which the GLM speed needs to be calculated.") + end + + # Check that all MHD semidiscretizations received a GLM cleaning speed update. + for (semi_index, semi) in enumerate(semi_coupled.semis) + if (typeof(semi.equations) <: AbstractIdealGlmMhdEquations && + !(semi_index in semi_indices)) + error("Equation of semidiscretization $semi_index needs to be included in 'semi_indices' of 'GlmSpeedCallback'.") + end + end + + if cfl isa Real # Case for constant CFL + cfl_number = cfl + else # Variable CFL + cfl_number = cfl(t) + end + + for semi_index in semi_indices + semi = semi_coupled.semis[semi_index] + mesh, equations, solver, cache = mesh_equations_solver_cache(semi) + + # compute time step for GLM linear advection equation with c_h=1 (redone due to the possible AMR) + c_h_deltat = calc_dt_for_cleaning_speed(cfl_number, + mesh, equations, solver, cache) + + # c_h is proportional to its own time step divided by the complete MHD time step + # We use @reset here since the equations are immutable (to work on GPUs etc.). + # Thus, we need to modify the equations field of the semidiscretization. 
+ @reset equations.c_h = glm_scale * c_h_deltat / dt + semi.equations = equations + end + + return semi_coupled +end + +################################################################################ +### Equations +################################################################################ + +""" + BoundaryConditionCoupled(other_semi_index, indices, uEltype, coupling_converter) + +Boundary condition to glue two meshes together. Solution values at the boundary +of another mesh will be used as boundary values. This requires the use +of [`SemidiscretizationCoupled`](@ref). The other mesh is specified by `other_semi_index`, +which is the index of the mesh in the tuple of semidiscretizations. + +Note that the elements and nodes of the two meshes at the coupled boundary must coincide. +This is currently only implemented for [`StructuredMesh`](@ref). + +# Arguments +- `other_semi_index`: the index in `SemidiscretizationCoupled` of the semidiscretization + from which the values are copied +- `indices::Tuple`: node/cell indices at the boundary of the mesh in the other + semidiscretization. See examples below. +- `uEltype::Type`: element type of solution +- `coupling_converter::CouplingConverter`: function to call for converting the solution + state of one system to the other system + +# Examples +```julia +# Connect the left boundary of mesh 2 to our boundary such that our positive +# boundary direction will match the positive y direction of the other boundary +BoundaryConditionCoupled(2, (:begin, :i), Float64, fun) + +# Connect the same two boundaries oppositely oriented +BoundaryConditionCoupled(2, (:begin, :i_backwards), Float64, fun) + +# Using this as y_neg boundary will connect `our_cells[i, 1, j]` to `other_cells[j, end-i, end]` +BoundaryConditionCoupled(2, (:j, :i_backwards, :end), Float64, fun) +``` + +!!! warning "Experimental code" + This is an experimental feature and can change any time. +""" +mutable struct BoundaryConditionCoupledP4est{CouplingConverter} + coupling_converter::CouplingConverter + + function BoundaryConditionCoupledP4est(coupling_converter) + new{typeof(coupling_converter)}(coupling_converter) + end +end + +function Base.eltype(boundary_condition::BoundaryConditionCoupledP4est) + eltype(boundary_condition.u_boundary) +end + +""" +Extract the boundary values from the neighboring element. +This requires values from other mesh views. +This currently only works for Cartesian meshes. 
+""" +function (boundary_condition::BoundaryConditionCoupledP4est)(u_inner, mesh, equations, + cache, + i_index, j_index, + element_index, + normal_direction, + surface_flux_function, + direction, + u_global, semi) + n_nodes = length(mesh.parent.nodes) + if abs(sum(normal_direction .* (1.0, 0.0))) > + abs(sum(normal_direction .* (0.0, 1.0))) + if sum(normal_direction .* (1.0, 0.0)) > sum(normal_direction .* (-1.0, 0.0)) + element_index_global = cache.neighbor_ids_global[findfirst((cache.boundaries.name .== + :x_pos) .* + (cache.boundaries.neighbor_ids .== + element_index))] + else + element_index_global = cache.neighbor_ids_global[findfirst((cache.boundaries.name .== + :x_neg) .* + (cache.boundaries.neighbor_ids .== + element_index))] + end + i_index_g = i_index + if i_index == n_nodes + i_index_g = 1 + elseif i_index == 1 + i_index_g = n_nodes + end + j_index_g = j_index + else + if sum(normal_direction .* (0.0, 1.0)) > sum(normal_direction .* (0.0, -1.0)) + element_index_global = cache.neighbor_ids_global[findfirst((cache.boundaries.name .== + :y_pos) .* + (cache.boundaries.neighbor_ids .== + element_index))] + else + element_index_global = cache.neighbor_ids_global[findfirst((cache.boundaries.name .== + :y_neg) .* + (cache.boundaries.neighbor_ids .== + element_index))] + end + j_index_g = j_index + if j_index == n_nodes + j_index_g = 1 + elseif j_index == 1 + j_index_g = n_nodes + end + i_index_g = i_index + end + + # Determine the index and semi of the other semidiscretization. + idx_other = semi.mesh_ids[element_index_global] + semi_other = semi.semis[idx_other] + + # Determine the index of this semidiscretization. + idx_this = semi.mesh_ids[mesh.cell_ids[element_index]] + + # @autoinfiltrate + # Get the neighboring value and convert it. + u_boundary_to_convert = Vector{real(semi)}(undef, nvariables(semi_other.equations)) + for var in 1:nvariables(semi_other.equations) + u_boundary_to_convert[var] = u_global[semi.element_offset[idx_other] + + (var - 1) + + nvariables(semi_other.equations) * (i_index_g - 1) + + nvariables(semi_other.equations) * n_nodes * (j_index_g - 1) + + nvariables(semi_other.equations) * n_nodes^2 * (global_element_id_to_local(element_index_global, semi_other.mesh) - 1)] + end + x = cache.elements.node_coordinates[:, i_index, j_index, element_index] + u_boundary = boundary_condition.coupling_converter[idx_this, idx_other](x, + u_boundary_to_convert, + equations, + equations) + + orientation = normal_direction + + # Calculate boundary flux + if surface_flux_function isa Tuple + # In case of conservative (index 1) and non-conservative (index 2) fluxes, + # add the non-conservative one with a factor of 1/2. + flux = (surface_flux_function[1](u_inner, u_boundary, orientation, + equations) + + 0.5f0 * + surface_flux_function[2](u_inner, u_boundary, orientation, + equations)) + # Look into the flux flux function and if we supply correct values. They might be caused by improper 'orientation' too. + # Check if there are any uinitialized values. Perhaps within the flux function. + # Run with boundscheck on. 
+ else + flux = surface_flux_function(u_inner, u_boundary, orientation, equations) + end + + return flux +end +end # @muladd diff --git a/src/semidiscretization/semidiscretization_hyperbolic.jl b/src/semidiscretization/semidiscretization_hyperbolic.jl index 2a563c02229..8a7eec9a802 100644 --- a/src/semidiscretization/semidiscretization_hyperbolic.jl +++ b/src/semidiscretization/semidiscretization_hyperbolic.jl @@ -409,4 +409,20 @@ function rhs!(du_ode, u_ode, semi::SemidiscretizationHyperbolic, t) return nothing end + +function rhs!(du_ode, u_ode, u_global, semis, + semi::SemidiscretizationHyperbolic, t) + @unpack mesh, equations, boundary_conditions, source_terms, solver, cache = semi + + u = wrap_array(u_ode, mesh, equations, solver, cache) + du = wrap_array(du_ode, mesh, equations, solver, cache) + + time_start = time_ns() + @trixi_timeit timer() "rhs!" rhs!(du, u, t, u_global, semis, mesh, equations, + boundary_conditions, source_terms, solver, cache) + runtime = time_ns() - time_start + put!(semi.performance_counter, runtime) + + return nothing +end end # @muladd diff --git a/src/solvers/dgsem_p4est/dg.jl b/src/solvers/dgsem_p4est/dg.jl index b0a8f0e07b0..f3dd57c922e 100644 --- a/src/solvers/dgsem_p4est/dg.jl +++ b/src/solvers/dgsem_p4est/dg.jl @@ -46,16 +46,16 @@ function create_cache(mesh::P4estMeshView, equations::AbstractEquations, dg::DG, mortars_parent = init_mortars(mesh.parent, equations, dg.basis, elements_parent) # Extract data for views. - elements, interfaces, boundaries, mortars = extract_p4est_mesh_view(elements_parent, - interfaces_parent, - boundaries_parent, - mortars_parent, - mesh, - equations, - dg, - uEltype) - - cache = (; elements, interfaces, boundaries, mortars) + elements, interfaces, boundaries, mortars, neighbor_ids_global = extract_p4est_mesh_view(elements_parent, + interfaces_parent, + boundaries_parent, + mortars_parent, + mesh, + equations, + dg, + uEltype) + + cache = (; elements, interfaces, boundaries, mortars, neighbor_ids_global) # Add specialized parts of the cache required to compute the volume integral etc. cache = (; cache..., diff --git a/src/solvers/dgsem_p4est/dg_2d.jl b/src/solvers/dgsem_p4est/dg_2d.jl index 47742895b25..4713a830863 100644 --- a/src/solvers/dgsem_p4est/dg_2d.jl +++ b/src/solvers/dgsem_p4est/dg_2d.jl @@ -216,8 +216,9 @@ end # Inlined version of the interface flux computation for equations with conservative and nonconservative terms @inline function calc_interface_flux!(surface_flux_values, - mesh::Union{P4estMesh{2}, T8codeMesh{2}}, - have_nonconservative_terms::True, equations, + mesh::Union{P4estMesh{2}, P4estMeshView{2}, + T8codeMesh{2}}, + nonconservative_terms::True, equations, surface_integral, dg::DG, cache, interface_index, normal_direction, primary_node_index, primary_direction_index, @@ -281,6 +282,35 @@ function prolong2boundaries!(cache, u, return nothing end +function prolong2boundaries!(cache, u, u_global, semis, + mesh::P4estMeshView{2}, + equations, surface_integral, dg::DG) + @unpack interfaces, boundaries = cache + index_range = eachnode(dg) + + @threaded for boundary in eachboundary(dg, cache) + # Copy solution data from the element using "delayed indexing" with + # a start value and a step size to get the correct face and orientation. 
+ element = boundaries.neighbor_ids[boundary] + node_indices = boundaries.node_indices[boundary] + + i_node_start, i_node_step = index_to_start_step_2d(node_indices[1], index_range) + j_node_start, j_node_step = index_to_start_step_2d(node_indices[2], index_range) + + i_node = i_node_start + j_node = j_node_start + for i in eachnode(dg) + for v in eachvariable(equations) + boundaries.u[v, i, boundary] = u[v, i_node, j_node, element] + end + i_node += i_node_step + j_node += j_node_step + end + end + + return nothing +end + function calc_boundary_flux!(cache, t, boundary_condition::BC, boundary_indexing, mesh::Union{P4estMesh{2}, T8codeMesh{2}}, equations, surface_integral, dg::DG) where {BC} @@ -318,6 +348,43 @@ function calc_boundary_flux!(cache, t, boundary_condition::BC, boundary_indexing return nothing end +function calc_boundary_flux!(cache, t, boundary_condition::BC, boundary_indexing, + mesh::P4estMeshView{2}, + equations, surface_integral, dg::DG, u_global, + semi) where {BC} + @unpack boundaries = cache + @unpack surface_flux_values = cache.elements + index_range = eachnode(dg) + + @threaded for local_index in eachindex(boundary_indexing) + # Use the local index to get the global boundary index from the pre-sorted list + boundary = boundary_indexing[local_index] + + # Get information on the adjacent element, compute the surface fluxes, + # and store them + element = boundaries.neighbor_ids[boundary] + node_indices = boundaries.node_indices[boundary] + direction = indices2direction(node_indices) + + i_node_start, i_node_step = index_to_start_step_2d(node_indices[1], index_range) + j_node_start, j_node_step = index_to_start_step_2d(node_indices[2], index_range) + + i_node = i_node_start + j_node = j_node_start + for node in eachnode(dg) + calc_boundary_flux!(surface_flux_values, t, boundary_condition, + mesh, have_nonconservative_terms(equations), + equations, surface_integral, dg, cache, + i_node, j_node, + node, direction, element, boundary, + u_global, semi) + + i_node += i_node_step + j_node += j_node_step + end + end +end + # inlined version of the boundary flux calculation along a physical interface @inline function calc_boundary_flux!(surface_flux_values, t, boundary_condition, mesh::Union{P4estMesh{2}, T8codeMesh{2}}, @@ -351,6 +418,71 @@ end return nothing end +# inlined version of the boundary flux calculation along a physical interface +# Currently this is only a dummy function to keep the test quiet. +# TODO: Implement correct flux taking the global u vector into account. +@inline function calc_boundary_flux!(surface_flux_values, t, boundary_condition, + mesh::P4estMeshView{2}, + nonconservative_terms::False, equations, + surface_integral, dg::DG, cache, + i_index, j_index, + node_index, direction_index, element_index, + boundary_index) + @unpack boundaries = cache + @unpack node_coordinates, contravariant_vectors = cache.elements + @unpack surface_flux = surface_integral + + # Extract solution data from boundary container + u_inner = get_node_vars(boundaries.u, equations, dg, node_index, boundary_index) + + # Outward-pointing normal direction (not normalized) + normal_direction = get_normal_direction(direction_index, contravariant_vectors, + i_index, j_index, element_index) + + # Coordinates at boundary node + x = get_node_coords(node_coordinates, equations, dg, + i_index, j_index, element_index) + + # TODO: We need a proper flux calculation here. 
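+    # A minimal sketch of what this could eventually look like, assuming the coupled
+    # neighbor values were made available here as some `u_boundary` (a hypothetical
+    # name, e.g. extracted from the global solution vector as done in
+    # `BoundaryConditionCoupledP4est`):
+    #     flux_ = surface_flux(u_inner, u_boundary, normal_direction, equations)
+    # Until then, a zero flux is returned so the surrounding machinery can be exercised.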
+ flux_ = zeros(nvariables(equations)) + + # Copy flux to element storage in the correct orientation + for v in eachvariable(equations) + surface_flux_values[v, node_index, direction_index, element_index] = flux_[v] + end + + return nothing +end + +# inlined version of the boundary flux calculation along a physical interface +@inline function calc_boundary_flux!(surface_flux_values, t, boundary_condition, + mesh::P4estMeshView{2}, + nonconservative_terms, equations, + surface_integral, dg::DG, cache, + i_index, j_index, + node_index, direction_index, element_index, + boundary_index, u_global, semi) + @unpack boundaries = cache + @unpack contravariant_vectors = cache.elements + @unpack surface_flux = surface_integral + + # Extract solution data from boundary container + u_inner = get_node_vars(boundaries.u, equations, dg, node_index, boundary_index) + + # Outward-pointing normal direction (not normalized) + normal_direction = get_normal_direction(direction_index, contravariant_vectors, + i_index, j_index, element_index) + + flux_ = boundary_condition(u_inner, mesh, equations, cache, i_index, j_index, + element_index, normal_direction, surface_flux, + normal_direction, u_global, semi) + + # Copy flux to element storage in the correct orientation + for v in eachvariable(equations) + surface_flux_values[v, node_index, direction_index, element_index] = flux_[v] + end +end + # inlined version of the boundary flux with nonconservative terms calculation along a physical interface @inline function calc_boundary_flux!(surface_flux_values, t, boundary_condition, mesh::Union{P4estMesh{2}, T8codeMesh{2}}, diff --git a/src/solvers/dgsem_p4est/dg_2d.jl.bkp b/src/solvers/dgsem_p4est/dg_2d.jl.bkp new file mode 100644 index 00000000000..6a7bbbb71d7 --- /dev/null +++ b/src/solvers/dgsem_p4est/dg_2d.jl.bkp @@ -0,0 +1,679 @@ +# By default, Julia/LLVM does not use fused multiply-add operations (FMAs). +# Since these FMAs can increase the performance of many numerical algorithms, +# we need to opt-in explicitly. +# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details. +@muladd begin +#! format: noindent + +# The methods below are specialized on the mortar type +# and called from the basic `create_cache` method at the top. +function create_cache(mesh::Union{P4estMesh{2}, T8codeMesh{2}}, equations, + mortar_l2::LobattoLegendreMortarL2, uEltype) + # TODO: Taal performance using different types + MA2d = MArray{Tuple{nvariables(equations), nnodes(mortar_l2)}, + uEltype, 2, + nvariables(equations) * nnodes(mortar_l2)} + fstar_primary_upper_threaded = MA2d[MA2d(undef) for _ in 1:Threads.nthreads()] + fstar_primary_lower_threaded = MA2d[MA2d(undef) for _ in 1:Threads.nthreads()] + fstar_secondary_upper_threaded = MA2d[MA2d(undef) for _ in 1:Threads.nthreads()] + fstar_secondary_lower_threaded = MA2d[MA2d(undef) for _ in 1:Threads.nthreads()] + u_threaded = MA2d[MA2d(undef) for _ in 1:Threads.nthreads()] + + (; fstar_primary_upper_threaded, fstar_primary_lower_threaded, + fstar_secondary_upper_threaded, fstar_secondary_lower_threaded, + u_threaded) +end + +# index_to_start_step_2d(index::Symbol, index_range) +# +# Given a symbolic `index` and an `indexrange` (usually `eachnode(dg)`), +# return `index_start, index_step`, i.e., a tuple containing +# - `index_start`, an index value to begin a loop +# - `index_step`, an index step to update during a loop +# The resulting indices translate surface indices to volume indices. +# +# !!! 
warning +# This assumes that loops using the return values are written as +# +# i_volume_start, i_volume_step = index_to_start_step_2d(symbolic_index_i, index_range) +# j_volume_start, j_volume_step = index_to_start_step_2d(symbolic_index_j, index_range) +# +# i_volume, j_volume = i_volume_start, j_volume_start +# for i_surface in index_range +# # do stuff with `i_surface` and `(i_volume, j_volume)` +# +# i_volume += i_volume_step +# j_volume += j_volume_step +# end +@inline function index_to_start_step_2d(index::Symbol, index_range) + index_begin = first(index_range) + index_end = last(index_range) + + if index === :begin + return index_begin, 0 + elseif index === :end + return index_end, 0 + elseif index === :i_forward + return index_begin, 1 + else # if index === :i_backward + return index_end, -1 + end +end + +# We pass the `surface_integral` argument solely for dispatch +function prolong2interfaces!(cache, u, + mesh::Union{P4estMesh{2}, T8codeMesh{2}}, + equations, surface_integral, dg::DG) + @unpack interfaces = cache + index_range = eachnode(dg) + + @threaded for interface in eachinterface(dg, cache) + # Copy solution data from the primary element using "delayed indexing" with + # a start value and a step size to get the correct face and orientation. + # Note that in the current implementation, the interface will be + # "aligned at the primary element", i.e., the index of the primary side + # will always run forwards. + primary_element = interfaces.neighbor_ids[1, interface] + primary_indices = interfaces.node_indices[1, interface] + + i_primary_start, i_primary_step = index_to_start_step_2d(primary_indices[1], + index_range) + j_primary_start, j_primary_step = index_to_start_step_2d(primary_indices[2], + index_range) + + i_primary = i_primary_start + j_primary = j_primary_start + for i in eachnode(dg) + for v in eachvariable(equations) + interfaces.u[1, v, i, interface] = u[v, i_primary, j_primary, + primary_element] + end + i_primary += i_primary_step + j_primary += j_primary_step + end + + # Copy solution data from the secondary element using "delayed indexing" with + # a start value and a step size to get the correct face and orientation. 
+ secondary_element = interfaces.neighbor_ids[2, interface] + secondary_indices = interfaces.node_indices[2, interface] + + i_secondary_start, i_secondary_step = index_to_start_step_2d(secondary_indices[1], + index_range) + j_secondary_start, j_secondary_step = index_to_start_step_2d(secondary_indices[2], + index_range) + + i_secondary = i_secondary_start + j_secondary = j_secondary_start + for i in eachnode(dg) + for v in eachvariable(equations) + interfaces.u[2, v, i, interface] = u[v, i_secondary, j_secondary, + secondary_element] + end + i_secondary += i_secondary_step + j_secondary += j_secondary_step + end + end + + return nothing +end + +function calc_interface_flux!(surface_flux_values, + mesh::Union{P4estMesh{2}, T8codeMesh{2}}, + nonconservative_terms, + equations, surface_integral, dg::DG, cache) + @unpack neighbor_ids, node_indices = cache.interfaces + @unpack contravariant_vectors = cache.elements + index_range = eachnode(dg) + index_end = last(index_range) + + @threaded for interface in eachinterface(dg, cache) + # Get element and side index information on the primary element + primary_element = neighbor_ids[1, interface] + primary_indices = node_indices[1, interface] + primary_direction = indices2direction(primary_indices) + + # Create the local i,j indexing on the primary element used to pull normal direction information + i_primary_start, i_primary_step = index_to_start_step_2d(primary_indices[1], + index_range) + j_primary_start, j_primary_step = index_to_start_step_2d(primary_indices[2], + index_range) + + i_primary = i_primary_start + j_primary = j_primary_start + + # Get element and side index information on the secondary element + secondary_element = neighbor_ids[2, interface] + secondary_indices = node_indices[2, interface] + secondary_direction = indices2direction(secondary_indices) + + # Initiate the secondary index to be used in the surface for loop. + # This index on the primary side will always run forward but + # the secondary index might need to run backwards for flipped sides. + if :i_backward in secondary_indices + node_secondary = index_end + node_secondary_step = -1 + else + node_secondary = 1 + node_secondary_step = 1 + end + + for node in eachnode(dg) + # Get the normal direction on the primary element. + # Contravariant vectors at interfaces in negative coordinate direction + # are pointing inwards. This is handled by `get_normal_direction`. 
+ normal_direction = get_normal_direction(primary_direction, + contravariant_vectors, + i_primary, j_primary, + primary_element) + + calc_interface_flux!(surface_flux_values, mesh, nonconservative_terms, + equations, + surface_integral, dg, cache, + interface, normal_direction, + node, primary_direction, primary_element, + node_secondary, secondary_direction, secondary_element) + + # Increment primary element indices to pull the normal direction + i_primary += i_primary_step + j_primary += j_primary_step + # Increment the surface node index along the secondary element + node_secondary += node_secondary_step + end + end + + return nothing +end + +# Inlined version of the interface flux computation for conservation laws +@inline function calc_interface_flux!(surface_flux_values, + mesh::Union{P4estMesh{2}, T8codeMesh{2}}, + nonconservative_terms::False, equations, + surface_integral, dg::DG, cache, + interface_index, normal_direction, + primary_node_index, primary_direction_index, + primary_element_index, + secondary_node_index, secondary_direction_index, + secondary_element_index) + @unpack u = cache.interfaces + @unpack surface_flux = surface_integral + + u_ll, u_rr = get_surface_node_vars(u, equations, dg, primary_node_index, + interface_index) + + flux_ = surface_flux(u_ll, u_rr, normal_direction, equations) + + for v in eachvariable(equations) + surface_flux_values[v, primary_node_index, primary_direction_index, primary_element_index] = flux_[v] + surface_flux_values[v, secondary_node_index, secondary_direction_index, secondary_element_index] = -flux_[v] + end +end + +# Inlined version of the interface flux computation for equations with conservative and nonconservative terms +@inline function calc_interface_flux!(surface_flux_values, + mesh::Union{P4estMesh{2}, T8codeMesh{2}}, + nonconservative_terms::True, equations, + surface_integral, dg::DG, cache, + interface_index, normal_direction, + primary_node_index, primary_direction_index, + primary_element_index, + secondary_node_index, secondary_direction_index, + secondary_element_index) + @unpack u = cache.interfaces + surface_flux, nonconservative_flux = surface_integral.surface_flux + + u_ll, u_rr = get_surface_node_vars(u, equations, dg, primary_node_index, + interface_index) + + flux_ = surface_flux(u_ll, u_rr, normal_direction, equations) + + # Compute both nonconservative fluxes + noncons_primary = nonconservative_flux(u_ll, u_rr, normal_direction, equations) + noncons_secondary = nonconservative_flux(u_rr, u_ll, normal_direction, equations) + + # Store the flux with nonconservative terms on the primary and secondary elements + for v in eachvariable(equations) + # Note the factor 0.5 necessary for the nonconservative fluxes based on + # the interpretation of global SBP operators coupled discontinuously via + # central fluxes/SATs + surface_flux_values[v, primary_node_index, primary_direction_index, primary_element_index] = (flux_[v] + + 0.5f0 * + noncons_primary[v]) + surface_flux_values[v, secondary_node_index, secondary_direction_index, secondary_element_index] = -(flux_[v] + + 0.5f0 * + noncons_secondary[v]) + end +end + +function prolong2boundaries!(cache, u, + mesh::Union{P4estMesh{2}, T8codeMesh{2}}, + equations, surface_integral, dg::DG) + @unpack boundaries = cache + index_range = eachnode(dg) + + @threaded for boundary in eachboundary(dg, cache) + # Copy solution data from the element using "delayed indexing" with + # a start value and a step size to get the correct face and orientation. 
+ element = boundaries.neighbor_ids[boundary] + node_indices = boundaries.node_indices[boundary] + + i_node_start, i_node_step = index_to_start_step_2d(node_indices[1], index_range) + j_node_start, j_node_step = index_to_start_step_2d(node_indices[2], index_range) + + i_node = i_node_start + j_node = j_node_start + for i in eachnode(dg) + for v in eachvariable(equations) + boundaries.u[v, i, boundary] = u[v, i_node, j_node, element] + end + i_node += i_node_step + j_node += j_node_step + end + end + + return nothing +end + +function calc_boundary_flux!(cache, t, boundary_condition::BC, boundary_indexing, + mesh::Union{P4estMesh{2}, T8codeMesh{2}}, + equations, surface_integral, dg::DG) where {BC} + @unpack boundaries = cache + @unpack surface_flux_values = cache.elements + index_range = eachnode(dg) + + @threaded for local_index in eachindex(boundary_indexing) + # Use the local index to get the global boundary index from the pre-sorted list + boundary = boundary_indexing[local_index] + + # Get information on the adjacent element, compute the surface fluxes, + # and store them + element = boundaries.neighbor_ids[boundary] + node_indices = boundaries.node_indices[boundary] + direction = indices2direction(node_indices) + + i_node_start, i_node_step = index_to_start_step_2d(node_indices[1], index_range) + j_node_start, j_node_step = index_to_start_step_2d(node_indices[2], index_range) + + i_node = i_node_start + j_node = j_node_start + for node in eachnode(dg) + calc_boundary_flux!(surface_flux_values, t, boundary_condition, + mesh, have_nonconservative_terms(equations), + equations, surface_integral, dg, cache, + i_node, j_node, + node, direction, element, boundary) + + i_node += i_node_step + j_node += j_node_step + end + end +end + +# inlined version of the boundary flux calculation along a physical interface +@inline function calc_boundary_flux!(surface_flux_values, t, boundary_condition, + mesh::Union{P4estMesh{2}, T8codeMesh{2}}, + nonconservative_terms::False, equations, + surface_integral, dg::DG, cache, + i_index, j_index, + node_index, direction_index, element_index, + boundary_index) + @unpack boundaries = cache + @unpack node_coordinates, contravariant_vectors = cache.elements + @unpack surface_flux = surface_integral + + # Extract solution data from boundary container + u_inner = get_node_vars(boundaries.u, equations, dg, node_index, boundary_index) + + # Outward-pointing normal direction (not normalized) + normal_direction = get_normal_direction(direction_index, contravariant_vectors, + i_index, j_index, element_index) + + # Coordinates at boundary node + x = get_node_coords(node_coordinates, equations, dg, i_index, j_index, + element_index) + + flux_ = boundary_condition(u_inner, normal_direction, x, t, surface_flux, equations) + + # Copy flux to element storage in the correct orientation + for v in eachvariable(equations) + surface_flux_values[v, node_index, direction_index, element_index] = flux_[v] + end +end + +# inlined version of the boundary flux with nonconservative terms calculation along a physical interface +@inline function calc_boundary_flux!(surface_flux_values, t, boundary_condition, + mesh::Union{P4estMesh{2}, T8codeMesh{2}}, + nonconservative_terms::True, equations, + surface_integral, dg::DG, cache, + i_index, j_index, + node_index, direction_index, element_index, + boundary_index) + @unpack boundaries = cache + @unpack node_coordinates, contravariant_vectors = cache.elements + surface_flux, nonconservative_flux = surface_integral.surface_flux + + # Extract 
solution data from boundary container + u_inner = get_node_vars(boundaries.u, equations, dg, node_index, boundary_index) + + # Outward-pointing normal direction (not normalized) + normal_direction = get_normal_direction(direction_index, contravariant_vectors, + i_index, j_index, element_index) + + # Coordinates at boundary node + x = get_node_coords(node_coordinates, equations, dg, i_index, j_index, + element_index) + + # Call pointwise numerical flux function for the conservative part + # in the normal direction on the boundary + flux_ = boundary_condition(u_inner, normal_direction, x, t, surface_flux, equations) + + # Compute pointwise nonconservative numerical flux at the boundary. + noncons_ = boundary_condition(u_inner, normal_direction, x, t, nonconservative_flux, + equations) + + # Copy flux to element storage in the correct orientation + for v in eachvariable(equations) + # Note the factor 0.5 necessary for the nonconservative fluxes based on + # the interpretation of global SBP operators coupled discontinuously via + # central fluxes/SATs + surface_flux_values[v, node_index, direction_index, element_index] = flux_[v] + + 0.5f0 * + noncons_[v] + end +end + +function prolong2mortars!(cache, u, + mesh::Union{P4estMesh{2}, T8codeMesh{2}}, equations, + mortar_l2::LobattoLegendreMortarL2, + dg::DGSEM) + @unpack neighbor_ids, node_indices = cache.mortars + index_range = eachnode(dg) + + @threaded for mortar in eachmortar(dg, cache) + # Copy solution data from the small elements using "delayed indexing" with + # a start value and a step size to get the correct face and orientation. + small_indices = node_indices[1, mortar] + + i_small_start, i_small_step = index_to_start_step_2d(small_indices[1], + index_range) + j_small_start, j_small_step = index_to_start_step_2d(small_indices[2], + index_range) + + for position in 1:2 + i_small = i_small_start + j_small = j_small_start + element = neighbor_ids[position, mortar] + for i in eachnode(dg) + for v in eachvariable(equations) + cache.mortars.u[1, v, position, i, mortar] = u[v, i_small, j_small, + element] + end + i_small += i_small_step + j_small += j_small_step + end + end + + # Buffer to copy solution values of the large element in the correct orientation + # before interpolating + u_buffer = cache.u_threaded[Threads.threadid()] + + # Copy solution of large element face to buffer in the + # correct orientation + large_indices = node_indices[2, mortar] + + i_large_start, i_large_step = index_to_start_step_2d(large_indices[1], + index_range) + j_large_start, j_large_step = index_to_start_step_2d(large_indices[2], + index_range) + + i_large = i_large_start + j_large = j_large_start + element = neighbor_ids[3, mortar] + for i in eachnode(dg) + for v in eachvariable(equations) + u_buffer[v, i] = u[v, i_large, j_large, element] + end + i_large += i_large_step + j_large += j_large_step + end + + # Interpolate large element face data from buffer to small face locations + multiply_dimensionwise!(view(cache.mortars.u, 2, :, 1, :, mortar), + mortar_l2.forward_lower, + u_buffer) + multiply_dimensionwise!(view(cache.mortars.u, 2, :, 2, :, mortar), + mortar_l2.forward_upper, + u_buffer) + end + + return nothing +end + +function calc_mortar_flux!(surface_flux_values, + mesh::Union{P4estMesh{2}, T8codeMesh{2}}, + nonconservative_terms, equations, + mortar_l2::LobattoLegendreMortarL2, + surface_integral, dg::DG, cache) + @unpack neighbor_ids, node_indices = cache.mortars + @unpack contravariant_vectors = cache.elements + @unpack 
(fstar_primary_upper_threaded, fstar_primary_lower_threaded, + fstar_secondary_upper_threaded, fstar_secondary_lower_threaded) = cache + index_range = eachnode(dg) + + @threaded for mortar in eachmortar(dg, cache) + # Choose thread-specific pre-allocated container + fstar_primary = (fstar_primary_lower_threaded[Threads.threadid()], + fstar_primary_upper_threaded[Threads.threadid()]) + + fstar_secondary = (fstar_secondary_lower_threaded[Threads.threadid()], + fstar_secondary_upper_threaded[Threads.threadid()]) + + # Get index information on the small elements + small_indices = node_indices[1, mortar] + small_direction = indices2direction(small_indices) + + i_small_start, i_small_step = index_to_start_step_2d(small_indices[1], + index_range) + j_small_start, j_small_step = index_to_start_step_2d(small_indices[2], + index_range) + + for position in 1:2 + i_small = i_small_start + j_small = j_small_start + element = neighbor_ids[position, mortar] + for node in eachnode(dg) + # Get the normal direction on the small element. + # Note, contravariant vectors at interfaces in negative coordinate direction + # are pointing inwards. This is handled by `get_normal_direction`. + normal_direction = get_normal_direction(small_direction, + contravariant_vectors, + i_small, j_small, element) + + calc_mortar_flux!(fstar_primary, fstar_secondary, + mesh, nonconservative_terms, equations, + surface_integral, dg, cache, + mortar, position, normal_direction, + node) + + i_small += i_small_step + j_small += j_small_step + end + end + + # Buffer to interpolate flux values of the large element to before + # copying in the correct orientation + u_buffer = cache.u_threaded[Threads.threadid()] + + # in calc_interface_flux!, the interface flux is computed once over each + # interface using the normal from the "primary" element. The result is then + # passed back to the "secondary" element, flipping the sign to account for the + # change in the normal direction. For mortars, this sign flip occurs in + # "mortar_fluxes_to_elements!" instead. 
+ mortar_fluxes_to_elements!(surface_flux_values, + mesh, equations, mortar_l2, dg, cache, + mortar, fstar_primary, fstar_secondary, + u_buffer) + end + + return nothing +end + +# Inlined version of the mortar flux computation on small elements for conservation laws +@inline function calc_mortar_flux!(fstar_primary, fstar_secondary, + mesh::Union{P4estMesh{2}, T8codeMesh{2}}, + nonconservative_terms::False, equations, + surface_integral, dg::DG, cache, + mortar_index, position_index, normal_direction, + node_index) + @unpack u = cache.mortars + @unpack surface_flux = surface_integral + + u_ll, u_rr = get_surface_node_vars(u, equations, dg, position_index, node_index, + mortar_index) + + flux = surface_flux(u_ll, u_rr, normal_direction, equations) + + # Copy flux to buffer + set_node_vars!(fstar_primary[position_index], flux, equations, dg, node_index) + set_node_vars!(fstar_secondary[position_index], flux, equations, dg, node_index) +end + +# Inlined version of the mortar flux computation on small elements for equations with conservative and +# nonconservative terms +@inline function calc_mortar_flux!(fstar_primary, fstar_secondary, + mesh::Union{P4estMesh{2}, T8codeMesh{2}}, + nonconservative_terms::True, equations, + surface_integral, dg::DG, cache, + mortar_index, position_index, normal_direction, + node_index) + @unpack u = cache.mortars + surface_flux, nonconservative_flux = surface_integral.surface_flux + + u_ll, u_rr = get_surface_node_vars(u, equations, dg, position_index, node_index, + mortar_index) + + # Compute conservative flux + flux = surface_flux(u_ll, u_rr, normal_direction, equations) + + # Compute nonconservative flux and add it to the conservative flux. + # The nonconservative flux is scaled by a factor of 0.5 based on + # the interpretation of global SBP operators coupled discontinuously via + # central fluxes/SATs + noncons_primary = nonconservative_flux(u_ll, u_rr, normal_direction, equations) + noncons_secondary = nonconservative_flux(u_rr, u_ll, normal_direction, equations) + + flux_plus_noncons_primary = flux + 0.5f0 * noncons_primary + flux_plus_noncons_secondary = flux + 0.5f0 * noncons_secondary + + # Copy to buffer + set_node_vars!(fstar_primary[position_index], flux_plus_noncons_primary, equations, + dg, node_index) + set_node_vars!(fstar_secondary[position_index], flux_plus_noncons_secondary, + equations, dg, node_index) +end + +@inline function mortar_fluxes_to_elements!(surface_flux_values, + mesh::Union{P4estMesh{2}, T8codeMesh{2}}, + equations, + mortar_l2::LobattoLegendreMortarL2, + dg::DGSEM, cache, mortar, fstar_primary, + fstar_secondary, u_buffer) + @unpack neighbor_ids, node_indices = cache.mortars + + # Copy solution small to small + small_indices = node_indices[1, mortar] + small_direction = indices2direction(small_indices) + + for position in 1:2 + element = neighbor_ids[position, mortar] + for i in eachnode(dg) + for v in eachvariable(equations) + surface_flux_values[v, i, small_direction, element] = fstar_primary[position][v, + i] + end + end + end + + # Project small fluxes to large element. + multiply_dimensionwise!(u_buffer, + mortar_l2.reverse_upper, fstar_secondary[2], + mortar_l2.reverse_lower, fstar_secondary[1]) + + # The flux is calculated in the outward direction of the small elements, + # so the sign must be switched to get the flux in outward direction + # of the large element. 
+ # The contravariant vectors of the large element (and therefore the normal + # vectors of the large element as well) are twice as large as the + # contravariant vectors of the small elements. Therefore, the flux needs + # to be scaled by a factor of 2 to obtain the flux of the large element. + u_buffer .*= -2 + + # Copy interpolated flux values from buffer to large element face in the + # correct orientation. + # Note that the index of the small sides will always run forward but + # the index of the large side might need to run backwards for flipped sides. + large_element = neighbor_ids[3, mortar] + large_indices = node_indices[2, mortar] + large_direction = indices2direction(large_indices) + + if :i_backward in large_indices + for i in eachnode(dg) + for v in eachvariable(equations) + surface_flux_values[v, end + 1 - i, large_direction, large_element] = u_buffer[v, + i] + end + end + else + for i in eachnode(dg) + for v in eachvariable(equations) + surface_flux_values[v, i, large_direction, large_element] = u_buffer[v, + i] + end + end + end + + return nothing +end + +function calc_surface_integral!(du, u, + mesh::Union{P4estMesh{2}, T8codeMesh{2}}, + equations, + surface_integral::SurfaceIntegralWeakForm, + dg::DGSEM, cache) + @unpack boundary_interpolation = dg.basis + @unpack surface_flux_values = cache.elements + + # Note that all fluxes have been computed with outward-pointing normal vectors. + # Access the factors only once before beginning the loop to increase performance. + # We also use explicit assignments instead of `+=` to let `@muladd` turn these + # into FMAs (see comment at the top of the file). + factor_1 = boundary_interpolation[1, 1] + factor_2 = boundary_interpolation[nnodes(dg), 2] + @threaded for element in eachelement(dg, cache) + for l in eachnode(dg) + for v in eachvariable(equations) + # surface at -x + du[v, 1, l, element] = (du[v, 1, l, element] + + surface_flux_values[v, l, 1, element] * + factor_1) + + # surface at +x + du[v, nnodes(dg), l, element] = (du[v, nnodes(dg), l, element] + + surface_flux_values[v, l, 2, element] * + factor_2) + + # surface at -y + du[v, l, 1, element] = (du[v, l, 1, element] + + surface_flux_values[v, l, 3, element] * + factor_1) + + # surface at +y + du[v, l, nnodes(dg), element] = (du[v, l, nnodes(dg), element] + + surface_flux_values[v, l, 4, element] * + factor_2) + end + end + end + + return nothing +end +end # @muladd diff --git a/src/solvers/dgsem_tree/dg_2d.jl b/src/solvers/dgsem_tree/dg_2d.jl index 9a370f37bcd..01fd03a2668 100644 --- a/src/solvers/dgsem_tree/dg_2d.jl +++ b/src/solvers/dgsem_tree/dg_2d.jl @@ -34,8 +34,19 @@ function create_cache(mesh::Union{TreeMesh{2}, TreeMesh{3}}, equations, return cache end +# The methods below are specialized on the volume integral type +# and called from the basic `create_cache` method at the top. 
+function create_cache(mesh::Union{TreeMesh{2}, StructuredMesh{2}, StructuredMeshView{2}, + UnstructuredMesh2D, + P4estMesh{2}, P4estMeshView{2}, T8codeMesh{2}}, + equations, volume_integral::VolumeIntegralFluxDifferencing, + dg::DG, uEltype) + NamedTuple() +end + function create_cache(mesh::Union{TreeMesh{2}, StructuredMesh{2}, UnstructuredMesh2D, - P4estMesh{2}, T8codeMesh{2}}, equations, + P4estMesh{2}, P4estMeshView{2}, T8codeMesh{2}}, + equations, volume_integral::VolumeIntegralShockCapturingHG, dg::DG, uEltype) cache = create_cache(mesh, equations, VolumeIntegralFluxDifferencing(volume_integral.volume_flux_dg), @@ -60,7 +71,8 @@ function create_cache(mesh::Union{TreeMesh{2}, StructuredMesh{2}, UnstructuredMe end function create_cache(mesh::Union{TreeMesh{2}, StructuredMesh{2}, UnstructuredMesh2D, - P4estMesh{2}, T8codeMesh{2}}, equations, + P4estMesh{2}, P4estMeshView{2}, T8codeMesh{2}}, + equations, volume_integral::VolumeIntegralPureLGLFiniteVolume, dg::DG, uEltype) A3dp1_x = Array{uEltype, 3} @@ -83,8 +95,10 @@ end # The methods below are specialized on the mortar type # and called from the basic `create_cache` method at the top. -function create_cache(mesh::TreeMesh{2}, equations, - mortar_l2::LobattoLegendreMortarL2, uEltype) +function create_cache(mesh::Union{TreeMesh{2}, StructuredMesh{2}, UnstructuredMesh2D, + P4estMesh{2}, P4estMeshView{2}, P4estMeshView{2}, + T8codeMesh{2}}, + equations, mortar_l2::LobattoLegendreMortarL2, uEltype) # TODO: Taal performance using different types MA2d = MArray{Tuple{nvariables(equations), nnodes(mortar_l2)}, uEltype, 2, @@ -176,6 +190,92 @@ function rhs!(du, u, t, return nothing end +function rhs!(du, u, t, u_global, semis, + mesh::P4estMeshView{2}, + equations, + boundary_conditions, source_terms::Source, + dg::DG, cache) where {Source} + # Reset du + @trixi_timeit timer() "reset ∂u/∂t" reset_du!(du, dg, cache) + + # Calculate volume integral + @trixi_timeit timer() "volume integral" begin + calc_volume_integral!(du, u, mesh, + have_nonconservative_terms(equations), equations, + dg.volume_integral, dg, cache) + end + + # Prolong solution to interfaces + @trixi_timeit timer() "prolong2interfaces" begin + prolong2interfaces!(cache, u, mesh, equations, dg) + end + + # Calculate interface fluxes + @trixi_timeit timer() "interface flux" begin + calc_interface_flux!(cache.elements.surface_flux_values, mesh, + have_nonconservative_terms(equations), equations, + dg.surface_integral, dg, cache) + end + + # Prolong solution to boundaries + @trixi_timeit timer() "prolong2boundaries" begin + prolong2boundaries!(cache, u, u_global, semis, mesh, equations, + dg.surface_integral, dg) + end + + # Calculate boundary fluxes + @trixi_timeit timer() "boundary flux" begin + calc_boundary_flux!(cache, t, boundary_conditions, mesh, equations, + dg.surface_integral, dg, u_global, semis) + end + + # Prolong solution to mortars + @trixi_timeit timer() "prolong2mortars" begin + prolong2mortars!(cache, u, mesh, equations, + dg.mortar, dg) + end + + # Calculate mortar fluxes + @trixi_timeit timer() "mortar flux" begin + calc_mortar_flux!(cache.elements.surface_flux_values, mesh, + have_nonconservative_terms(equations), equations, + dg.mortar, dg.surface_integral, dg, cache) + end + + # Calculate surface integrals + @trixi_timeit timer() "surface integral" begin + calc_surface_integral!(du, u, mesh, equations, + dg.surface_integral, dg, cache) + end + + # Apply Jacobian from mapping to reference element + @trixi_timeit timer() "Jacobian" apply_jacobian!(du, mesh, 
equations, dg, cache) + + # Calculate source terms + @trixi_timeit timer() "source terms" begin + calc_sources!(du, u, t, source_terms, equations, dg, cache) + end + + return nothing +end + +function calc_volume_integral!(du, u, + mesh::Union{TreeMesh{2}, StructuredMesh{2}, + StructuredMeshView{2}, UnstructuredMesh2D, + P4estMesh{2}, P4estMeshView{2}, + T8codeMesh{2}}, + have_nonconservative_terms, equations, + volume_integral::VolumeIntegralWeakForm, + dg::DGSEM, cache) + @threaded for element in eachelement(dg, cache) + weak_form_kernel!(du, u, element, mesh, + have_nonconservative_terms, equations, + dg, cache) + end + + return nothing +end + #= `weak_form_kernel!` is only implemented for conserved terms as non-conservative terms should always be discretized in conjunction with a flux-splitting scheme, @@ -211,6 +311,36 @@ See also https://github.com/trixi-framework/Trixi.jl/issues/1671#issuecomment-17 return nothing end +# flux differencing volume integral. For curved meshes averaging of the +# mapping terms, stored in `cache.elements.contravariant_vectors`, is peeled apart +# from the evaluation of the physical fluxes in each Cartesian direction +function calc_volume_integral!(du, u, + mesh::Union{TreeMesh{2}, StructuredMesh{2}, + StructuredMeshView{2}, + UnstructuredMesh2D, P4estMesh{2}, + T8codeMesh{2}}, + nonconservative_terms, equations, + volume_integral::VolumeIntegralFluxDifferencing, + dg::DGSEM, cache) + @threaded for element in eachelement(dg, cache) + flux_differencing_kernel!(du, u, element, mesh, + nonconservative_terms, equations, + volume_integral.volume_flux, dg, cache) + end +end + +function calc_volume_integral!(du, u, + mesh::P4estMeshView{2}, + nonconservative_terms, equations, + volume_integral::VolumeIntegralFluxDifferencing, + dg::DGSEM, cache) + @threaded for element in eachelement(dg, cache) + flux_differencing_kernel!(du, u, element, mesh.parent, + nonconservative_terms, equations, + volume_integral.volume_flux, dg, cache) + end +end + @inline function flux_differencing_kernel!(du, u, element, mesh::TreeMesh{2}, have_nonconservative_terms::False, equations, diff --git a/src/solvers/dgsem_tree/dg_2d.jl.bkp b/src/solvers/dgsem_tree/dg_2d.jl.bkp new file mode 100644 index 00000000000..98bd0889206 --- /dev/null +++ b/src/solvers/dgsem_tree/dg_2d.jl.bkp @@ -0,0 +1,1215 @@ +# By default, Julia/LLVM does not use fused multiply-add operations (FMAs). +# Since these FMAs can increase the performance of many numerical algorithms, +# we need to opt-in explicitly. +# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details. +@muladd begin +#! format: noindent + +# everything related to a DG semidiscretization in 2D, +# currently limited to Lobatto-Legendre nodes + +# This method is called when a SemidiscretizationHyperbolic is constructed. +# It constructs the basic `cache` used throughout the simulation to compute +# the RHS etc. +function create_cache(mesh::TreeMesh{2}, equations, + dg::DG, RealT, uEltype) + # Get cells for which an element needs to be created (i.e. all leaf cells) + leaf_cell_ids = local_leaf_cells(mesh.tree) + + elements = init_elements(leaf_cell_ids, mesh, equations, dg.basis, RealT, uEltype) + + interfaces = init_interfaces(leaf_cell_ids, mesh, elements) + + boundaries = init_boundaries(leaf_cell_ids, mesh, elements) + + mortars = init_mortars(leaf_cell_ids, mesh, elements, dg.mortar) + + cache = (; elements, interfaces, boundaries, mortars) + + # Add specialized parts of the cache required to compute the volume integral etc. 
+ cache = (; cache..., + create_cache(mesh, equations, dg.volume_integral, dg, uEltype)...) + cache = (; cache..., create_cache(mesh, equations, dg.mortar, uEltype)...) + + return cache +end + +# The methods below are specialized on the volume integral type +# and called from the basic `create_cache` method at the top. +function create_cache(mesh::Union{TreeMesh{2}, StructuredMesh{2}, StructuredMeshView{2}, + UnstructuredMesh2D, + P4estMesh{2}, T8codeMesh{2}}, + equations, volume_integral::VolumeIntegralFluxDifferencing, + dg::DG, uEltype) + NamedTuple() +end + +function create_cache(mesh::Union{TreeMesh{2}, StructuredMesh{2}, UnstructuredMesh2D, + P4estMesh{2}, T8codeMesh{2}}, equations, + volume_integral::VolumeIntegralShockCapturingHG, dg::DG, uEltype) + cache = create_cache(mesh, equations, + VolumeIntegralFluxDifferencing(volume_integral.volume_flux_dg), + dg, uEltype) + + A3dp1_x = Array{uEltype, 3} + A3dp1_y = Array{uEltype, 3} + + fstar1_L_threaded = A3dp1_x[A3dp1_x(undef, nvariables(equations), nnodes(dg) + 1, + nnodes(dg)) for _ in 1:Threads.nthreads()] + fstar1_R_threaded = A3dp1_x[A3dp1_x(undef, nvariables(equations), nnodes(dg) + 1, + nnodes(dg)) for _ in 1:Threads.nthreads()] + fstar2_L_threaded = A3dp1_y[A3dp1_y(undef, nvariables(equations), nnodes(dg), + nnodes(dg) + 1) for _ in 1:Threads.nthreads()] + fstar2_R_threaded = A3dp1_y[A3dp1_y(undef, nvariables(equations), nnodes(dg), + nnodes(dg) + 1) for _ in 1:Threads.nthreads()] + + return (; cache..., + fstar1_L_threaded, fstar1_R_threaded, fstar2_L_threaded, fstar2_R_threaded) +end + +function create_cache(mesh::Union{TreeMesh{2}, StructuredMesh{2}, UnstructuredMesh2D, + P4estMesh{2}, T8codeMesh{2}}, equations, + volume_integral::VolumeIntegralPureLGLFiniteVolume, dg::DG, + uEltype) + A3dp1_x = Array{uEltype, 3} + A3dp1_y = Array{uEltype, 3} + + fstar1_L_threaded = A3dp1_x[A3dp1_x(undef, nvariables(equations), nnodes(dg) + 1, + nnodes(dg)) for _ in 1:Threads.nthreads()] + fstar1_R_threaded = A3dp1_x[A3dp1_x(undef, nvariables(equations), nnodes(dg) + 1, + nnodes(dg)) for _ in 1:Threads.nthreads()] + fstar2_L_threaded = A3dp1_y[A3dp1_y(undef, nvariables(equations), nnodes(dg), + nnodes(dg) + 1) for _ in 1:Threads.nthreads()] + fstar2_R_threaded = A3dp1_y[A3dp1_y(undef, nvariables(equations), nnodes(dg), + nnodes(dg) + 1) for _ in 1:Threads.nthreads()] + + return (; fstar1_L_threaded, fstar1_R_threaded, fstar2_L_threaded, + fstar2_R_threaded) +end + +# The methods below are specialized on the mortar type +# and called from the basic `create_cache` method at the top. 
+function create_cache(mesh::Union{TreeMesh{2}, StructuredMesh{2}, UnstructuredMesh2D, + P4estMesh{2}, T8codeMesh{2}}, + equations, mortar_l2::LobattoLegendreMortarL2, uEltype) + # TODO: Taal performance using different types + MA2d = MArray{Tuple{nvariables(equations), nnodes(mortar_l2)}, uEltype, 2, + nvariables(equations) * nnodes(mortar_l2)} + fstar_primary_upper_threaded = MA2d[MA2d(undef) for _ in 1:Threads.nthreads()] + fstar_primary_lower_threaded = MA2d[MA2d(undef) for _ in 1:Threads.nthreads()] + fstar_secondary_upper_threaded = MA2d[MA2d(undef) for _ in 1:Threads.nthreads()] + fstar_secondary_lower_threaded = MA2d[MA2d(undef) for _ in 1:Threads.nthreads()] + + # A2d = Array{uEltype, 2} + # fstar_upper_threaded = [A2d(undef, nvariables(equations), nnodes(mortar_l2)) for _ in 1:Threads.nthreads()] + # fstar_lower_threaded = [A2d(undef, nvariables(equations), nnodes(mortar_l2)) for _ in 1:Threads.nthreads()] + + (; fstar_primary_upper_threaded, fstar_primary_lower_threaded, + fstar_secondary_upper_threaded, fstar_secondary_lower_threaded) +end + +# TODO: Taal discuss/refactor timer, allowing users to pass a custom timer? + +function rhs!(du, u, t, + mesh::Union{TreeMesh{2}, P4estMesh{2}, T8codeMesh{2}}, equations, + boundary_conditions, source_terms::Source, + dg::DG, cache) where {Source} + # Reset du + @trixi_timeit timer() "reset ∂u/∂t" reset_du!(du, dg, cache) + + # Calculate volume integral + @trixi_timeit timer() "volume integral" begin + calc_volume_integral!(du, u, mesh, + have_nonconservative_terms(equations), equations, + dg.volume_integral, dg, cache) + end + + # Prolong solution to interfaces + @trixi_timeit timer() "prolong2interfaces" begin + prolong2interfaces!(cache, u, mesh, equations, + dg.surface_integral, dg) + end + + # Calculate interface fluxes + @trixi_timeit timer() "interface flux" begin + calc_interface_flux!(cache.elements.surface_flux_values, mesh, + have_nonconservative_terms(equations), equations, + dg.surface_integral, dg, cache) + end + + # Prolong solution to boundaries + @trixi_timeit timer() "prolong2boundaries" begin + prolong2boundaries!(cache, u, mesh, equations, + dg.surface_integral, dg) + end + + # Calculate boundary fluxes + @trixi_timeit timer() "boundary flux" begin + calc_boundary_flux!(cache, t, boundary_conditions, mesh, equations, + dg.surface_integral, dg) + end + + # Prolong solution to mortars + @trixi_timeit timer() "prolong2mortars" begin + prolong2mortars!(cache, u, mesh, equations, + dg.mortar, dg) + end + + # Calculate mortar fluxes + @trixi_timeit timer() "mortar flux" begin + calc_mortar_flux!(cache.elements.surface_flux_values, mesh, + have_nonconservative_terms(equations), equations, + dg.mortar, dg.surface_integral, dg, cache) + end + + # Calculate surface integrals + @trixi_timeit timer() "surface integral" begin + calc_surface_integral!(du, u, mesh, equations, + dg.surface_integral, dg, cache) + end + + # Apply Jacobian from mapping to reference element + @trixi_timeit timer() "Jacobian" apply_jacobian!(du, mesh, equations, dg, cache) + + # Calculate source terms + @trixi_timeit timer() "source terms" begin + calc_sources!(du, u, t, source_terms, equations, dg, cache) + end + + return nothing +end + +function calc_volume_integral!(du, u, + mesh::Union{TreeMesh{2}, StructuredMesh{2}, + StructuredMeshView{2}, UnstructuredMesh2D, + P4estMesh{2}, T8codeMesh{2}}, + nonconservative_terms, equations, + volume_integral::VolumeIntegralWeakForm, + dg::DGSEM, cache) + @threaded for element in eachelement(dg, cache) + 
weak_form_kernel!(du, u, element, mesh, + nonconservative_terms, equations, + dg, cache) + end + + return nothing +end + +#= +`weak_form_kernel!` is only implemented for conserved terms as +non-conservative terms should always be discretized in conjunction with a flux-splitting scheme, +see `flux_differencing_kernel!`. +This treatment is required to achieve, e.g., entropy-stability or well-balancedness. +See also https://github.com/trixi-framework/Trixi.jl/issues/1671#issuecomment-1765644064 +=# +@inline function weak_form_kernel!(du, u, + element, mesh::TreeMesh{2}, + nonconservative_terms::False, equations, + dg::DGSEM, cache, alpha = true) + # true * [some floating point value] == [exactly the same floating point value] + # This can (hopefully) be optimized away due to constant propagation. + @unpack derivative_dhat = dg.basis + + # Calculate volume terms in one element + for j in eachnode(dg), i in eachnode(dg) + u_node = get_node_vars(u, equations, dg, i, j, element) + + flux1 = flux(u_node, 1, equations) + for ii in eachnode(dg) + multiply_add_to_node_vars!(du, alpha * derivative_dhat[ii, i], flux1, + equations, dg, ii, j, element) + end + + flux2 = flux(u_node, 2, equations) + for jj in eachnode(dg) + multiply_add_to_node_vars!(du, alpha * derivative_dhat[jj, j], flux2, + equations, dg, i, jj, element) + end + end + + return nothing +end + +# flux differencing volume integral. For curved meshes averaging of the +# mapping terms, stored in `cache.elements.contravariant_vectors`, is peeled apart +# from the evaluation of the physical fluxes in each Cartesian direction +function calc_volume_integral!(du, u, + mesh::Union{TreeMesh{2}, StructuredMesh{2}, + StructuredMeshView{2}, + UnstructuredMesh2D, P4estMesh{2}, + T8codeMesh{2}}, + nonconservative_terms, equations, + volume_integral::VolumeIntegralFluxDifferencing, + dg::DGSEM, cache) + @threaded for element in eachelement(dg, cache) + flux_differencing_kernel!(du, u, element, mesh, + nonconservative_terms, equations, + volume_integral.volume_flux, dg, cache) + end +end + +@inline function flux_differencing_kernel!(du, u, + element, mesh::TreeMesh{2}, + nonconservative_terms::False, equations, + volume_flux, dg::DGSEM, cache, alpha = true) + # true * [some floating point value] == [exactly the same floating point value] + # This can (hopefully) be optimized away due to constant propagation. + @unpack derivative_split = dg.basis + + # Calculate volume integral in one element + for j in eachnode(dg), i in eachnode(dg) + u_node = get_node_vars(u, equations, dg, i, j, element) + + # All diagonal entries of `derivative_split` are zero. Thus, we can skip + # the computation of the diagonal terms. In addition, we use the symmetry + # of the `volume_flux` to save half of the possible two-point flux + # computations. 
+ + # x direction + for ii in (i + 1):nnodes(dg) + u_node_ii = get_node_vars(u, equations, dg, ii, j, element) + flux1 = volume_flux(u_node, u_node_ii, 1, equations) + multiply_add_to_node_vars!(du, alpha * derivative_split[i, ii], flux1, + equations, dg, i, j, element) + multiply_add_to_node_vars!(du, alpha * derivative_split[ii, i], flux1, + equations, dg, ii, j, element) + end + + # y direction + for jj in (j + 1):nnodes(dg) + u_node_jj = get_node_vars(u, equations, dg, i, jj, element) + flux2 = volume_flux(u_node, u_node_jj, 2, equations) + multiply_add_to_node_vars!(du, alpha * derivative_split[j, jj], flux2, + equations, dg, i, j, element) + multiply_add_to_node_vars!(du, alpha * derivative_split[jj, j], flux2, + equations, dg, i, jj, element) + end + end +end + +@inline function flux_differencing_kernel!(du, u, + element, mesh::TreeMesh{2}, + nonconservative_terms::True, equations, + volume_flux, dg::DGSEM, cache, alpha = true) + # true * [some floating point value] == [exactly the same floating point value] + # This can (hopefully) be optimized away due to constant propagation. + @unpack derivative_split = dg.basis + symmetric_flux, nonconservative_flux = volume_flux + + # Apply the symmetric flux as usual + flux_differencing_kernel!(du, u, element, mesh, False(), equations, symmetric_flux, + dg, cache, alpha) + + # Calculate the remaining volume terms using the nonsymmetric generalized flux + for j in eachnode(dg), i in eachnode(dg) + u_node = get_node_vars(u, equations, dg, i, j, element) + + # The diagonal terms are zero since the diagonal of `derivative_split` + # is zero. We ignore this for now. + + # x direction + integral_contribution = zero(u_node) + for ii in eachnode(dg) + u_node_ii = get_node_vars(u, equations, dg, ii, j, element) + noncons_flux1 = nonconservative_flux(u_node, u_node_ii, 1, equations) + integral_contribution = integral_contribution + + derivative_split[i, ii] * noncons_flux1 + end + + # y direction + for jj in eachnode(dg) + u_node_jj = get_node_vars(u, equations, dg, i, jj, element) + noncons_flux2 = nonconservative_flux(u_node, u_node_jj, 2, equations) + integral_contribution = integral_contribution + + derivative_split[j, jj] * noncons_flux2 + end + + # The factor 0.5 cancels the factor 2 in the flux differencing form + multiply_add_to_node_vars!(du, alpha * 0.5f0, integral_contribution, equations, + dg, i, j, element) + end +end + +# TODO: Taal dimension agnostic +function calc_volume_integral!(du, u, + mesh::Union{TreeMesh{2}, StructuredMesh{2}, + UnstructuredMesh2D, P4estMesh{2}, + T8codeMesh{2}}, + nonconservative_terms, equations, + volume_integral::VolumeIntegralShockCapturingHG, + dg::DGSEM, cache) + @unpack volume_flux_dg, volume_flux_fv, indicator = volume_integral + + # Calculate blending factors α: u = u_DG * (1 - α) + u_FV * α + alpha = @trixi_timeit timer() "blending factors" indicator(u, mesh, equations, dg, + cache) + + # For `Float64`, this gives 1.8189894035458565e-12 + # For `Float32`, this gives 1.1920929f-5 + RealT = eltype(alpha) + atol = max(100 * eps(RealT), eps(RealT)^convert(RealT, 0.75f0)) + @threaded for element in eachelement(dg, cache) + alpha_element = alpha[element] + # Clip blending factor for values close to zero (-> pure DG) + dg_only = isapprox(alpha_element, 0, atol = atol) + + if dg_only + flux_differencing_kernel!(du, u, element, mesh, + nonconservative_terms, equations, + volume_flux_dg, dg, cache) + else + # Calculate DG volume integral contribution + flux_differencing_kernel!(du, u, element, mesh, + 
nonconservative_terms, equations, + volume_flux_dg, dg, cache, 1 - alpha_element) + + # Calculate FV volume integral contribution + fv_kernel!(du, u, mesh, nonconservative_terms, equations, volume_flux_fv, + dg, cache, element, alpha_element) + end + end + + return nothing +end + +# TODO: Taal dimension agnostic +function calc_volume_integral!(du, u, + mesh::Union{TreeMesh{2}, StructuredMesh{2}, + UnstructuredMesh2D, P4estMesh{2}, + T8codeMesh{2}}, + nonconservative_terms, equations, + volume_integral::VolumeIntegralPureLGLFiniteVolume, + dg::DGSEM, cache) + @unpack volume_flux_fv = volume_integral + + # Calculate LGL FV volume integral + @threaded for element in eachelement(dg, cache) + fv_kernel!(du, u, mesh, nonconservative_terms, equations, volume_flux_fv, + dg, cache, element, true) + end + + return nothing +end + +@inline function fv_kernel!(du, u, + mesh::Union{TreeMesh{2}, StructuredMesh{2}, + UnstructuredMesh2D, P4estMesh{2}, + T8codeMesh{2}}, + nonconservative_terms, equations, + volume_flux_fv, dg::DGSEM, cache, element, alpha = true) + @unpack fstar1_L_threaded, fstar1_R_threaded, fstar2_L_threaded, fstar2_R_threaded = cache + @unpack inverse_weights = dg.basis + + # Calculate FV two-point fluxes + fstar1_L = fstar1_L_threaded[Threads.threadid()] + fstar2_L = fstar2_L_threaded[Threads.threadid()] + fstar1_R = fstar1_R_threaded[Threads.threadid()] + fstar2_R = fstar2_R_threaded[Threads.threadid()] + calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, mesh, + nonconservative_terms, equations, volume_flux_fv, dg, element, cache) + + # Calculate FV volume integral contribution + for j in eachnode(dg), i in eachnode(dg) + for v in eachvariable(equations) + du[v, i, j, element] += (alpha * + (inverse_weights[i] * + (fstar1_L[v, i + 1, j] - fstar1_R[v, i, j]) + + inverse_weights[j] * + (fstar2_L[v, i, j + 1] - fstar2_R[v, i, j]))) + end + end + + return nothing +end + +# calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, u_leftright, +# nonconservative_terms::False, equations, +# volume_flux_fv, dg, element) +# +# Calculate the finite volume fluxes inside the elements (**without non-conservative terms**). 
+# +# # Arguments +# - `fstar1_L::AbstractArray{<:Real, 3}` +# - `fstar1_R::AbstractArray{<:Real, 3}` +# - `fstar2_L::AbstractArray{<:Real, 3}` +# - `fstar2_R::AbstractArray{<:Real, 3}` +@inline function calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, + u::AbstractArray{<:Any, 4}, + mesh::TreeMesh{2}, nonconservative_terms::False, + equations, + volume_flux_fv, dg::DGSEM, element, cache) + fstar1_L[:, 1, :] .= zero(eltype(fstar1_L)) + fstar1_L[:, nnodes(dg) + 1, :] .= zero(eltype(fstar1_L)) + fstar1_R[:, 1, :] .= zero(eltype(fstar1_R)) + fstar1_R[:, nnodes(dg) + 1, :] .= zero(eltype(fstar1_R)) + + for j in eachnode(dg), i in 2:nnodes(dg) + u_ll = get_node_vars(u, equations, dg, i - 1, j, element) + u_rr = get_node_vars(u, equations, dg, i, j, element) + flux = volume_flux_fv(u_ll, u_rr, 1, equations) # orientation 1: x direction + set_node_vars!(fstar1_L, flux, equations, dg, i, j) + set_node_vars!(fstar1_R, flux, equations, dg, i, j) + end + + fstar2_L[:, :, 1] .= zero(eltype(fstar2_L)) + fstar2_L[:, :, nnodes(dg) + 1] .= zero(eltype(fstar2_L)) + fstar2_R[:, :, 1] .= zero(eltype(fstar2_R)) + fstar2_R[:, :, nnodes(dg) + 1] .= zero(eltype(fstar2_R)) + + for j in 2:nnodes(dg), i in eachnode(dg) + u_ll = get_node_vars(u, equations, dg, i, j - 1, element) + u_rr = get_node_vars(u, equations, dg, i, j, element) + flux = volume_flux_fv(u_ll, u_rr, 2, equations) # orientation 2: y direction + set_node_vars!(fstar2_L, flux, equations, dg, i, j) + set_node_vars!(fstar2_R, flux, equations, dg, i, j) + end + + return nothing +end + +# calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, u_leftright, +# nonconservative_terms::True, equations, +# volume_flux_fv, dg, element) +# +# Calculate the finite volume fluxes inside the elements (**with non-conservative terms**). 
+# +# # Arguments +# - `fstar1_L::AbstractArray{<:Real, 3}`: +# - `fstar1_R::AbstractArray{<:Real, 3}`: +# - `fstar2_L::AbstractArray{<:Real, 3}`: +# - `fstar2_R::AbstractArray{<:Real, 3}`: +# - `u_leftright::AbstractArray{<:Real, 4}` +@inline function calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, + u::AbstractArray{<:Any, 4}, + mesh::TreeMesh{2}, nonconservative_terms::True, equations, + volume_flux_fv, dg::DGSEM, element, cache) + volume_flux, nonconservative_flux = volume_flux_fv + + # Fluxes in x + fstar1_L[:, 1, :] .= zero(eltype(fstar1_L)) + fstar1_L[:, nnodes(dg) + 1, :] .= zero(eltype(fstar1_L)) + fstar1_R[:, 1, :] .= zero(eltype(fstar1_R)) + fstar1_R[:, nnodes(dg) + 1, :] .= zero(eltype(fstar1_R)) + + for j in eachnode(dg), i in 2:nnodes(dg) + u_ll = get_node_vars(u, equations, dg, i - 1, j, element) + u_rr = get_node_vars(u, equations, dg, i, j, element) + + # Compute conservative part + f1 = volume_flux(u_ll, u_rr, 1, equations) # orientation 1: x direction + + # Compute nonconservative part + # Note the factor 0.5 necessary for the nonconservative fluxes based on + # the interpretation of global SBP operators coupled discontinuously via + # central fluxes/SATs + f1_L = f1 + 0.5f0 * nonconservative_flux(u_ll, u_rr, 1, equations) + f1_R = f1 + 0.5f0 * nonconservative_flux(u_rr, u_ll, 1, equations) + + # Copy to temporary storage + set_node_vars!(fstar1_L, f1_L, equations, dg, i, j) + set_node_vars!(fstar1_R, f1_R, equations, dg, i, j) + end + + # Fluxes in y + fstar2_L[:, :, 1] .= zero(eltype(fstar2_L)) + fstar2_L[:, :, nnodes(dg) + 1] .= zero(eltype(fstar2_L)) + fstar2_R[:, :, 1] .= zero(eltype(fstar2_R)) + fstar2_R[:, :, nnodes(dg) + 1] .= zero(eltype(fstar2_R)) + + # Compute inner fluxes + for j in 2:nnodes(dg), i in eachnode(dg) + u_ll = get_node_vars(u, equations, dg, i, j - 1, element) + u_rr = get_node_vars(u, equations, dg, i, j, element) + + # Compute conservative part + f2 = volume_flux(u_ll, u_rr, 2, equations) # orientation 2: y direction + + # Compute nonconservative part + # Note the factor 0.5 necessary for the nonconservative fluxes based on + # the interpretation of global SBP operators coupled discontinuously via + # central fluxes/SATs + f2_L = f2 + 0.5f0 * nonconservative_flux(u_ll, u_rr, 2, equations) + f2_R = f2 + 0.5f0 * nonconservative_flux(u_rr, u_ll, 2, equations) + + # Copy to temporary storage + set_node_vars!(fstar2_L, f2_L, equations, dg, i, j) + set_node_vars!(fstar2_R, f2_R, equations, dg, i, j) + end + + return nothing +end + +function prolong2interfaces!(cache, u, + mesh::TreeMesh{2}, equations, surface_integral, dg::DG) + @unpack interfaces = cache + @unpack orientations, neighbor_ids = interfaces + interfaces_u = interfaces.u + + @threaded for interface in eachinterface(dg, cache) + left_element = neighbor_ids[1, interface] + right_element = neighbor_ids[2, interface] + + if orientations[interface] == 1 + # interface in x-direction + for j in eachnode(dg), v in eachvariable(equations) + interfaces_u[1, v, j, interface] = u[v, nnodes(dg), j, left_element] + interfaces_u[2, v, j, interface] = u[v, 1, j, right_element] + end + else # if orientations[interface] == 2 + # interface in y-direction + for i in eachnode(dg), v in eachvariable(equations) + interfaces_u[1, v, i, interface] = u[v, i, nnodes(dg), left_element] + interfaces_u[2, v, i, interface] = u[v, i, 1, right_element] + end + end + end + + return nothing +end + +function calc_interface_flux!(surface_flux_values, + mesh::TreeMesh{2}, + nonconservative_terms::False, equations, + 
surface_integral, dg::DG, cache) + @unpack surface_flux = surface_integral + @unpack u, neighbor_ids, orientations = cache.interfaces + + @threaded for interface in eachinterface(dg, cache) + # Get neighboring elements + left_id = neighbor_ids[1, interface] + right_id = neighbor_ids[2, interface] + + # Determine interface direction with respect to elements: + # orientation = 1: left -> 2, right -> 1 + # orientation = 2: left -> 4, right -> 3 + left_direction = 2 * orientations[interface] + right_direction = 2 * orientations[interface] - 1 + + for i in eachnode(dg) + # Call pointwise Riemann solver + u_ll, u_rr = get_surface_node_vars(u, equations, dg, i, interface) + flux = surface_flux(u_ll, u_rr, orientations[interface], equations) + + # Copy flux to left and right element storage + for v in eachvariable(equations) + surface_flux_values[v, i, left_direction, left_id] = flux[v] + surface_flux_values[v, i, right_direction, right_id] = flux[v] + end + end + end + + return nothing +end + +function calc_interface_flux!(surface_flux_values, + mesh::TreeMesh{2}, + nonconservative_terms::True, equations, + surface_integral, dg::DG, cache) + surface_flux, nonconservative_flux = surface_integral.surface_flux + @unpack u, neighbor_ids, orientations = cache.interfaces + + @threaded for interface in eachinterface(dg, cache) + # Get neighboring elements + left_id = neighbor_ids[1, interface] + right_id = neighbor_ids[2, interface] + + # Determine interface direction with respect to elements: + # orientation = 1: left -> 2, right -> 1 + # orientation = 2: left -> 4, right -> 3 + left_direction = 2 * orientations[interface] + right_direction = 2 * orientations[interface] - 1 + + for i in eachnode(dg) + # Call pointwise Riemann solver + orientation = orientations[interface] + u_ll, u_rr = get_surface_node_vars(u, equations, dg, i, interface) + flux = surface_flux(u_ll, u_rr, orientation, equations) + + # Compute both nonconservative fluxes + noncons_left = nonconservative_flux(u_ll, u_rr, orientation, equations) + noncons_right = nonconservative_flux(u_rr, u_ll, orientation, equations) + + # Copy flux to left and right element storage + for v in eachvariable(equations) + # Note the factor 0.5 necessary for the nonconservative fluxes based on + # the interpretation of global SBP operators coupled discontinuously via + # central fluxes/SATs + surface_flux_values[v, i, left_direction, left_id] = flux[v] + + 0.5f0 * + noncons_left[v] + surface_flux_values[v, i, right_direction, right_id] = flux[v] + + 0.5f0 * + noncons_right[v] + end + end + end + + return nothing +end + +function prolong2boundaries!(cache, u, + mesh::TreeMesh{2}, equations, surface_integral, dg::DG) + @unpack boundaries = cache + @unpack orientations, neighbor_sides = boundaries + + @threaded for boundary in eachboundary(dg, cache) + element = boundaries.neighbor_ids[boundary] + + if orientations[boundary] == 1 + # boundary in x-direction + if neighbor_sides[boundary] == 1 + # element in -x direction of boundary + for l in eachnode(dg), v in eachvariable(equations) + boundaries.u[1, v, l, boundary] = u[v, nnodes(dg), l, element] + end + else # Element in +x direction of boundary + for l in eachnode(dg), v in eachvariable(equations) + boundaries.u[2, v, l, boundary] = u[v, 1, l, element] + end + end + else # if orientations[boundary] == 2 + # boundary in y-direction + if neighbor_sides[boundary] == 1 + # element in -y direction of boundary + for l in eachnode(dg), v in eachvariable(equations) + boundaries.u[1, v, l, boundary] = u[v, l, 
nnodes(dg), element] + end + else + # element in +y direction of boundary + for l in eachnode(dg), v in eachvariable(equations) + boundaries.u[2, v, l, boundary] = u[v, l, 1, element] + end + end + end + end + + return nothing +end + +# TODO: Taal dimension agnostic +function calc_boundary_flux!(cache, t, boundary_condition::BoundaryConditionPeriodic, + mesh::TreeMesh{2}, equations, surface_integral, dg::DG) + @assert isempty(eachboundary(dg, cache)) +end + +function calc_boundary_flux!(cache, t, boundary_conditions::NamedTuple, + mesh::TreeMesh{2}, equations, surface_integral, dg::DG) + @unpack surface_flux_values = cache.elements + @unpack n_boundaries_per_direction = cache.boundaries + + # Calculate indices + lasts = accumulate(+, n_boundaries_per_direction) + firsts = lasts - n_boundaries_per_direction .+ 1 + + # Calc boundary fluxes in each direction + calc_boundary_flux_by_direction!(surface_flux_values, t, boundary_conditions[1], + have_nonconservative_terms(equations), + equations, surface_integral, dg, cache, + 1, firsts[1], lasts[1]) + calc_boundary_flux_by_direction!(surface_flux_values, t, boundary_conditions[2], + have_nonconservative_terms(equations), + equations, surface_integral, dg, cache, + 2, firsts[2], lasts[2]) + calc_boundary_flux_by_direction!(surface_flux_values, t, boundary_conditions[3], + have_nonconservative_terms(equations), + equations, surface_integral, dg, cache, + 3, firsts[3], lasts[3]) + calc_boundary_flux_by_direction!(surface_flux_values, t, boundary_conditions[4], + have_nonconservative_terms(equations), + equations, surface_integral, dg, cache, + 4, firsts[4], lasts[4]) +end + +function calc_boundary_flux_by_direction!(surface_flux_values::AbstractArray{<:Any, 4}, + t, + boundary_condition, + nonconservative_terms::False, equations, + surface_integral, dg::DG, cache, + direction, first_boundary, last_boundary) + @unpack surface_flux = surface_integral + @unpack u, neighbor_ids, neighbor_sides, node_coordinates, orientations = cache.boundaries + + @threaded for boundary in first_boundary:last_boundary + # Get neighboring element + neighbor = neighbor_ids[boundary] + + for i in eachnode(dg) + # Get boundary flux + u_ll, u_rr = get_surface_node_vars(u, equations, dg, i, boundary) + if neighbor_sides[boundary] == 1 # Element is on the left, boundary on the right + u_inner = u_ll + else # Element is on the right, boundary on the left + u_inner = u_rr + end + x = get_node_coords(node_coordinates, equations, dg, i, boundary) + flux = boundary_condition(u_inner, orientations[boundary], direction, x, t, + surface_flux, + equations) + + # Copy flux to left and right element storage + for v in eachvariable(equations) + surface_flux_values[v, i, direction, neighbor] = flux[v] + end + end + end + + return nothing +end + +function calc_boundary_flux_by_direction!(surface_flux_values::AbstractArray{<:Any, 4}, + t, + boundary_condition, + nonconservative_terms::True, equations, + surface_integral, dg::DG, cache, + direction, first_boundary, last_boundary) + surface_flux, nonconservative_flux = surface_integral.surface_flux + @unpack u, neighbor_ids, neighbor_sides, node_coordinates, orientations = cache.boundaries + + @threaded for boundary in first_boundary:last_boundary + # Get neighboring element + neighbor = neighbor_ids[boundary] + + for i in eachnode(dg) + # Get boundary flux + u_ll, u_rr = get_surface_node_vars(u, equations, dg, i, boundary) + if neighbor_sides[boundary] == 1 # Element is on the left, boundary on the right + u_inner = u_ll + else # Element is 
on the right, boundary on the left + u_inner = u_rr + end + x = get_node_coords(node_coordinates, equations, dg, i, boundary) + flux = boundary_condition(u_inner, orientations[boundary], direction, x, t, + surface_flux, + equations) + noncons_flux = boundary_condition(u_inner, orientations[boundary], + direction, x, t, nonconservative_flux, + equations) + + # Copy flux to left and right element storage + for v in eachvariable(equations) + surface_flux_values[v, i, direction, neighbor] = flux[v] + + 0.5f0 * noncons_flux[v] + end + end + end + + return nothing +end + +function prolong2mortars!(cache, u, + mesh::TreeMesh{2}, equations, + mortar_l2::LobattoLegendreMortarL2, + dg::DGSEM) + @threaded for mortar in eachmortar(dg, cache) + large_element = cache.mortars.neighbor_ids[3, mortar] + upper_element = cache.mortars.neighbor_ids[2, mortar] + lower_element = cache.mortars.neighbor_ids[1, mortar] + + # Copy solution small to small + if cache.mortars.large_sides[mortar] == 1 # -> small elements on right side + if cache.mortars.orientations[mortar] == 1 + # L2 mortars in x-direction + for l in eachnode(dg) + for v in eachvariable(equations) + cache.mortars.u_upper[2, v, l, mortar] = u[v, 1, l, + upper_element] + cache.mortars.u_lower[2, v, l, mortar] = u[v, 1, l, + lower_element] + end + end + else + # L2 mortars in y-direction + for l in eachnode(dg) + for v in eachvariable(equations) + cache.mortars.u_upper[2, v, l, mortar] = u[v, l, 1, + upper_element] + cache.mortars.u_lower[2, v, l, mortar] = u[v, l, 1, + lower_element] + end + end + end + else # large_sides[mortar] == 2 -> small elements on left side + if cache.mortars.orientations[mortar] == 1 + # L2 mortars in x-direction + for l in eachnode(dg) + for v in eachvariable(equations) + cache.mortars.u_upper[1, v, l, mortar] = u[v, nnodes(dg), l, + upper_element] + cache.mortars.u_lower[1, v, l, mortar] = u[v, nnodes(dg), l, + lower_element] + end + end + else + # L2 mortars in y-direction + for l in eachnode(dg) + for v in eachvariable(equations) + cache.mortars.u_upper[1, v, l, mortar] = u[v, l, nnodes(dg), + upper_element] + cache.mortars.u_lower[1, v, l, mortar] = u[v, l, nnodes(dg), + lower_element] + end + end + end + end + + # Interpolate large element face data to small interface locations + if cache.mortars.large_sides[mortar] == 1 # -> large element on left side + leftright = 1 + if cache.mortars.orientations[mortar] == 1 + # L2 mortars in x-direction + u_large = view(u, :, nnodes(dg), :, large_element) + element_solutions_to_mortars!(cache.mortars, mortar_l2, leftright, + mortar, u_large) + else + # L2 mortars in y-direction + u_large = view(u, :, :, nnodes(dg), large_element) + element_solutions_to_mortars!(cache.mortars, mortar_l2, leftright, + mortar, u_large) + end + else # large_sides[mortar] == 2 -> large element on right side + leftright = 2 + if cache.mortars.orientations[mortar] == 1 + # L2 mortars in x-direction + u_large = view(u, :, 1, :, large_element) + element_solutions_to_mortars!(cache.mortars, mortar_l2, leftright, + mortar, u_large) + else + # L2 mortars in y-direction + u_large = view(u, :, :, 1, large_element) + element_solutions_to_mortars!(cache.mortars, mortar_l2, leftright, + mortar, u_large) + end + end + end + + return nothing +end + +@inline function element_solutions_to_mortars!(mortars, + mortar_l2::LobattoLegendreMortarL2, + leftright, mortar, + u_large::AbstractArray{<:Any, 2}) + multiply_dimensionwise!(view(mortars.u_upper, leftright, :, :, mortar), + mortar_l2.forward_upper, u_large) + 
multiply_dimensionwise!(view(mortars.u_lower, leftright, :, :, mortar), + mortar_l2.forward_lower, u_large) + return nothing +end + +function calc_mortar_flux!(surface_flux_values, + mesh::TreeMesh{2}, + nonconservative_terms::False, equations, + mortar_l2::LobattoLegendreMortarL2, + surface_integral, dg::DG, cache) + @unpack surface_flux = surface_integral + @unpack u_lower, u_upper, orientations = cache.mortars + @unpack (fstar_primary_upper_threaded, fstar_primary_lower_threaded, + fstar_secondary_upper_threaded, fstar_secondary_lower_threaded) = cache + + @threaded for mortar in eachmortar(dg, cache) + # Choose thread-specific pre-allocated container + fstar_primary_upper = fstar_primary_upper_threaded[Threads.threadid()] + fstar_primary_lower = fstar_primary_lower_threaded[Threads.threadid()] + fstar_secondary_upper = fstar_secondary_upper_threaded[Threads.threadid()] + fstar_secondary_lower = fstar_secondary_lower_threaded[Threads.threadid()] + + # Calculate fluxes + orientation = orientations[mortar] + calc_fstar!(fstar_primary_upper, equations, surface_flux, dg, u_upper, mortar, + orientation) + calc_fstar!(fstar_primary_lower, equations, surface_flux, dg, u_lower, mortar, + orientation) + calc_fstar!(fstar_secondary_upper, equations, surface_flux, dg, u_upper, mortar, + orientation) + calc_fstar!(fstar_secondary_lower, equations, surface_flux, dg, u_lower, mortar, + orientation) + + mortar_fluxes_to_elements!(surface_flux_values, + mesh, equations, mortar_l2, dg, cache, + mortar, fstar_primary_upper, fstar_primary_lower, + fstar_secondary_upper, fstar_secondary_lower) + end + + return nothing +end + +function calc_mortar_flux!(surface_flux_values, + mesh::TreeMesh{2}, + nonconservative_terms::True, equations, + mortar_l2::LobattoLegendreMortarL2, + surface_integral, dg::DG, cache) + surface_flux, nonconservative_flux = surface_integral.surface_flux + @unpack u_lower, u_upper, orientations, large_sides = cache.mortars + @unpack (fstar_primary_upper_threaded, fstar_primary_lower_threaded, + fstar_secondary_upper_threaded, fstar_secondary_lower_threaded) = cache + + @threaded for mortar in eachmortar(dg, cache) + # Choose thread-specific pre-allocated container + fstar_primary_upper = fstar_primary_upper_threaded[Threads.threadid()] + fstar_primary_lower = fstar_primary_lower_threaded[Threads.threadid()] + fstar_secondary_upper = fstar_secondary_upper_threaded[Threads.threadid()] + fstar_secondary_lower = fstar_secondary_lower_threaded[Threads.threadid()] + + # Calculate fluxes + orientation = orientations[mortar] + calc_fstar!(fstar_primary_upper, equations, surface_flux, dg, u_upper, mortar, + orientation) + calc_fstar!(fstar_primary_lower, equations, surface_flux, dg, u_lower, mortar, + orientation) + calc_fstar!(fstar_secondary_upper, equations, surface_flux, dg, u_upper, mortar, + orientation) + calc_fstar!(fstar_secondary_lower, equations, surface_flux, dg, u_lower, mortar, + orientation) + + # Add nonconservative fluxes. + # These need to be adapted on the geometry (left/right) since the order of + # the arguments matters, based on the global SBP operator interpretation. + # The same interpretation (global SBP operators coupled discontinuously via + # central fluxes/SATs) explains why we need the factor 0.5. + # Alternatively, you can also follow the argumentation of Bohm et al. 
2018 + # ("nonconservative diamond flux") + if large_sides[mortar] == 1 # -> small elements on right side + for i in eachnode(dg) + # Pull the left and right solutions + u_upper_ll, u_upper_rr = get_surface_node_vars(u_upper, equations, dg, + i, mortar) + u_lower_ll, u_lower_rr = get_surface_node_vars(u_lower, equations, dg, + i, mortar) + # Call pointwise nonconservative term + noncons_primary_upper = nonconservative_flux(u_upper_ll, u_upper_rr, + orientation, equations) + noncons_primary_lower = nonconservative_flux(u_lower_ll, u_lower_rr, + orientation, equations) + noncons_secondary_upper = nonconservative_flux(u_upper_rr, u_upper_ll, + orientation, equations) + noncons_secondary_lower = nonconservative_flux(u_lower_rr, u_lower_ll, + orientation, equations) + # Add to primary and secondary temporary storage + multiply_add_to_node_vars!(fstar_primary_upper, 0.5f0, + noncons_primary_upper, equations, + dg, i) + multiply_add_to_node_vars!(fstar_primary_lower, 0.5f0, + noncons_primary_lower, equations, + dg, i) + multiply_add_to_node_vars!(fstar_secondary_upper, 0.5f0, + noncons_secondary_upper, equations, + dg, i) + multiply_add_to_node_vars!(fstar_secondary_lower, 0.5f0, + noncons_secondary_lower, equations, + dg, i) + end + else # large_sides[mortar] == 2 -> small elements on the left + for i in eachnode(dg) + # Pull the left and right solutions + u_upper_ll, u_upper_rr = get_surface_node_vars(u_upper, equations, dg, + i, mortar) + u_lower_ll, u_lower_rr = get_surface_node_vars(u_lower, equations, dg, + i, mortar) + # Call pointwise nonconservative term + noncons_primary_upper = nonconservative_flux(u_upper_rr, u_upper_ll, + orientation, equations) + noncons_primary_lower = nonconservative_flux(u_lower_rr, u_lower_ll, + orientation, equations) + noncons_secondary_upper = nonconservative_flux(u_upper_ll, u_upper_rr, + orientation, equations) + noncons_secondary_lower = nonconservative_flux(u_lower_ll, u_lower_rr, + orientation, equations) + # Add to primary and secondary temporary storage + multiply_add_to_node_vars!(fstar_primary_upper, 0.5f0, + noncons_primary_upper, equations, + dg, i) + multiply_add_to_node_vars!(fstar_primary_lower, 0.5f0, + noncons_primary_lower, equations, + dg, i) + multiply_add_to_node_vars!(fstar_secondary_upper, 0.5f0, + noncons_secondary_upper, equations, + dg, i) + multiply_add_to_node_vars!(fstar_secondary_lower, 0.5f0, + noncons_secondary_lower, equations, + dg, i) + end + end + + mortar_fluxes_to_elements!(surface_flux_values, + mesh, equations, mortar_l2, dg, cache, + mortar, fstar_primary_upper, fstar_primary_lower, + fstar_secondary_upper, fstar_secondary_lower) + end + + return nothing +end + +@inline function calc_fstar!(destination::AbstractArray{<:Any, 2}, equations, + surface_flux, dg::DGSEM, + u_interfaces, interface, orientation) + for i in eachnode(dg) + # Call pointwise two-point numerical flux function + u_ll, u_rr = get_surface_node_vars(u_interfaces, equations, dg, i, interface) + flux = surface_flux(u_ll, u_rr, orientation, equations) + + # Copy flux to left and right element storage + set_node_vars!(destination, flux, equations, dg, i) + end + + return nothing +end + +@inline function mortar_fluxes_to_elements!(surface_flux_values, + mesh::TreeMesh{2}, equations, + mortar_l2::LobattoLegendreMortarL2, + dg::DGSEM, cache, + mortar, fstar_primary_upper, + fstar_primary_lower, + fstar_secondary_upper, + fstar_secondary_lower) + large_element = cache.mortars.neighbor_ids[3, mortar] + upper_element = cache.mortars.neighbor_ids[2, mortar] + 
lower_element = cache.mortars.neighbor_ids[1, mortar] + + # Copy flux small to small + if cache.mortars.large_sides[mortar] == 1 # -> small elements on right side + if cache.mortars.orientations[mortar] == 1 + # L2 mortars in x-direction + direction = 1 + else + # L2 mortars in y-direction + direction = 3 + end + else # large_sides[mortar] == 2 -> small elements on left side + if cache.mortars.orientations[mortar] == 1 + # L2 mortars in x-direction + direction = 2 + else + # L2 mortars in y-direction + direction = 4 + end + end + surface_flux_values[:, :, direction, upper_element] .= fstar_primary_upper + surface_flux_values[:, :, direction, lower_element] .= fstar_primary_lower + + # Project small fluxes to large element + if cache.mortars.large_sides[mortar] == 1 # -> large element on left side + if cache.mortars.orientations[mortar] == 1 + # L2 mortars in x-direction + direction = 2 + else + # L2 mortars in y-direction + direction = 4 + end + else # large_sides[mortar] == 2 -> large element on right side + if cache.mortars.orientations[mortar] == 1 + # L2 mortars in x-direction + direction = 1 + else + # L2 mortars in y-direction + direction = 3 + end + end + + # TODO: Taal performance + # for v in eachvariable(equations) + # # The code below is semantically equivalent to + # # surface_flux_values[v, :, direction, large_element] .= + # # (mortar_l2.reverse_upper * fstar_upper[v, :] + mortar_l2.reverse_lower * fstar_lower[v, :]) + # # but faster and does not allocate. + # # Note that `true * some_float == some_float` in Julia, i.e. `true` acts as + # # a universal `one`. Hence, the second `mul!` means "add the matrix-vector + # # product to the current value of the destination". + # @views mul!(surface_flux_values[v, :, direction, large_element], + # mortar_l2.reverse_upper, fstar_upper[v, :]) + # @views mul!(surface_flux_values[v, :, direction, large_element], + # mortar_l2.reverse_lower, fstar_lower[v, :], true, true) + # end + # The code above could be replaced by the following code. However, the relative efficiency + # depends on the types of fstar_upper/fstar_lower and dg.l2mortar_reverse_upper. + # Using StaticArrays for both makes the code above faster for common test cases. + multiply_dimensionwise!(view(surface_flux_values, :, :, direction, large_element), + mortar_l2.reverse_upper, fstar_secondary_upper, + mortar_l2.reverse_lower, fstar_secondary_lower) + + return nothing +end + +function calc_surface_integral!(du, u, + mesh::Union{TreeMesh{2}, StructuredMesh{2}, + StructuredMeshView{2}}, + equations, surface_integral::SurfaceIntegralWeakForm, + dg::DG, cache) + @unpack boundary_interpolation = dg.basis + @unpack surface_flux_values = cache.elements + + # Note that all fluxes have been computed with outward-pointing normal vectors. + # Access the factors only once before beginning the loop to increase performance. + # We also use explicit assignments instead of `+=` to let `@muladd` turn these + # into FMAs (see comment at the top of the file). 
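
# A 1D sketch (hypothetical names) of the surface-integral update implemented
# below: each element face contributes once, the sign encodes the outward
# normal, and the update is written as an explicit assignment rather than `+=`
# so that `@muladd` can turn it into fused multiply-adds, as noted above.
# `factor_1`/`factor_2` play the role of the `boundary_interpolation` entries.
function surface_contribution_1d!(du, flux_left, flux_right, factor_1, factor_2)
    du[1] = du[1] - flux_left * factor_1        # face at -x, outward normal in -x
    du[end] = du[end] + flux_right * factor_2   # face at +x, outward normal in +x
    return nothing
end

du_face_example = zeros(4)
surface_contribution_1d!(du_face_example, 1.0, 1.0, 0.5, 0.5)
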
+ factor_1 = boundary_interpolation[1, 1] + factor_2 = boundary_interpolation[nnodes(dg), 2] + @threaded for element in eachelement(dg, cache) + for l in eachnode(dg) + for v in eachvariable(equations) + # surface at -x + du[v, 1, l, element] = (du[v, 1, l, element] - + surface_flux_values[v, l, 1, element] * + factor_1) + + # surface at +x + du[v, nnodes(dg), l, element] = (du[v, nnodes(dg), l, element] + + surface_flux_values[v, l, 2, element] * + factor_2) + + # surface at -y + du[v, l, 1, element] = (du[v, l, 1, element] - + surface_flux_values[v, l, 3, element] * + factor_1) + + # surface at +y + du[v, l, nnodes(dg), element] = (du[v, l, nnodes(dg), element] + + surface_flux_values[v, l, 4, element] * + factor_2) + end + end + end + + return nothing +end + +function apply_jacobian!(du, mesh::TreeMesh{2}, + equations, dg::DG, cache) + @unpack inverse_jacobian = cache.elements + + @threaded for element in eachelement(dg, cache) + factor = -inverse_jacobian[element] + + for j in eachnode(dg), i in eachnode(dg) + for v in eachvariable(equations) + du[v, i, j, element] *= factor + end + end + end + + return nothing +end + +# TODO: Taal dimension agnostic +function calc_sources!(du, u, t, source_terms::Nothing, + equations::AbstractEquations{2}, dg::DG, cache) + return nothing +end + +function calc_sources!(du, u, t, source_terms, + equations::AbstractEquations{2}, dg::DG, cache) + @unpack node_coordinates = cache.elements + + @threaded for element in eachelement(dg, cache) + for j in eachnode(dg), i in eachnode(dg) + u_local = get_node_vars(u, equations, dg, i, j, element) + x_local = get_node_coords(node_coordinates, equations, dg, + i, j, element) + du_local = source_terms(u_local, x_local, t, equations) + add_to_node_vars!(du, du_local, equations, dg, i, j, element) + end + end + + return nothing +end +end # @muladd diff --git a/src/solvers/dgsem_unstructured/dg_2d.jl b/src/solvers/dgsem_unstructured/dg_2d.jl index 60358082d1b..2d1d09894a1 100644 --- a/src/solvers/dgsem_unstructured/dg_2d.jl +++ b/src/solvers/dgsem_unstructured/dg_2d.jl @@ -309,16 +309,14 @@ function prolong2boundaries!(cache, u, end function calc_boundary_flux!(cache, t, boundary_condition::BoundaryConditionPeriodic, - mesh::Union{UnstructuredMesh2D, P4estMesh, P4estMeshView, - T8codeMesh}, + mesh::Union{UnstructuredMesh2D, P4estMesh, T8codeMesh}, equations, surface_integral, dg::DG) @assert isempty(eachboundary(dg, cache)) end # Function barrier for type stability function calc_boundary_flux!(cache, t, boundary_conditions, - mesh::Union{UnstructuredMesh2D, P4estMesh, P4estMeshView, - T8codeMesh}, + mesh::Union{UnstructuredMesh2D, P4estMesh, T8codeMesh}, equations, surface_integral, dg::DG) @unpack boundary_condition_types, boundary_indices = boundary_conditions @@ -327,11 +325,23 @@ function calc_boundary_flux!(cache, t, boundary_conditions, return nothing end +# Function barrier for type stability +function calc_boundary_flux!(cache, t, boundary_conditions, + mesh::P4estMeshView, + equations, surface_integral, dg::DG, u_global, semi) + @unpack boundary_condition_types, boundary_indices = boundary_conditions + + calc_boundary_flux_by_type!(cache, t, boundary_condition_types, boundary_indices, + mesh, equations, surface_integral, dg, u_global, semi) + return nothing +end + # Iterate over tuples of boundary condition types and associated indices # in a type-stable way using "lispy tuple programming". 
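
# A generic sketch (hypothetical names `process`/`apply_all`) of the "lispy
# tuple programming" pattern used in the following functions: peel off the
# first element with `first`/`Base.tail` and recurse, so each boundary
# condition is handled with a concrete type and the code stays type stable;
# a method for the empty tuple terminates the recursion.
process(x::Int) = println("processing an Int: ", x)
process(x::String) = println("processing a String: ", x)

function apply_all(items::NTuple{N, Any}) where {N}
    process(first(items))           # handle the first element with its concrete type
    apply_all(Base.tail(items))     # recurse on the remaining elements
    return nothing
end
apply_all(::Tuple{}) = nothing      # terminate the recursion

apply_all((1, "two", 3))
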
function calc_boundary_flux_by_type!(cache, t, BCs::NTuple{N, Any}, BC_indices::NTuple{N, Vector{Int}}, mesh::Union{UnstructuredMesh2D, P4estMesh, + P4estMeshView, T8codeMesh}, equations, surface_integral, dg::DG) where {N} # Extract the boundary condition type and index vector @@ -353,6 +363,32 @@ function calc_boundary_flux_by_type!(cache, t, BCs::NTuple{N, Any}, return nothing end +# Iterate over tuples of boundary condition types and associated indices +# in a type-stable way using "lispy tuple programming". +function calc_boundary_flux_by_type!(cache, t, BCs::NTuple{N, Any}, + BC_indices::NTuple{N, Vector{Int}}, + mesh::P4estMeshView, + equations, surface_integral, dg::DG, + u_global, semi) where {N} + # Extract the boundary condition type and index vector + boundary_condition = first(BCs) + boundary_condition_indices = first(BC_indices) + # Extract the remaining types and indices to be processed later + remaining_boundary_conditions = Base.tail(BCs) + remaining_boundary_condition_indices = Base.tail(BC_indices) + + # process the first boundary condition type + calc_boundary_flux!(cache, t, boundary_condition, boundary_condition_indices, + mesh, equations, surface_integral, dg, u_global, semi) + + # recursively call this method with the unprocessed boundary types + calc_boundary_flux_by_type!(cache, t, remaining_boundary_conditions, + remaining_boundary_condition_indices, + mesh, equations, surface_integral, dg, u_global, semi) + + return nothing +end + # terminate the type-stable iteration over tuples function calc_boundary_flux_by_type!(cache, t, BCs::Tuple{}, BC_indices::Tuple{}, mesh::Union{UnstructuredMesh2D, P4estMesh, @@ -361,6 +397,14 @@ function calc_boundary_flux_by_type!(cache, t, BCs::Tuple{}, BC_indices::Tuple{} nothing end +# terminate the type-stable iteration over tuples +function calc_boundary_flux_by_type!(cache, t, BCs::Tuple{}, BC_indices::Tuple{}, + mesh::P4estMeshView, + equations, surface_integral, dg::DG, u_global, + semi) + nothing +end + function calc_boundary_flux!(cache, t, boundary_condition::BC, boundary_indexing, mesh::UnstructuredMesh2D, equations, surface_integral, dg::DG) where {BC} diff --git a/test/test_p4est_2d.jl b/test/test_p4est_2d.jl index 55401a1d129..496bf169aa6 100644 --- a/test/test_p4est_2d.jl +++ b/test/test_p4est_2d.jl @@ -12,7 +12,7 @@ outdir = "out" isdir(outdir) && rm(outdir, recursive = true) @testset "P4estMesh2D" begin -#! format: noindent + # ! 
format: noindent @trixi_testset "elixir_advection_basic.jl" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic.jl"), @@ -110,24 +110,21 @@ end @test_allocations(Trixi.rhs!, semi, sol, 1000) end -@trixi_testset "elixir_advection_meshview.jl" begin - @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_meshview.jl"), - l2=[0.00013773915040249946], - linf=[0.0010140184322192658]) +@trixi_testset "elixir_advection_coupled.jl" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_coupled.jl"), + l2=[0., 0.], + linf=[0., 0.]) # Ensure that we do not have excessive memory allocations # (e.g., from type instabilities) - @test_allocations(Trixi.rhs!, semi, sol, 1000) + # @test_allocations(Trixi.rhs!, semi, sol, 1000) # Ensure we cover the calculation of the node coordinates node_coordinates = typeof(parent_mesh.tree_node_coordinates)(undef, 2, ntuple(_ -> length(parent_mesh.nodes), 2)..., - length(mesh.cell_ids)) - result = Trixi.calc_node_coordinates!(node_coordinates, mesh, parent_mesh.nodes) - @test parent_mesh.tree_node_coordinates == result - + length(mesh1.cell_ids)) # Load the mesh file for code coverage. - loaded_mesh = Trixi.load_mesh_serial(joinpath("out", "mesh.h5"); n_cells_max = 0, + loaded_mesh = Trixi.load_mesh_serial(joinpath(EXAMPLES_DIR, "out", "mesh_1.h5"); n_cells_max = 0, RealT = typeof(parent_mesh).parameters[3]) end @@ -574,7 +571,47 @@ end 1.4237578427628152e-6]) # Ensure that we do not have excessive memory allocations # (e.g., from type instabilities) - @test_allocations(Trixi.rhs!, semi, sol, 1000) + let + t = sol.t[end] + u_ode = sol.u[end] + du_ode = similar(u_ode) + @test (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000 + end + +@trixi_testset "elixir_euler_mhd_coupled.jl" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_euler_mhd_coupled.jl"), + l2=[0.009862595305604965, + 0.011874205535856063, + 5.0185914245237475e-6, + 0.0, + 0.024657539839658474, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0098589277826056, + 0.011870558900297097, + 6.882386285170543e-6, + 0.024648257743835045 + ], + linf=[0.013719847889148373, + 0.01678917375613853, + 2.933466212909218e-5, + 0.0, + 0.03429795097747568, + 0.0, + 0.0, + 0.0, + 0.0, + 0.01368217970493435, + 0.016790901855796785, + 3.091328454846926e-5, + 0.034236712653821444]) + # Ensure we cover the calculation of the node coordinates + node_coordinates = typeof(parent_mesh.tree_node_coordinates)(undef, 2, + ntuple(_ -> length(parent_mesh.nodes), + 2)..., + length(mesh1.cell_ids)) end @trixi_testset "elixir_mhd_alfven_wave_nonconforming.jl" begin