diff --git a/NEWS.md b/NEWS.md
index 0290b08acd5..aaee21744f9 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -6,6 +6,14 @@ used in the Julia ecosystem. Notable changes will be documented in this file
 for human readability.
 
 
+## Changes in the v0.13 lifecycle
+
+#### Added
+- Initial 3D support for subcell limiting with `P4estMesh` was added ([#2582]).
+  In the new version, positivity limiting for conservative variables (using `positivity_variables_cons`)
+  is supported. `BoundsCheckCallback` is not supported in 3D yet.
+
+
 ## Changes when updating to v0.13 from v0.12.x
 
 #### Changed
diff --git a/examples/p4est_3d_dgsem/elixir_euler_sedov_sc_subcell.jl b/examples/p4est_3d_dgsem/elixir_euler_sedov_sc_subcell.jl
new file mode 100644
index 00000000000..05bd1e3db1b
--- /dev/null
+++ b/examples/p4est_3d_dgsem/elixir_euler_sedov_sc_subcell.jl
@@ -0,0 +1,98 @@
+using Trixi
+
+###############################################################################
+# semidiscretization of the compressible Euler equations
+
+equations = CompressibleEulerEquations3D(1.4)
+
+"""
+    initial_condition_sedov_blast_wave(x, t, equations::CompressibleEulerEquations3D)
+
+The Sedov blast wave setup based on Flash
+- https://flash.rochester.edu/site/flashcode/user_support/flash_ug_devel/node187.html#SECTION010114000000000000000
+with smaller strength of the initial discontinuity.
+"""
+function initial_condition_sedov_blast_wave(x, t,
+                                            equations::CompressibleEulerEquations3D)
+    # Set up polar coordinates
+    inicenter = SVector(0.0, 0.0, 0.0)
+    x_norm = x[1] - inicenter[1]
+    y_norm = x[2] - inicenter[2]
+    z_norm = x[3] - inicenter[3]
+    r = sqrt(x_norm^2 + y_norm^2 + z_norm^2)
+
+    # Setup based on https://flash.rochester.edu/site/flashcode/user_support/flash_ug_devel/node187.html#SECTION010114000000000000000
+    r0 = 0.21875 # = 3.5 * smallest dx (for domain length=4 and max-ref=6)
+    E = 1.0
+    p0_inner = 3 * (equations.gamma - 1) * E / (4 * pi * r0^2)
+    p0_outer = 1.0e-1 # "simpler" setup since positivity limiter for pressure is not yet supported in 3D
+
+    # Calculate primitive variables
+    rho = 1.0
+    v1 = 0.0
+    v2 = 0.0
+    v3 = 0.0
+    p = r > r0 ? p0_outer : p0_inner
+
+    return prim2cons(SVector(rho, v1, v2, v3, p), equations)
+end
+
+initial_condition = initial_condition_sedov_blast_wave
+
+surface_flux = flux_lax_friedrichs
+volume_flux = flux_ranocha
+polydeg = 3
+basis = LobattoLegendreBasis(polydeg)
+limiter_idp = SubcellLimiterIDP(equations, basis;
+                                positivity_variables_cons = ["rho"])
+volume_integral = VolumeIntegralSubcellLimiting(limiter_idp;
+                                                volume_flux_dg = volume_flux,
+                                                volume_flux_fv = surface_flux)
+solver = DGSEM(basis, surface_flux, volume_integral)
+
+coordinates_min = (-1.0, -1.0, -1.0)
+coordinates_max = (1.0, 1.0, 1.0)
+
+trees_per_dimension = (8, 8, 8)
+mesh = P4estMesh(trees_per_dimension,
+                 polydeg = 1, initial_refinement_level = 0,
+                 coordinates_min = coordinates_min, coordinates_max = coordinates_max,
+                 periodicity = true)
+
+# create the semi discretization object
+semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver)
+
+###############################################################################
+# ODE solvers, callbacks etc.
+
+tspan = (0.0, 3.0)
+ode = semidiscretize(semi, tspan)
+
+summary_callback = SummaryCallback()
+
+analysis_interval = 100
+analysis_callback = AnalysisCallback(semi, interval = analysis_interval)
+
+alive_callback = AliveCallback(analysis_interval = analysis_interval)
+
+save_solution = SaveSolutionCallback(interval = 10,
+                                     save_initial_solution = true,
+                                     save_final_solution = true,
+                                     extra_node_variables = (:limiting_coefficient,))
+
+stepsize_callback = StepsizeCallback(cfl = 0.5)
+
+callbacks = CallbackSet(summary_callback,
+                        analysis_callback,
+                        alive_callback,
+                        save_solution,
+                        stepsize_callback)
+
+###############################################################################
+# run the simulation
+
+stage_callbacks = (SubcellLimiterIDPCorrection(),)
+
+sol = Trixi.solve(ode, Trixi.SimpleSSPRK33(stage_callbacks = stage_callbacks);
+                  dt = 1.0, # solve needs some value here but it will be overwritten by the stepsize_callback
+                  callback = callbacks);
diff --git a/src/callbacks_stage/subcell_limiter_idp_correction.jl b/src/callbacks_stage/subcell_limiter_idp_correction.jl
index e1cb42035d1..69e4fc62d4e 100644
--- a/src/callbacks_stage/subcell_limiter_idp_correction.jl
+++ b/src/callbacks_stage/subcell_limiter_idp_correction.jl
@@ -64,4 +64,5 @@ init_callback(limiter!::SubcellLimiterIDPCorrection, semi) = nothing
 finalize_callback(limiter!::SubcellLimiterIDPCorrection, semi) = nothing
 
 include("subcell_limiter_idp_correction_2d.jl")
+include("subcell_limiter_idp_correction_3d.jl")
 end # @muladd
diff --git a/src/callbacks_stage/subcell_limiter_idp_correction_3d.jl b/src/callbacks_stage/subcell_limiter_idp_correction_3d.jl
new file mode 100644
index 00000000000..09269a89593
--- /dev/null
+++ b/src/callbacks_stage/subcell_limiter_idp_correction_3d.jl
@@ -0,0 +1,87 @@
+# By default, Julia/LLVM does not use fused multiply-add operations (FMAs).
+# Since these FMAs can increase the performance of many numerical algorithms,
+# we need to opt-in explicitly.
+# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details.
+@muladd begin
+#! format: noindent
+
+function perform_idp_correction!(u, dt,
+                                 mesh::P4estMesh{3},
+                                 equations, dg, cache)
+    @unpack inverse_weights = dg.basis # Plays role of DG subcell sizes
+    @unpack antidiffusive_flux1_L, antidiffusive_flux1_R, antidiffusive_flux2_L, antidiffusive_flux2_R, antidiffusive_flux3_L, antidiffusive_flux3_R = cache.antidiffusive_fluxes
+    @unpack alpha = dg.volume_integral.limiter.cache.subcell_limiter_coefficients
+
+    @threaded for element in eachelement(dg, cache)
+        for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+            # Sign switch as in apply_jacobian!
+            inverse_jacobian = -get_inverse_jacobian(cache.elements.inverse_jacobian,
+                                                     mesh, i, j, k, element)
+
+            # Note: For LGL nodes, the high-order and low-order fluxes at element interfaces are equal.
+            # Therefore, the antidiffusive fluxes are zero.
+            # To avoid accessing zero entries, we directly use zero vectors instead.
+            if i > 1 # Not at "left" boundary node
+                alpha1 = max(alpha[i - 1, j, k, element], alpha[i, j, k, element])
+                alpha_flux1 = (1 - alpha1) *
+                              get_node_vars(antidiffusive_flux1_R, equations, dg,
+                                            i, j, k, element)
+            else # At "left" boundary node
+                alpha_flux1 = zero(SVector{nvariables(equations), eltype(u)})
+            end
+            if i < nnodes(dg) # Not at "right" boundary node
+                alpha1_ip1 = max(alpha[i, j, k, element], alpha[i + 1, j, k, element])
+                alpha_flux1_ip1 = (1 - alpha1_ip1) *
+                                  get_node_vars(antidiffusive_flux1_L, equations, dg,
+                                                i + 1, j, k, element)
+            else # At "right" boundary node
+                alpha_flux1_ip1 = zero(SVector{nvariables(equations), eltype(u)})
+            end
+            if j > 1 # Not at "bottom" boundary node
+                alpha2 = max(alpha[i, j - 1, k, element], alpha[i, j, k, element])
+                alpha_flux2 = (1 - alpha2) *
+                              get_node_vars(antidiffusive_flux2_R, equations, dg,
+                                            i, j, k, element)
+            else # At "bottom" boundary node
+                alpha_flux2 = zero(SVector{nvariables(equations), eltype(u)})
+            end
+            if j < nnodes(dg) # Not at "top" boundary node
+                alpha2_jp1 = max(alpha[i, j, k, element], alpha[i, j + 1, k, element])
+                alpha_flux2_jp1 = (1 - alpha2_jp1) *
+                                  get_node_vars(antidiffusive_flux2_L, equations, dg,
+                                                i, j + 1, k, element)
+            else # At "top" boundary node
+                alpha_flux2_jp1 = zero(SVector{nvariables(equations), eltype(u)})
+            end
+            if k > 1 # Not at "front" boundary node
+                alpha3 = max(alpha[i, j, k - 1, element], alpha[i, j, k, element])
+                alpha_flux3 = (1 - alpha3) *
+                              get_node_vars(antidiffusive_flux3_R, equations, dg,
+                                            i, j, k, element)
+            else # At "front" boundary node
+                alpha_flux3 = zero(SVector{nvariables(equations), eltype(u)})
+            end
+            if k < nnodes(dg) # Not at "back" boundary node
+                alpha3_kp1 = max(alpha[i, j, k, element], alpha[i, j, k + 1, element])
+                alpha_flux3_kp1 = (1 - alpha3_kp1) *
+                                  get_node_vars(antidiffusive_flux3_L, equations, dg,
+                                                i, j, k + 1, element)
+            else # At "back" boundary node
+                alpha_flux3_kp1 = zero(SVector{nvariables(equations), eltype(u)})
+            end
+
+            for v in eachvariable(equations)
+                u[v, i, j, k, element] += dt * inverse_jacobian *
+                                          (inverse_weights[i] *
+                                           (alpha_flux1_ip1[v] - alpha_flux1[v]) +
+                                           inverse_weights[j] *
+                                           (alpha_flux2_jp1[v] - alpha_flux2[v]) +
+                                           inverse_weights[k] *
+                                           (alpha_flux3_kp1[v] - alpha_flux3[v]))
+            end
+        end
+    end
+
+    return nothing
+end
+end # @muladd
diff --git a/src/solvers/dgsem_p4est/dg.jl b/src/solvers/dgsem_p4est/dg.jl
index b0a8f0e07b0..82007821c44 100644
--- a/src/solvers/dgsem_p4est/dg.jl
+++ b/src/solvers/dgsem_p4est/dg.jl
@@ -91,4 +91,6 @@ include("dg_3d_parabolic.jl")
 include("dg_parallel.jl")
 
 include("subcell_limiters_2d.jl")
+include("subcell_limiters_3d.jl")
+include("dg_3d_subcell_limiters.jl")
 end # @muladd
diff --git a/src/solvers/dgsem_p4est/dg_3d_subcell_limiters.jl b/src/solvers/dgsem_p4est/dg_3d_subcell_limiters.jl
new file mode 100644
index 00000000000..0907b120d07
--- /dev/null
+++ b/src/solvers/dgsem_p4est/dg_3d_subcell_limiters.jl
@@ -0,0 +1,315 @@
+# By default, Julia/LLVM does not use fused multiply-add operations (FMAs).
+# Since these FMAs can increase the performance of many numerical algorithms,
+# we need to opt-in explicitly.
+# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details.
+@muladd begin
+#! format: noindent
+
+function create_cache(mesh::P4estMesh{3},
+                      equations, volume_integral::VolumeIntegralSubcellLimiting,
+                      dg::DG, uEltype)
+    cache = create_cache(mesh, equations,
+                         VolumeIntegralPureLGLFiniteVolume(volume_integral.volume_flux_fv),
+                         dg, uEltype)
+
+    A4d = Array{uEltype, 4}
+
+    fhat1_L_threaded = A4d[A4d(undef, nvariables(equations),
+                               nnodes(dg) + 1, nnodes(dg), nnodes(dg))
+                           for _ in 1:Threads.maxthreadid()]
+    fhat1_R_threaded = A4d[A4d(undef, nvariables(equations),
+                               nnodes(dg) + 1, nnodes(dg), nnodes(dg))
+                           for _ in 1:Threads.maxthreadid()]
+    fhat2_L_threaded = A4d[A4d(undef, nvariables(equations),
+                               nnodes(dg), nnodes(dg) + 1, nnodes(dg))
+                           for _ in 1:Threads.maxthreadid()]
+    fhat2_R_threaded = A4d[A4d(undef, nvariables(equations),
+                               nnodes(dg), nnodes(dg) + 1, nnodes(dg))
+                           for _ in 1:Threads.maxthreadid()]
+    fhat3_L_threaded = A4d[A4d(undef, nvariables(equations),
+                               nnodes(dg), nnodes(dg), nnodes(dg) + 1)
+                           for _ in 1:Threads.maxthreadid()]
+    fhat3_R_threaded = A4d[A4d(undef, nvariables(equations),
+                               nnodes(dg), nnodes(dg), nnodes(dg) + 1)
+                           for _ in 1:Threads.maxthreadid()]
+
+    flux_temp_threaded = A4d[A4d(undef, nvariables(equations),
+                                 nnodes(dg), nnodes(dg), nnodes(dg))
+                             for _ in 1:Threads.maxthreadid()]
+    fhat_temp_threaded = A4d[A4d(undef, nvariables(equations),
+                                 nnodes(dg), nnodes(dg), nnodes(dg))
+                             for _ in 1:Threads.maxthreadid()]
+
+    antidiffusive_fluxes = ContainerAntidiffusiveFlux3D{uEltype}(0,
+                                                                 nvariables(equations),
+                                                                 nnodes(dg))
+
+    if have_nonconservative_terms(equations) == true
+        error("Unsupported system of equations with nonconservative terms")
+    end
+
+    return (; cache..., antidiffusive_fluxes,
+            fhat1_L_threaded, fhat1_R_threaded,
+            fhat2_L_threaded, fhat2_R_threaded,
+            fhat3_L_threaded, fhat3_R_threaded,
+            flux_temp_threaded, fhat_temp_threaded)
+end
+
+@inline function subcell_limiting_kernel!(du, u, element,
+                                          mesh::P4estMesh{3},
+                                          nonconservative_terms, equations,
+                                          volume_integral, limiter::SubcellLimiterIDP,
+                                          dg::DGSEM, cache)
+    @unpack inverse_weights = dg.basis # Plays role of DG subcell sizes
+    @unpack volume_flux_dg, volume_flux_fv = volume_integral
+
+    # high-order DG fluxes
+    @unpack fhat1_L_threaded, fhat1_R_threaded, fhat2_L_threaded, fhat2_R_threaded, fhat3_L_threaded, fhat3_R_threaded = cache
+
+    fhat1_L = fhat1_L_threaded[Threads.threadid()]
+    fhat1_R = fhat1_R_threaded[Threads.threadid()]
+    fhat2_L = fhat2_L_threaded[Threads.threadid()]
+    fhat2_R = fhat2_R_threaded[Threads.threadid()]
+    fhat3_L = fhat3_L_threaded[Threads.threadid()]
+    fhat3_R = fhat3_R_threaded[Threads.threadid()]
+    calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, fhat3_L, fhat3_R,
+                   u, mesh, nonconservative_terms, equations, volume_flux_dg,
+                   dg, element, cache)
+
+    # low-order FV fluxes
+    @unpack fstar1_L_threaded, fstar1_R_threaded, fstar2_L_threaded, fstar2_R_threaded, fstar3_L_threaded, fstar3_R_threaded = cache
+
+    fstar1_L = fstar1_L_threaded[Threads.threadid()]
+    fstar1_R = fstar1_R_threaded[Threads.threadid()]
+    fstar2_L = fstar2_L_threaded[Threads.threadid()]
+    fstar2_R = fstar2_R_threaded[Threads.threadid()]
+    fstar3_L = fstar3_L_threaded[Threads.threadid()]
+    fstar3_R = fstar3_R_threaded[Threads.threadid()]
+    calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R,
+                 u, mesh, nonconservative_terms, equations, volume_flux_fv,
+                 dg, element, cache)
+
+    # antidiffusive flux
+    calcflux_antidiffusive!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, fhat3_L, fhat3_R,
+                            fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R,
+                            u, mesh, nonconservative_terms, equations, limiter,
+                            dg, element, cache)
+
+    # Calculate volume integral contribution of low-order FV flux
+    for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+        for v in eachvariable(equations)
+            du[v, i, j, k, element] += inverse_weights[i] *
+                                       (fstar1_L[v, i + 1, j, k] - fstar1_R[v, i, j, k]) +
+                                       inverse_weights[j] *
+                                       (fstar2_L[v, i, j + 1, k] - fstar2_R[v, i, j, k]) +
+                                       inverse_weights[k] *
+                                       (fstar3_L[v, i, j, k + 1] - fstar3_R[v, i, j, k])
+        end
+    end
+
+    return nothing
+end
+
+# Calculate the DG staggered volume fluxes `fhat` in subcell FV-form inside the element
+# (**without non-conservative terms**).
+#
+# See also `flux_differencing_kernel!`.
+@inline function calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, fhat3_L, fhat3_R,
+                                u, mesh::P4estMesh{3},
+                                nonconservative_terms::False, equations,
+                                volume_flux, dg::DGSEM, element, cache)
+    (; contravariant_vectors) = cache.elements
+    (; weights, derivative_split) = dg.basis
+    (; flux_temp_threaded) = cache
+
+    flux_temp = flux_temp_threaded[Threads.threadid()]
+
+    # The FV-form fluxes are calculated in a recursive manner, i.e.:
+    # fhat_(0,1)   = w_0 * FVol_0,
+    # fhat_(j,j+1) = fhat_(j-1,j) + w_j * FVol_j,   for j=1,...,N-1,
+    # with the split form volume fluxes FVol_j = -2 * sum_i=0^N D_ji f*_(j,i).
+
+    # To use the symmetry of the `volume_flux`, the split form volume flux is precalculated
+    # like in `calc_volume_integral!` for the `VolumeIntegralFluxDifferencing`
+    # and saved in in `flux_temp`.
+
+    # Split form volume flux in orientation 1: x direction
+    flux_temp .= zero(eltype(flux_temp))
+
+    for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+        u_node = get_node_vars(u, equations, dg, i, j, k, element)
+
+        # pull the contravariant vectors in each coordinate direction
+        Ja1_node = get_contravariant_vector(1, contravariant_vectors,
+                                            i, j, k, element) # x direction
+
+        # All diagonal entries of `derivative_split` are zero. Thus, we can skip
+        # the computation of the diagonal terms. In addition, we use the symmetry
+        # of the `volume_flux` to save half of the possible two-point flux
+        # computations.
+
+        # x direction
+        for ii in (i + 1):nnodes(dg)
+            u_node_ii = get_node_vars(u, equations, dg, ii, j, k, element)
+            # pull the contravariant vectors and compute the average
+            Ja1_node_ii = get_contravariant_vector(1, contravariant_vectors,
+                                                   ii, j, k, element)
+            Ja1_avg = 0.5f0 * (Ja1_node + Ja1_node_ii)
+
+            # compute the contravariant sharp flux in the direction of the averaged contravariant vector
+            fluxtilde1 = volume_flux(u_node, u_node_ii, Ja1_avg, equations)
+            multiply_add_to_node_vars!(flux_temp, derivative_split[i, ii], fluxtilde1,
+                                       equations, dg, i, j, k)
+            multiply_add_to_node_vars!(flux_temp, derivative_split[ii, i], fluxtilde1,
+                                       equations, dg, ii, j, k)
+        end
+    end
+
+    # FV-form flux `fhat` in x direction
+    fhat1_L[:, 1, :, :] .= zero(eltype(fhat1_L))
+    fhat1_L[:, nnodes(dg) + 1, :, :] .= zero(eltype(fhat1_L))
+    fhat1_R[:, 1, :, :] .= zero(eltype(fhat1_R))
+    fhat1_R[:, nnodes(dg) + 1, :, :] .= zero(eltype(fhat1_R))
+
+    for k in eachnode(dg), j in eachnode(dg), i in 1:(nnodes(dg) - 1),
+        v in eachvariable(equations)
+
+        fhat1_L[v, i + 1, j, k] = fhat1_L[v, i, j, k] +
+                                  weights[i] * flux_temp[v, i, j, k]
+        fhat1_R[v, i + 1, j, k] = fhat1_L[v, i + 1, j, k]
+    end
+
+    # Split form volume flux in orientation 2: y direction
+    flux_temp .= zero(eltype(flux_temp))
+
+    for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+        u_node = get_node_vars(u, equations, dg, i, j, k, element)
+
+        # pull the contravariant vectors in each coordinate direction
+        Ja2_node = get_contravariant_vector(2, contravariant_vectors,
+                                            i, j, k, element)
+
+        # y direction
+        for jj in (j + 1):nnodes(dg)
+            u_node_jj = get_node_vars(u, equations, dg, i, jj, k, element)
+            # pull the contravariant vectors and compute the average
+            Ja2_node_jj = get_contravariant_vector(2, contravariant_vectors,
+                                                   i, jj, k, element)
+            Ja2_avg = 0.5f0 * (Ja2_node + Ja2_node_jj)
+            # compute the contravariant sharp flux in the direction of the averaged contravariant vector
+            fluxtilde2 = volume_flux(u_node, u_node_jj, Ja2_avg, equations)
+            multiply_add_to_node_vars!(flux_temp, derivative_split[j, jj], fluxtilde2,
+                                       equations, dg, i, j, k)
+            multiply_add_to_node_vars!(flux_temp, derivative_split[jj, j], fluxtilde2,
+                                       equations, dg, i, jj, k)
+        end
+    end
+
+    # FV-form flux `fhat` in y direction
+    fhat2_L[:, :, 1, :] .= zero(eltype(fhat2_L))
+    fhat2_L[:, :, nnodes(dg) + 1, :] .= zero(eltype(fhat2_L))
+    fhat2_R[:, :, 1, :] .= zero(eltype(fhat2_R))
+    fhat2_R[:, :, nnodes(dg) + 1, :] .= zero(eltype(fhat2_R))
+
+    for k in eachnode(dg), j in 1:(nnodes(dg) - 1), i in eachnode(dg),
+        v in eachvariable(equations)
+
+        fhat2_L[v, i, j + 1, k] = fhat2_L[v, i, j, k] +
+                                  weights[j] * flux_temp[v, i, j, k]
+        fhat2_R[v, i, j + 1, k] = fhat2_L[v, i, j + 1, k]
+    end
+
+    # Split form volume flux in orientation 3: z direction
+    flux_temp .= zero(eltype(flux_temp))
+
+    for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+        u_node = get_node_vars(u, equations, dg, i, j, k, element)
+
+        # pull the contravariant vectors in each coordinate direction
+        Ja3_node = get_contravariant_vector(3, contravariant_vectors,
+                                            i, j, k, element)
+
+        # z direction
+        for kk in (k + 1):nnodes(dg)
+            u_node_kk = get_node_vars(u, equations, dg, i, j, kk, element)
+            # pull the contravariant vectors and compute the average
+            Ja3_node_kk = get_contravariant_vector(3, contravariant_vectors,
+                                                   i, j, kk, element)
+            Ja3_avg = 0.5f0 * (Ja3_node + Ja3_node_kk)
+            # compute the contravariant sharp flux in the direction of the averaged contravariant vector
+            fluxtilde3 = volume_flux(u_node, u_node_kk, Ja3_avg, equations)
+            multiply_add_to_node_vars!(flux_temp, derivative_split[k, kk], fluxtilde3,
+                                       equations, dg, i, j, k)
+            multiply_add_to_node_vars!(flux_temp, derivative_split[kk, k], fluxtilde3,
+                                       equations, dg, i, j, kk)
+        end
+    end
+
+    # FV-form flux `fhat` in z direction
+    fhat3_L[:, :, :, 1] .= zero(eltype(fhat3_L))
+    fhat3_L[:, :, :, nnodes(dg) + 1] .= zero(eltype(fhat3_L))
+    fhat3_R[:, :, :, 1] .= zero(eltype(fhat3_R))
+    fhat3_R[:, :, :, nnodes(dg) + 1] .= zero(eltype(fhat3_R))
+
+    for k in 1:(nnodes(dg) - 1), j in eachnode(dg), i in eachnode(dg),
+        v in eachvariable(equations)
+
+        fhat3_L[v, i, j, k + 1] = fhat3_L[v, i, j, k] +
+                                  weights[k] * flux_temp[v, i, j, k]
+        fhat3_R[v, i, j, k + 1] = fhat3_L[v, i, j, k + 1]
+    end
+
+    return nothing
+end
+
+# Calculate the antidiffusive flux `antidiffusive_flux` as the subtraction between `fhat` and `fstar` for conservative systems.
+@inline function calcflux_antidiffusive!(fhat1_L, fhat1_R,
+                                         fhat2_L, fhat2_R,
+                                         fhat3_L, fhat3_R,
+                                         fstar1_L, fstar1_R,
+                                         fstar2_L, fstar2_R,
+                                         fstar3_L, fstar3_R,
+                                         u, mesh::P4estMesh{3},
+                                         nonconservative_terms::False, equations,
+                                         limiter::SubcellLimiterIDP, dg, element, cache)
+    @unpack antidiffusive_flux1_L, antidiffusive_flux1_R, antidiffusive_flux2_L, antidiffusive_flux2_R, antidiffusive_flux3_L, antidiffusive_flux3_R = cache.antidiffusive_fluxes
+
+    # Due to the use of LGL nodes, the DG staggered fluxes `fhat` and FV fluxes `fstar` are equal
+    # on the element interfaces. So, they are not computed in the volume integral and set to zero
+    # in their respective computation.
+    # The antidiffusive fluxes are therefore zero on the element interfaces and don't need to be
+    # computed either. They are set to zero directly after resizing the container.
+    # This applies to the indices `i=1` and `i=nnodes(dg)+1` for `antidiffusive_flux1_L` and
+    # `antidiffusive_flux1_R` and analogously for the other two directions.
+
+    for k in eachnode(dg), j in eachnode(dg), i in 2:nnodes(dg)
+        for v in eachvariable(equations)
+            antidiffusive_flux1_L[v, i, j, k, element] = fhat1_L[v, i, j, k] -
+                                                         fstar1_L[v, i, j, k]
+            antidiffusive_flux1_R[v, i, j, k, element] = antidiffusive_flux1_L[v,
+                                                                               i, j, k,
+                                                                               element]
+        end
+    end
+    for k in eachnode(dg), j in 2:nnodes(dg), i in eachnode(dg)
+        for v in eachvariable(equations)
+            antidiffusive_flux2_L[v, i, j, k, element] = fhat2_L[v, i, j, k] -
+                                                         fstar2_L[v, i, j, k]
+            antidiffusive_flux2_R[v, i, j, k, element] = antidiffusive_flux2_L[v,
+                                                                               i, j, k,
+                                                                               element]
+        end
+    end
+    for k in 2:nnodes(dg), j in eachnode(dg), i in eachnode(dg)
+        for v in eachvariable(equations)
+            antidiffusive_flux3_L[v, i, j, k, element] = fhat3_L[v, i, j, k] -
+                                                         fstar3_L[v, i, j, k]
+            antidiffusive_flux3_R[v, i, j, k, element] = antidiffusive_flux3_L[v,
+                                                                               i, j, k,
+                                                                               element]
+        end
+    end
+
+    return nothing
+end
+end # @muladd
diff --git a/src/solvers/dgsem_p4est/subcell_limiters_3d.jl b/src/solvers/dgsem_p4est/subcell_limiters_3d.jl
new file mode 100644
index 00000000000..03ac6b3b561
--- /dev/null
+++ b/src/solvers/dgsem_p4est/subcell_limiters_3d.jl
@@ -0,0 +1,83 @@
+# By default, Julia/LLVM does not use fused multiply-add operations (FMAs).
+# Since these FMAs can increase the performance of many numerical algorithms,
+# we need to opt-in explicitly.
+# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details.
+@muladd begin
+#! format: noindent
+
+###############################################################################
+# IDP Limiting
+###############################################################################
+
+###############################################################################
+# Global positivity limiting of conservative variables
+
+@inline function idp_positivity_conservative!(alpha, limiter,
+                                              u::AbstractArray{<:Real, 5}, dt, semi,
+                                              variable)
+    mesh, _, dg, cache = mesh_equations_solver_cache(semi)
+    (; antidiffusive_flux1_L, antidiffusive_flux1_R, antidiffusive_flux2_L, antidiffusive_flux2_R, antidiffusive_flux3_L, antidiffusive_flux3_R) = cache.antidiffusive_fluxes
+    (; inverse_weights) = dg.basis # Plays role of DG subcell sizes
+    (; positivity_correction_factor) = limiter
+
+    (; variable_bounds) = limiter.cache.subcell_limiter_coefficients
+    var_min = variable_bounds[Symbol(string(variable), "_min")]
+
+    @threaded for element in eachelement(dg, semi.cache)
+        for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+            inverse_jacobian = get_inverse_jacobian(cache.elements.inverse_jacobian,
+                                                    mesh, i, j, k, element)
+            var = u[variable, i, j, k, element]
+            if var < 0
+                error("Safe low-order method produces negative value for conservative variable $variable. Try a smaller time step.")
+            end
+
+            # Compute bound
+            if limiter.local_twosided &&
+               (variable in limiter.local_twosided_variables_cons) &&
+               (var_min[i, j, k, element] >= positivity_correction_factor * var)
+                # Local limiting is more restrictive that positivity limiting
+                # => Skip positivity limiting for this node
+                continue
+            end
+            var_min[i, j, k, element] = positivity_correction_factor * var
+
+            # Real one-sided Zalesak-type limiter
+            # * Zalesak (1979). "Fully multidimensional flux-corrected transport algorithms for fluids"
+            # * Kuzmin et al. (2010). "Failsafe flux limiting and constrained data projections for equations of gas dynamics"
+            # Note: The Zalesak limiter has to be computed, even if the state is valid, because the correction is
+            #       for each interface, not each node
+            Qm = min(0, (var_min[i, j, k, element] - var) / dt)
+
+            # Calculate Pm
+            # Note: Boundaries of antidiffusive_flux1/2/3 are constant 0, so they make no difference here.
+            val_flux1_local = inverse_weights[i] *
+                              antidiffusive_flux1_R[variable, i, j, k, element]
+            val_flux1_local_ip1 = -inverse_weights[i] *
+                                  antidiffusive_flux1_L[variable, i + 1, j, k, element]
+            val_flux2_local = inverse_weights[j] *
+                              antidiffusive_flux2_R[variable, i, j, k, element]
+            val_flux2_local_jp1 = -inverse_weights[j] *
+                                  antidiffusive_flux2_L[variable, i, j + 1, k, element]
+            val_flux3_local = inverse_weights[k] *
+                              antidiffusive_flux3_R[variable, i, j, k, element]
+            val_flux3_local_jp1 = -inverse_weights[k] *
+                                  antidiffusive_flux3_L[variable, i, j, k + 1, element]
+
+            Pm = min(0, val_flux1_local) + min(0, val_flux1_local_ip1) +
+                 min(0, val_flux2_local) + min(0, val_flux2_local_jp1) +
+                 min(0, val_flux3_local) + min(0, val_flux3_local_jp1)
+            Pm = inverse_jacobian * Pm
+
+            # Compute blending coefficient avoiding division by zero
+            # (as in paper of [Guermond, Nazarov, Popov, Thomas] (4.8))
+            Qm = abs(Qm) / (abs(Pm) + eps(typeof(Qm)) * 100)
+
+            # Calculate alpha
+            alpha[i, j, k, element] = max(alpha[i, j, k, element], 1 - Qm)
+        end
+    end
+
+    return nothing
+end
+end # @muladd
diff --git a/src/solvers/dgsem_tree/containers_3d.jl b/src/solvers/dgsem_tree/containers_3d.jl
index 413818d9013..052163628e0 100644
--- a/src/solvers/dgsem_tree/containers_3d.jl
+++ b/src/solvers/dgsem_tree/containers_3d.jl
@@ -783,4 +783,150 @@ function init_mortars!(mortars, elements, mesh::TreeMesh3D)
     @assert count==nmortars(mortars) ("Actual mortar count ($count) does not match "*
                                       "expectations $(nmortars(mortars))")
 end
+
+mutable struct ContainerAntidiffusiveFlux3D{uEltype <: Real}
+    antidiffusive_flux1_L::Array{uEltype, 5} # [variables, i, j, k, elements]
+    antidiffusive_flux1_R::Array{uEltype, 5} # [variables, i, j, k, elements]
+    antidiffusive_flux2_L::Array{uEltype, 5} # [variables, i, j, k, elements]
+    antidiffusive_flux2_R::Array{uEltype, 5} # [variables, i, j, k, elements]
+    antidiffusive_flux3_L::Array{uEltype, 5} # [variables, i, j, k, elements]
+    antidiffusive_flux3_R::Array{uEltype, 5} # [variables, i, j, k, elements]
+    # internal `resize!`able storage
+    _antidiffusive_flux1_L::Vector{uEltype}
+    _antidiffusive_flux1_R::Vector{uEltype}
+    _antidiffusive_flux2_L::Vector{uEltype}
+    _antidiffusive_flux2_R::Vector{uEltype}
+    _antidiffusive_flux3_L::Vector{uEltype}
+    _antidiffusive_flux3_R::Vector{uEltype}
+end
+
+function ContainerAntidiffusiveFlux3D{uEltype}(capacity::Integer, n_variables,
+                                               n_nodes) where {uEltype <: Real}
+    nan_uEltype = convert(uEltype, NaN)
+
+    # Initialize fields with defaults
+    _antidiffusive_flux1_L = fill(nan_uEltype,
+                                  n_variables * (n_nodes + 1) * n_nodes * n_nodes *
+                                  capacity)
+    antidiffusive_flux1_L = unsafe_wrap(Array, pointer(_antidiffusive_flux1_L),
+                                        (n_variables, n_nodes + 1, n_nodes, n_nodes,
+                                         capacity))
+    _antidiffusive_flux1_R = fill(nan_uEltype,
+                                  n_variables * (n_nodes + 1) * n_nodes * n_nodes *
+                                  capacity)
+    antidiffusive_flux1_R = unsafe_wrap(Array, pointer(_antidiffusive_flux1_R),
+                                        (n_variables, n_nodes + 1, n_nodes, n_nodes,
+                                         capacity))
+
+    _antidiffusive_flux2_L = fill(nan_uEltype,
+                                  n_variables * n_nodes * (n_nodes + 1) * n_nodes *
+                                  capacity)
+    antidiffusive_flux2_L = unsafe_wrap(Array, pointer(_antidiffusive_flux2_L),
+                                        (n_variables, n_nodes, n_nodes + 1, n_nodes,
+                                         capacity))
+    _antidiffusive_flux2_R = fill(nan_uEltype,
+                                  n_variables * n_nodes * (n_nodes + 1) * n_nodes *
+                                  capacity)
+    antidiffusive_flux2_R = unsafe_wrap(Array, pointer(_antidiffusive_flux2_R),
+                                        (n_variables, n_nodes, n_nodes + 1, n_nodes,
+                                         capacity))
+
+    _antidiffusive_flux3_L = fill(nan_uEltype,
+                                  n_variables * n_nodes * n_nodes * (n_nodes + 1) *
+                                  capacity)
+    antidiffusive_flux3_L = unsafe_wrap(Array, pointer(_antidiffusive_flux3_L),
+                                        (n_variables, n_nodes, n_nodes, n_nodes + 1,
+                                         capacity))
+    _antidiffusive_flux3_R = fill(nan_uEltype,
+                                  n_variables * n_nodes * n_nodes * (n_nodes + 1) *
+                                  capacity)
+    antidiffusive_flux3_R = unsafe_wrap(Array, pointer(_antidiffusive_flux3_R),
+                                        (n_variables, n_nodes, n_nodes, n_nodes + 1,
+                                         capacity))
+    return ContainerAntidiffusiveFlux3D{uEltype}(antidiffusive_flux1_L,
+                                                 antidiffusive_flux1_R,
+                                                 antidiffusive_flux2_L,
+                                                 antidiffusive_flux2_R,
+                                                 antidiffusive_flux3_L,
+                                                 antidiffusive_flux3_R,
+                                                 _antidiffusive_flux1_L,
+                                                 _antidiffusive_flux1_R,
+                                                 _antidiffusive_flux2_L,
+                                                 _antidiffusive_flux2_R,
+                                                 _antidiffusive_flux3_L,
+                                                 _antidiffusive_flux3_R)
+end
+
+nvariables(fluxes::ContainerAntidiffusiveFlux3D) = size(fluxes.antidiffusive_flux1_L, 1)
+nnodes(fluxes::ContainerAntidiffusiveFlux3D) = size(fluxes.antidiffusive_flux1_L, 3)
+
+# Only one-dimensional `Array`s are `resize!`able in Julia.
+# Hence, we use `Vector`s as internal storage and `resize!`
+# them whenever needed. Then, we reuse the same memory by
+# `unsafe_wrap`ping multi-dimensional `Array`s around the
+# internal storage.
+function Base.resize!(fluxes::ContainerAntidiffusiveFlux3D, capacity)
+    n_nodes = nnodes(fluxes)
+    n_variables = nvariables(fluxes)
+
+    @unpack _antidiffusive_flux1_L, _antidiffusive_flux1_R, _antidiffusive_flux2_L, _antidiffusive_flux2_R, _antidiffusive_flux3_L, _antidiffusive_flux3_R = fluxes
+
+    resize!(_antidiffusive_flux1_L,
+            n_variables * (n_nodes + 1) * n_nodes * n_nodes * capacity)
+    fluxes.antidiffusive_flux1_L = unsafe_wrap(Array, pointer(_antidiffusive_flux1_L),
+                                               (n_variables,
+                                                n_nodes + 1, n_nodes, n_nodes,
+                                                capacity))
+    resize!(_antidiffusive_flux1_R,
+            n_variables * (n_nodes + 1) * n_nodes * n_nodes * capacity)
+    fluxes.antidiffusive_flux1_R = unsafe_wrap(Array, pointer(_antidiffusive_flux1_R),
+                                               (n_variables,
+                                                n_nodes + 1, n_nodes, n_nodes,
+                                                capacity))
+    resize!(_antidiffusive_flux2_L,
+            n_variables * n_nodes * (n_nodes + 1) * n_nodes * capacity)
+    fluxes.antidiffusive_flux2_L = unsafe_wrap(Array, pointer(_antidiffusive_flux2_L),
+                                               (n_variables,
+                                                n_nodes, n_nodes + 1, n_nodes,
+                                                capacity))
+    resize!(_antidiffusive_flux2_R,
+            n_variables * n_nodes * (n_nodes + 1) * n_nodes * capacity)
+    fluxes.antidiffusive_flux2_R = unsafe_wrap(Array, pointer(_antidiffusive_flux2_R),
+                                               (n_variables,
+                                                n_nodes, n_nodes + 1, n_nodes,
+                                                capacity))
+
+    resize!(_antidiffusive_flux3_L,
+            n_variables * n_nodes * n_nodes * (n_nodes + 1) * capacity)
+    fluxes.antidiffusive_flux3_L = unsafe_wrap(Array, pointer(_antidiffusive_flux3_L),
+                                               (n_variables,
+                                                n_nodes, n_nodes, n_nodes + 1,
+                                                capacity))
+    resize!(_antidiffusive_flux3_R,
+            n_variables * n_nodes * n_nodes * (n_nodes + 1) * capacity)
+    fluxes.antidiffusive_flux3_R = unsafe_wrap(Array, pointer(_antidiffusive_flux3_R),
+                                               (n_variables,
+                                                n_nodes, n_nodes, n_nodes + 1,
+                                                capacity))
+
+    uEltype = eltype(fluxes.antidiffusive_flux1_L)
+    @threaded for element in axes(fluxes.antidiffusive_flux1_L, 5)
+        fluxes.antidiffusive_flux1_L[:, 1, :, :, element] .= zero(uEltype)
+        fluxes.antidiffusive_flux1_L[:, n_nodes + 1, :, :, element] .= zero(uEltype)
+        fluxes.antidiffusive_flux1_R[:, 1, :, :, element] .= zero(uEltype)
+        fluxes.antidiffusive_flux1_R[:, n_nodes + 1, :, :, element] .= zero(uEltype)
+
+        fluxes.antidiffusive_flux2_L[:, :, 1, :, element] .= zero(uEltype)
+        fluxes.antidiffusive_flux2_L[:, :, n_nodes + 1, :, element] .= zero(uEltype)
+        fluxes.antidiffusive_flux2_R[:, :, 1, :, element] .= zero(uEltype)
+        fluxes.antidiffusive_flux2_R[:, :, n_nodes + 1, :, element] .= zero(uEltype)
+
+        fluxes.antidiffusive_flux3_L[:, :, :, 1, element] .= zero(uEltype)
+        fluxes.antidiffusive_flux3_L[:, :, :, n_nodes + 1, element] .= zero(uEltype)
+        fluxes.antidiffusive_flux3_R[:, :, :, 1, element] .= zero(uEltype)
+        fluxes.antidiffusive_flux3_R[:, :, :, n_nodes + 1, element] .= zero(uEltype)
+    end
+
+    return nothing
+end
 end # @muladd
diff --git a/src/solvers/dgsem_tree/dg_2d_subcell_limiters.jl b/src/solvers/dgsem_tree/dg_2d_subcell_limiters.jl
index 11230746494..2cdf23acbbd 100644
--- a/src/solvers/dgsem_tree/dg_2d_subcell_limiters.jl
+++ b/src/solvers/dgsem_tree/dg_2d_subcell_limiters.jl
@@ -68,7 +68,7 @@ end
 # Subcell limiting currently only implemented for certain mesh types
 function calc_volume_integral!(du, u,
                                mesh::Union{TreeMesh{2}, StructuredMesh{2},
-                                           P4estMesh{2}},
+                                           P4estMesh{2}, P4estMesh{3}},
                                have_nonconservative_terms, equations,
                                volume_integral::VolumeIntegralSubcellLimiting,
                                dg::DGSEM, cache)
@@ -672,6 +672,14 @@ end
                                          limiter::SubcellLimiterIDP, dg, element, cache)
     @unpack antidiffusive_flux1_L, antidiffusive_flux2_L, antidiffusive_flux1_R, antidiffusive_flux2_R = cache.antidiffusive_fluxes
 
+    # Due to the use of LGL nodes, the DG staggered fluxes `fhat` and FV fluxes `fstar` are equal
+    # on the element interfaces. So, they are not computed in the volume integral and set to zero
+    # in their respective computation.
+    # The antidiffusive fluxes are therefore zero on the element interfaces and don't need to be
+    # computed either. They are set to zero directly after resizing the container.
+    # This applies to the indices `i=1` and `i=nnodes(dg)+1` for `antidiffusive_flux1_L` and
+    # `antidiffusive_flux1_R` and analogously for the second direction.
+
     for j in eachnode(dg), i in 2:nnodes(dg)
         for v in eachvariable(equations)
             antidiffusive_flux1_L[v, i, j, element] = fhat1_L[v, i, j] -
diff --git a/src/solvers/dgsem_tree/subcell_limiters.jl b/src/solvers/dgsem_tree/subcell_limiters.jl
index 6c18a6cf106..efbe2ab7304 100644
--- a/src/solvers/dgsem_tree/subcell_limiters.jl
+++ b/src/solvers/dgsem_tree/subcell_limiters.jl
@@ -52,6 +52,12 @@ where `d = #dimensions`). See equation (20) of Pazner (2020) and equation (30) o
     This limiter and the correction callback [`SubcellLimiterIDPCorrection`](@ref) only work together.
     Without the callback, no correction takes place, leading to a standard low-order FV scheme.
 
+Implementation in 3D:
+In 3D, only the positivity limiter for conservative variables using
+(`positivity_variables_cons`) is implemented and merged for `P4estMesh`.
+`BoundsCheckCallback` is not supported in 3D yet.
+More features will follow soon.
+
 ## References
 
 - Rueda-Ramírez, Pazner, Gassner (2022)
diff --git a/src/solvers/dgsem_tree/subcell_limiters_2d.jl b/src/solvers/dgsem_tree/subcell_limiters_2d.jl
index cca91aa94b0..29a1ac322fa 100644
--- a/src/solvers/dgsem_tree/subcell_limiters_2d.jl
+++ b/src/solvers/dgsem_tree/subcell_limiters_2d.jl
@@ -341,8 +341,8 @@ end
 
             # Compute bound
             if limiter.local_twosided &&
-               variable in limiter.local_twosided_variables_cons &&
-               var_min[i, j, element] >= positivity_correction_factor * var
+               (variable in limiter.local_twosided_variables_cons) &&
+               (var_min[i, j, element] >= positivity_correction_factor * var)
                 # Local limiting is more restrictive that positivity limiting
                 # => Skip positivity limiting for this node
                 continue
diff --git a/test/test_p4est_3d.jl b/test/test_p4est_3d.jl
index db726dc2834..42aba382f89 100644
--- a/test/test_p4est_3d.jl
+++ b/test/test_p4est_3d.jl
@@ -324,6 +324,32 @@ end
     @test_allocations(Trixi.rhs!, semi, sol, 1000)
 end
 
+@trixi_testset "elixir_euler_sedov_sc_subcell.jl" begin
+    @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_euler_sedov_sc_subcell.jl"),
+                        l2=[
+                            0.16371327294681623,
+                            0.0779033135575244,
+                            0.07790331355752474,
+                            0.07790331355752393,
+                            0.36689870760596666
+                        ],
+                        linf=[
+                            4.210879608740924,
+                            2.881468426664787,
+                            2.8814684266647874,
+                            2.8814684266647856,
+                            6.002617193075404
+                        ],
+                        tspan=(0.0, 0.2),)
+    # Ensure that we do not have excessive memory allocations
+    # (e.g., from type instabilities)
+    # Larger values for allowed allocations due to usage of custom
+    # integrator which are not *recorded* for the methods from
+    # OrdinaryDiffEq.jl
+    # Corresponding issue: https://github.com/trixi-framework/Trixi.jl/issues/1877
+    @test_allocations(Trixi.rhs!, semi, sol, 15_000)
+end
+
 @trixi_testset "elixir_euler_sedov.jl (HLLE)" begin
     @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_euler_sedov.jl"),
                         l2=[