
p4est datastructures #780

Open
koehlerson wants to merge 166 commits into master from mk/p4est

Conversation


@koehlerson koehlerson commented Jul 31, 2023

to be discussed at FerriteCon 23, register now: https://ferrite-fem.github.io/FerriteCon/

TODO

  • p4est types
    • OctantBWG morton index encoded octant in 2 and 3D
    • OctreeBWG linear tree with homogeneous OctantBWG leaves
    • ForestBWG forest of homogeneous OctreeBWG
  • Octant Operations
  • Octree operations
    • balance intertree
    • balance intratree
    • coarsen
    • refine
  • materializing a Ferrite.Grid for unstructured meshes
  • hanging nodes
  • error estimator interface
    • ZZ-type error estimator, slow version (probably working in docs/src/literate-tutorials/adaptivity.jl)
  • Docs
    • Docs to the p4est types and how they interplay
    • Docs to the yet to be defined interface
    • quick and dirty implementation of elasticity in docs/src/literate-tutorials/adaptivity.jl with an (almost? maybe?) working but slow ZZ error estimator
    • manufactured solution example for heat equation at docs/src/literate-tutorials/heat-adaptivity.jl
    • adjust quick and dirty docs/src/literate-tutorials/adaptivity.jl to a proper tutorial
  • More debug statements to track down issues
  • multiple neighbors attached to edge and vertex
  • rotation test in 3D
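The OctantBWG items above rely on Morton (z-order) indices. As a hypothetical illustration of the idea (not the actual Ferrite internals), a 2D Morton index interleaves the bits of the integer octant coordinates, so that sorting octants by this key yields the linear-tree ordering used by OctreeBWG:

```julia
# Illustrative sketch only: interleave the bits of (x, y) into a z-order key.
# The real OctantBWG encoding in BWG.jl may differ in width and layout.
function morton2d(x::Integer, y::Integer, nbits::Int = 16)
    m = zero(UInt64)
    for i in 0:(nbits - 1)
        m |= UInt64((x >> i) & 1) << (2i)       # x bits go to even positions
        m |= UInt64((y >> i) & 1) << (2i + 1)   # y bits go to odd positions
    end
    return m
end
```

Sorting octants by such a key is what makes binary search over `tree.leaves` possible later on.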

Followup PRs

@termi-official
Member

Found 2 more issues in 3D with Maxi. Last commit has test coverage for them.

  1. Not all hanging nodes are detected.
  2. For rotated elements the unique node detection fails.

@termi-official
Member

Failing analytical example:

using Ferrite, FerriteGmsh, SparseArrays

const ref_geometry = RefHexahedron
const geometry_type = Hexahedron
const sdim = 3

grid = generate_grid(geometry_type, ntuple(_->4, sdim));

function random_deformation_field(x::Vec{dim}) where dim
    if any(x .≈ -1.0) || any(x .≈ 1.0)
        return x
    else
        Vec{dim}(x .+ (rand(dim).-0.5)*0.1)
    end
end
transform_coordinates!(grid, random_deformation_field)
grid  = ForestBWG(grid,10)

analytical_solution(x) = atan(2*(norm(x)-0.5)/0.02)
analytical_rhs(x) = -laplace(analytical_solution,x)

function assemble_cell!(ke, fe, cellvalues, ue, coords)
    fill!(ke, 0.0)
    fill!(fe, 0.0)

    n_basefuncs = getnbasefunctions(cellvalues)
    for q_point in 1:getnquadpoints(cellvalues)
        x = spatial_coordinate(cellvalues, q_point, coords)
        dΩ = getdetJdV(cellvalues, q_point)
        for i in 1:n_basefuncs
            Nᵢ = shape_value(cellvalues, q_point, i)
            ∇Nᵢ = shape_gradient(cellvalues, q_point, i)
            fe[i] += analytical_rhs(x) * Nᵢ * dΩ
            for j in 1:n_basefuncs
                ∇Nⱼ = shape_gradient(cellvalues, q_point, j)
                ke[i, j] += ∇Nⱼ ⋅ ∇Nᵢ * dΩ
            end
        end
    end
end

function assemble_global!(K, f, a, dh, cellvalues)
    ## Allocate the element stiffness matrix and element force vector
    n_basefuncs = getnbasefunctions(cellvalues)
    ke = zeros(n_basefuncs, n_basefuncs)
    fe = zeros(n_basefuncs)
    ## Create an assembler
    assembler = start_assemble(K, f)
    ## Loop over all cells
    for cell in CellIterator(dh)
        reinit!(cellvalues, cell)
        @views ue = a[celldofs(cell)]
        ## Compute element contribution
        coords = getcoordinates(cell)
        assemble_cell!(ke, fe, cellvalues, ue, coords)
        ## Assemble ke and fe into K and f
        assemble!(assembler, celldofs(cell), ke, fe)
    end
    return K, f
end

function solve(grid)
    dim = 3
    order = 1
    ip = Lagrange{ref_geometry, order}()
    qr = QuadratureRule{ref_geometry}(2)
    cellvalues = CellValues(qr, ip);

    dh = DofHandler(grid)
    add!(dh, :u, ip)
    close!(dh);

    ch = ConstraintHandler(dh)
    add!(ch, ConformityConstraint(:u))
    add!(ch, Dirichlet(:u, getfacetset(grid, "top"), (x, t) -> 0.0))
    add!(ch, Dirichlet(:u, getfacetset(grid, "right"), (x, t) -> 0.0))
    add!(ch, Dirichlet(:u, getfacetset(grid, "left"), (x, t) -> 0.0))
    add!(ch, Dirichlet(:u, getfacetset(grid, "bottom"), (x, t) -> 0.0))
    close!(ch);

    K = create_sparsity_pattern(dh,ch)
    f = zeros(ndofs(dh))
    a = zeros(ndofs(dh))
    assemble_global!(K, f, a, dh, cellvalues);
    apply!(K, f, ch)
    u = K \ f;
    apply!(u,ch)
    return u,dh,ch,cellvalues
end

function solve_adaptive(initial_grid::ForestBWG{sdim}) where sdim
    ip = Lagrange{ref_geometry, 1}()
    qr_sc = QuadratureRule{ref_geometry}(2)
    cellvalues = CellValues(qr_sc, ip);
    finished = false
    i = 1
    grid = deepcopy(initial_grid)
    pvd = VTKFileCollection("heat_amr.pvd",grid);
    while !finished && i<=10
        @show i
        transfered_grid = Ferrite.creategrid(grid)
        u,dh,ch,cv = solve(transfered_grid)
        # σ_gp, σ_gp_sc = compute_fluxes(u,dh)
        # projector = L2Projector(Lagrange{ref_geometry, 1}(), transfered_grid)
        # σ_dof = project(projector, σ_gp, QuadratureRule{ref_geometry}(2))
        cells_to_refine = Int[]
        error_arr = Float64[]
        for (cellid,cell) in enumerate(CellIterator(dh))
            reinit!(cellvalues, cell)
            @views ue = u[celldofs(cell)]
            error = 0.0
            for q_point in 1:getnquadpoints(cellvalues)
                x = spatial_coordinate(cv, q_point, getcoordinates(cell))
                dΩ = getdetJdV(cellvalues,q_point)
                error += abs(analytical_solution(x) - function_value(cellvalues, q_point, ue)) * dΩ
            end
            if error > 0.001
                push!(cells_to_refine,cellid)
            end
            push!(error_arr,error)
        end

        addstep!(pvd, i, dh) do vtk
            write_solution(vtk, dh, u)
            # write_projection(vtk, projector, σ_dof, "flux")
            # write_cell_data(vtk, getindex.(collect(Iterators.flatten(σ_gp_sc)),1), "flux sc x")
            # write_cell_data(vtk, getindex.(collect(Iterators.flatten(σ_gp_sc)),2), "flux sc y")
            write_cell_data(vtk, error_arr, "error")
        end

        Ferrite.refine!(grid, cells_to_refine)
        Ferrite.balanceforest!(grid)

        i += 1
        if isempty(cells_to_refine)
            finished = true
        end
    end
    close(pvd);
    transfered_grid = Ferrite.creategrid(grid)
    u,dh,ch,cv = solve(transfered_grid)
end

u,dh,ch,cv = solve_adaptive(grid)

end

function Ferrite.add!(ch::ConstraintHandler{<:DofHandler{<:Any,<:NonConformingGrid}}, cc::ConformityConstraint)
@assert length(ch.dh.field_names) == 1 "Multiple fields not supported yet."

For a dofhandler with two scalar fields, the present function can apply the conformity constraint for hanging nodes. So I think that the assert can be replaced with a warning.

Suggested change
@assert length(ch.dh.field_names) == 1 "Multiple fields not supported yet."
length(ch.dh.field_names) == 1 || @warn "Multiple fields not supported yet."

koehlerson and others added 4 commits February 11, 2026 15:24
Four bugs in the inter-octree branch of hangingnodes():
- Use pface_i directly for facet_neighborhood lookup instead of
  looping over rootfaces with ri
- Check any rootface contains the parent face, not just one
- Pass pface_i (not ri) to rotation_permutation for correct 3D
  face rotation
- Compare vertex index c' against ci (integer) not c (coordinate
  tuple), and use parent edges instead of interoctree neighbor
  edges for edge hanging node detection

Updates test expectations from 58 to 59 conformity entries for
both the combined and combined+rotated test cases.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- Manufactured solution: Gaussian ring exp(-((|x|-0.5)/ε)²) with ε=0.02,
  naturally zero at boundary so homogeneous Dirichlet BCs are correct
- ZZ error estimator: L2-project raw flux to smooth nodal field, compare
  recovered vs raw flux per cell
- Dörfler marking: mark smallest cell set accounting for θ=0.5 of total
  error, replacing the previous absolute threshold
- CG solver (IterativeSolvers.jl) instead of direct solve for scalability
- Add Literate.jl comments explaining the tutorial
- Enable initial uniform refinement in elasticity adaptivity example

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
@koehlerson
Member Author

3D seems to work now; we need an additional real-life test for rotated elements, or maybe adjust the example here: https://ferrite-fem.github.io/Ferrite.jl/previews/PR780/tutorials/heat_adaptivity/

and rotate a few elements before the adaptive loop begins

u, dh, ch, cv = solve(transfered_grid)

## Step 1: Compute the raw FE flux σ_h = ∇u_h at each quadrature point
σ_gp = Vector{Vector{Vec{3, Float64}}}()
Member

Can we use a flat vector here? This is not really performant.
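One way to realize the flat-vector suggestion (a sketch only, with hypothetical names, not the PR's actual data layout) is to store all quadrature-point values contiguously together with an offset table, instead of one inner vector per cell:

```julia
# Sketch: flatten a Vector{Vector{T}} (one inner vector per cell) into a
# single contiguous vector plus offsets. Avoids per-cell allocations and
# pointer chasing when iterating quadrature-point data.
struct FlatQPData{T}
    data::Vector{T}      # all qp values, cell after cell
    offsets::Vector{Int} # offsets[i]:offsets[i+1]-1 is cell i's range
end

function FlatQPData(nested::Vector{Vector{T}}) where {T}
    offsets = Vector{Int}(undef, length(nested) + 1)
    offsets[1] = 1
    for (i, v) in enumerate(nested)
        offsets[i + 1] = offsets[i] + length(v)
    end
    data = Vector{T}(undef, offsets[end] - 1)
    for (i, v) in enumerate(nested)
        copyto!(data, offsets[i], v, 1, length(v))
    end
    return FlatQPData(data, offsets)
end

cellrange(f::FlatQPData, i) = f.offsets[i]:(f.offsets[i + 1] - 1)
```

With `Vec{3, Float64}` as `T`, the per-cell fluxes would then be `view(f.data, cellrange(f, cellid))`.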

Member

TODO dump into Ferrite.asset_url before merging

Member

Is this intended to be dumped in addition to the geo file?

Comment on lines +385 to +387
dh.vertexdicts[fi] = zeros(Int, nnodes)
dh.edgedicts[fi] = Dict{Tuple{Int, Int}, Int}()
dh.facedicts[fi] = Dict{NTuple{3, Int}, Int}()
Member

Can't we empty!?

Comment on lines +109 to +121
# Maps from entity to dofs
# `vertexdict` keeps track of the visited vertices. The first dof added to vertex v is
# stored in vertexdict[v].
vertexdicts::Vector{Vector{Int}}
# `edgedict` keeps track of the visited edges, this will only be used for a 3D problem.
# An edge is uniquely determined by two global vertices, with global direction going
# from low to high vertex number.
edgedicts::Vector{Dict{Tuple{Int, Int}, Int}}
# `facedict` keeps track of the visited faces. We only need to store the first dof we
# add to the face since currently more dofs per face isn't supported. In
# 2D a face (i.e. a line) is uniquely determined by 2 vertices, and in 3D a face (i.e. a
# surface) is uniquely determined by 3 vertices.
facedicts::Vector{Dict{NTuple{3, Int}, Int}}
Member

Should we add a second dof handler type to not strain problems not using AMR?

Member Author

So a new struct with the same fields, then use the default dispatches on AbstractDofHandler, and later, if intervention is needed, dispatch on the new type?

Member

@KnutAM KnutAM Feb 13, 2026

What about having an additional field collecting all of these in a special struct? Then that internal struct can be documented in devdocs with what is now described in the code comments, which can help explaining the dof-distribution better as well?

That field can be nothing or the content can be empty if not used for the regular cases?

Member Author

But how would we proceed with that structure? It could be needed in any code that handles the DofHandler. It's difficult for me to imagine an interface where this struct is handed over whenever it's needed, so that it can be reused for other cases. Don't you agree?

Ideally, the ForestBWG would provide its own iterators for volumes, facets, edges and nodes, which would then be used optimally for the tailored DofHandler.

Member

I was more thinking that for the dof-distribution, we could refactor the dicts by introducing

struct EntityDicts
    vertices::Vector{Vector{Int}}
    edges::Vector{Dict{NTuple{2, Int}, Int}}
    faces::Vector{Dict{NTuple{3, Int}, Int}}
end

which potentially makes it easier to understand the dof-distribution. For this PR, it would help by reducing the extra fields in the dofhandler, by only adding the field entitydicts::EntityDicts, changing the DofHandler.jl#L364-L395 code to

 numfields = length(dh.field_names)
+ resize!(dh.entitydicts.vertices, numfields)
+ resize!(dh.entitydicts.edges, numfields)
+ resize!(dh.entitydicts.faces, numfields)
  
- # NOTE: Maybe it makes sense to store *Index in the dicts instead. 
-  
- # `vertexdict` keeps track of the visited vertices. The first dof added to vertex v is 
- # stored in vertexdict[v]. 
- # TODO: No need to allocate this vector for fields that don't have vertex dofs 
- vertexdicts = [zeros(Int, getnnodes(get_grid(dh))) for _ in 1:numfields] 
-  
- # `edgedict` keeps track of the visited edges. 
- # An edge is uniquely determined by two global vertices, with global direction going 
- # from low to high vertex node number, see sortedge 
- edgedicts = [Dict{NTuple{2, Int}, Int}() for _ in 1:numfields] 
-  
- # `facedict` keeps track of the visited faces. We only need to store the first dof -we 
- # add to the face since currently more dofs per face isn't supported. 
- # A face is uniquely determined by 3 vertex nodes, see sortface 
- facedicts = [Dict{NTuple{3, Int}, Int}() for _ in 1:numfields] 
  
 # Set initial values 
 nextdof = 1  # next free dof to distribute 
  
 @debug println("\n\nCreating dofs\n") 
 for (sdhi, sdh) in pairs(dh.subdofhandlers) 
     nextdof = _close_subdofhandler!( 
         dh, 
         sdh, 
         sdhi, # TODO: Store in the SubDofHandler? 
         nextdof, 
+        dh.entitydicts,
-        vertexdicts, 
-        edgedicts, 
-        facedicts,

Member

But if more things should be different, perhaps just having a

struct ForestBWGDofHandler{DH <: DofHandler, ...} <: AbstractDofHandler
    basedh::DH
    ...
end

is easier and then just forwarding all methods that don't change to basedh?

return Grid(cells, nodes, facetsets = facetsets)
end

function generate_simple_disc_grid(::Type{Quadrilateral}, n; radius = 1.0)
Member

Is this used somewhere in this PR?

end
M = _assemble_L2_matrix(proj.dh, proj.ch, proj.qrs_lhs)
if proj.ch !== nothing
apply!(M.data, proj.ch)
Member

Can't we apply! directly on the Symmetric matrix?

Comment on lines +159 to +160
function _assemble_L2_matrix(dh::DofHandler, ch::ConstraintHandler, qrs_lhs::Vector{<:QuadratureRule})
M = Symmetric(allocate_matrix(dh, ch))
Member

@KnutAM KnutAM Feb 12, 2026

Should work for both cases?

Suggested change
function _assemble_L2_matrix(dh::DofHandler, ch::ConstraintHandler, qrs_lhs::Vector{<:QuadratureRule})
M = Symmetric(allocate_matrix(dh, ch))
function _assemble_L2_matrix(dh::DofHandler, ch::Union{ConstraintHandler, Nothing}, qrs_lhs::Vector{<:QuadratureRule})
M = Symmetric(allocate_matrix(dh, ch))

Or alternatively allocate_matrix outside and make mutating to avoid having to pass the constraint handler to this function?

Comment on lines +315 to +320
for (i, col) in enumerate(eachcol(f))
apply!(col, ch)
u = proj.M_cholesky \ col
apply!(u, ch)
projected_vals[:, i] = u
end
Member

Maybe microoptimization, but we could avoid allocating twice by (probably requires using LinearAlgebra: ldiv!)

Suggested change
for (i, col) in enumerate(eachcol(f))
apply!(col, ch)
u = proj.M_cholesky \ col
apply!(u, ch)
projected_vals[:, i] = u
end
for (f_i, u_i) in zip(eachcol(f), eachcol(projected_vals))
apply!(f_i, ch)
ldiv!(u_i, proj.M_cholesky, f_i)
apply!(u_i, ch)
end

Member

@KnutAM KnutAM left a comment

Cool to see this PR alive again 🚀

@koehlerson
Member Author

I think a roadmap would be nice before merging; there are multiple things that can be addressed:

  • the very vanilla ZZ estimator from the example could (I think) be abstracted by a struct where a callback function is dispatched as the interface, with varargs, in order to evaluate the fluxes from the solution's gradient for arbitrary flux/stress computations
  • the Dörfler marking could be abstracted with an interplay with the ZZ estimator, or simply driven by an error array
  • balanceforest needs a refactoring, because I implemented it very crudely from the pseudo-algorithm of the paper
  • currently it's restricted to linear interpolations; I cannot estimate how complicated it would be to extend it to arbitrary orders of function/geometric interpolation
  • the iterator approach, using binary search to visit volumes, facets, edges and nodes uniquely and thereby obtain the hanging-node information, could be started (but I believe this is a huge work package, even though Claude Code could be very helpful after writing a starting point of the implementation that uses the already implemented building blocks)
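Since Dörfler marking only needs an error array, the abstraction could be as small as a single function. A minimal sketch (illustrative names, not an existing Ferrite API):

```julia
# Dörfler (bulk-chasing) marking: mark the smallest set of cells whose
# accumulated error accounts for a fraction θ of the total error.
function dorfler_mark(error_arr::Vector{Float64}, θ::Float64 = 0.5)
    perm = sortperm(error_arr; rev = true)   # largest errors first (stable)
    total = sum(error_arr)
    marked = Int[]
    acc = 0.0
    for cellid in perm
        acc += error_arr[cellid]
        push!(marked, cellid)
        acc >= θ * total && break            # θ of the total error covered
    end
    return marked
end
```

Because it is estimator-agnostic, the same function would work with the ZZ estimator or with the simple analytical-error threshold used in the heat example.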

What do you think should be addressed beside the review points you guys made already?

@termi-official
Member

termi-official commented Feb 13, 2026

What do you think should be addressed beside the review points you guys made already?

  • In the original ZZ paper they also suggest lumping the mass matrix for the projection. This should give a nice performance boost. Furthermore, our error is not normalized yet (normalization is also described in the paper), so we might want to address this, too.
  • We can also keep the threshold-based refinement. At least for my problems this worked better than Dörfler marking.
  • Another thing is that we want to check that the estimators really converge towards the analytical solution (beyond the eyeball norm).
  • https://arxiv.org/abs/1907.13078 Claude when
  • Even if not everything is perfect yet, some benchmarks would be good, so we at least know where we are.
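The lumping mentioned in the first point can be sketched as plain row-sum lumping: replace the consistent L2 mass matrix by the diagonal of its row sums, so the projection solve degenerates to an elementwise division. A hedged sketch (how it would plug into the L2Projector is an open design question):

```julia
using LinearAlgebra

# Row-sum mass lumping: M_lumped = diag(sum of each row of M).
# Solving M_lumped * u = f is then just u = f ./ rowsums.
lump_mass(M::AbstractMatrix) = Diagonal(vec(sum(M; dims = 2)))
```

For a partition-of-unity basis the row sums are strictly positive, so the lumped solve is well defined and trivially parallel.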

currently it's restricted to linear interpolations; I cannot estimate how complicated it would be to extend it to arbitrary orders of function/geometric interpolation

I have also already thought about whether the adaptive dof handler should have another field for the conformity constraints. This would also pave the way for p-adaptivity.

@koehlerson
Member Author

AMR Performance Analysis and Optimization Plan

Benchmark Results Summary

Tested on Julia 1.11.4, 2026-02-11. Uniformly refined grids with 25% additional adaptive refinement.

Time Breakdown (Largest Cases)

2D (16x16 base, 3 levels, 28672 cells):

Phase                 Time         %
Forest creation       0.12 ms      0.1%
Uniform refinement    3.84 ms      1.9%
Adaptive refinement   4.27 ms      2.2%
Balance (per-tree)    48.49 ms     24.5%
Balance (forest)      75.23 ms     38.0%
creategrid            66.24 ms     33.4%
TOTAL                 198.19 ms

3D (8x8x8 base, 2 levels, 90112 cells):

Phase                 Time         %
Forest creation       0.75 ms      0.0%
Uniform refinement    5.56 ms      0.1%
Adaptive refinement   13.84 ms     0.3%
Balance (per-tree)    154.09 ms    2.9%
Balance (forest)      742.65 ms    14.1%
creategrid            4365 ms      82.6%
TOTAL                 5282 ms

creategrid Phase Breakdown (3D, 11264 cells)

Phase                           Time        Notes
Phase 1: Intra-octree nodes     13.6 ms     90k node entries -> 17.6k unique keys
Phase 2: Inter-octree merge     29.1 ms     Dict-heavy, many lookups
Phase 3: Deduplication          7.9 ms
Phase 4: Coord transform        11.6 ms     transform_pointBWG per node
Phase 5: Hanging nodes          533.7 ms    89% of creategrid! Only 800 constraints
Phase 6: Facetset reconstruct   2.8 ms

balancetree Internal Breakdown (3D, 512 leaves/tree)

Operation            Time        %        Notes
Neighbors+siblings   548.6 us    68.9%    possibleneighbors + siblings
Parent checks        131.0 us    16.4%    p not in parent.(T, b) -- O(n^2)
Final sort           70.8 us     8.9%
Sorting Q            14.3 us     1.8%
Filter/unique/W      22.3 us     2.8%
Linearise            9.6 us      1.2%

Identified Bottlenecks (Priority Order)

1. hangingnodes() -- THE dominant bottleneck (89% of creategrid in 3D)

Root cause: O(n^2) or worse algorithmic complexity.

For every leaf (11264), for every vertex (8 in 3D), for every parent face (6), the code:

  • Calls iscenter(c, pface) -- coordinate comparison, fast
  • Calls facet_neighbor(parent_, pface_i, tree.b) -- fine
  • Does findfirst(x -> x == neighbor_candidate, tree.leaves) -- O(n) linear scan on the leaves vector!
  • For inter-octree: does another findfirst on the neighbor tree's leaves

This means for each hanging node candidate: O(leaves_per_tree) scan. With ~180 leaves/tree and 8x6=48 vertex-face pairs checked per leaf, that's millions of comparisons.

Additionally in 3D: edge hanging node detection does nested loops with findfirst on edges, adding further O(n) scans.
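Because the leaves are kept sorted in Morton order, each of these `findfirst` scans can be replaced by a binary search. A minimal sketch with integer stand-ins for octants (the real code would compare OctantBWG values via their Morton `isless`):

```julia
# O(log n) leaf lookup on a Morton-sorted leaf vector, replacing
# findfirst(x -> x == candidate, tree.leaves), which is O(n).
function leafindex(leaves::AbstractVector, candidate)
    i = searchsortedfirst(leaves, candidate)   # binary search
    return (i <= length(leaves) && leaves[i] == candidate) ? i : nothing
end
```

With ~180 leaves per tree this alone would shave roughly two orders of magnitude off each candidate lookup.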

2. balanceforest!() -- Convergence loop with repeated balancetree

Root cause: The while loop while nrefcells - getncells(forest) != 0 rebuilds the entire balanced tree every iteration, even when only a few inter-octree refinements happened.

Inside the loop:

  • balancetree(tree) is called for ALL trees every iteration (even unchanged ones)
  • possibleneighbors is called for ALL leaves every iteration
  • in checks in balance_face/balance_corner/balance_edge use linear scan on tree.leaves:
    s' not in neighbor_tree.leaves  # O(n) linear scan!
    parent(s', b) not in neighbor_tree.leaves  # O(n) linear scan!
    parent(parent(s', b), b) in neighbor_tree.leaves  # O(n) linear scan!

3. balancetree() -- O(n^2) parent check

Root cause: Line 1102:

if p ∉ parent.(T, (tree.b,))

This broadcasts parent() over the entire T array for EVERY element in Q. Since |Q| and |T| grow proportionally, this is O(n^2) in the number of leaves at each level.
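The standard fix is to materialize the parents into a `Set` once per level, turning each membership test into O(1) expected time. A sketch with an integer stand-in for `parent(octant, b)`:

```julia
# Stand-in for parent(octant, b); for integer Morton keys at a fixed level,
# halving plays the role of dropping one refinement level.
parent_of(x) = x ÷ 2

# Replaces `p ∉ parent.(T, (tree.b,))` inside the loop over Q:
# build the parent set once (O(|T|)), then each check is O(1) expected,
# so the whole pass is O(|Q| + |T|) instead of O(|Q| * |T|).
function not_in_parents(Q, T)
    parents = Set(parent_of(t) for t in T)
    return [p for p in Q if p ∉ parents]
end
```

This requires `OctantBWG` to be hashable consistently with `==`, which a Morton-key-based `hash` gives for free.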

4. creategrid Phase 2 -- Inter-octree node merging

Root cause: For every leaf on a tree boundary, iterates over all leaves to find face matches. The contains_facet check inside nested loops creates O(trees x leaves_per_tree x faces) work.

5. Dict overhead in creategrid

Uses Dict{Tuple{Int, NTuple{dim, Int32}}, Int} as the primary data structure. Dict operations on tuple keys have non-trivial hash costs. With 90k entries, this adds up.


The IBWG2015 Topology Iterator Approach

Background

The current creategrid function (BWG.jl:598) carries a TODO comment:

#TODO: this function should wrap the LNodes Iterator of [IBWG2015](@citet)

The paper "Recursive Algorithms for Distributed Forests of Octrees" (Isaac, Burstedde, Wilcox, Ghattas, SIAM J. Sci. Comput. 37(5), 2015) proposes a fundamentally different approach to forest traversal that would replace the current ad-hoc multi-phase pipeline with a single, unified recursive descent over the forest.

What the paper proposes

The paper introduces three key algorithms:

  1. Parallel multi-item search (Algorithm 3.1, using split_array from Algorithm 3.3): A recursive top-down search that begins at the root of each octree and descends through children, using split_array to efficiently partition the sorted leaf array at each level. Instead of scanning leaves linearly, it recursively narrows down to the relevant sub-array in O(log n) steps.

  2. Ghost layer construction (Section 4): Constructs the layer of remote octants needed for inter-process communication. Not needed for serial, but the recursive descent pattern is reusable.

  3. Topology iterator (p4est_iterate, Section 5): A universal iterator that visits every volume (cell), face, edge, and corner in the forest exactly once, executing user-supplied callbacks at each. The key insight: it simultaneously descends through multiple octrees that share a topological feature (face/edge/corner), using split_array on each, and determines hanging status naturally from the level mismatch during descent.
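The split_array step in the multi-item search can be sketched as follows: given a Morton-sorted window of leaves and the first descendant of each child of the current octant, binary-search for each child's starting position and hand each child its sub-range. Integer Morton keys stand in for octants here; the real BWG.jl:367 implementation operates on OctantBWG values.

```julia
# Partition leaves[lo:hi] (sorted by Morton key) among N children, given
# each child's first-descendant key. Each child c receives the contiguous
# range of leaves that lie inside its subtree; empty ranges are allowed.
function split_ranges(leaves::AbstractVector, lo::Int, hi::Int,
                      child_first_descendants::NTuple{N, T}) where {N, T}
    bounds = ntuple(N) do c
        searchsortedfirst(view(leaves, lo:hi), child_first_descendants[c]) + lo - 1
    end
    return ntuple(c -> bounds[c]:(c < N ? bounds[c + 1] - 1 : hi), N)
end
```

Each level of the descent thus costs O(N log n) for the splits, which amortizes to O(1) per leaf over the whole recursion.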

How the topology iterator works

The iterator performs a coordinated recursive descent through all octrees simultaneously:

iterate(forest):
  for each tree k:
    iter_volume(root, tree_k.leaves)           # descend into volumes
  for each inter-tree face (k, k'):
    iter_face(root_k, root_k', leaves_k, leaves_k')  # descend both trees
  for each inter-tree edge (k, k', ...):
    iter_edge(roots, leaves_arrays)
  for each inter-tree corner (k, k', ...):
    iter_corner(roots, leaves_arrays)

iter_volume(octant, leaf_subarray):
  if |leaf_subarray| == 1 and leaf_subarray[0] == octant:
    CALL volume_callback(octant)               # leaf reached
    return
  children = split_array(leaf_subarray, octant)  # O(log n) split
  for each child c:
    iter_volume(c, children[c])
  for each intra-tree face between children:
    iter_face(child_i, child_j, children[i], children[j])
  for each intra-tree edge between children:
    iter_edge(...)
  for each intra-tree corner between children:
    iter_corner(...)

iter_face(octant_left, octant_right, leaves_L, leaves_R):
  if both sides are single leaves:
    CALL face_callback(left, right, is_hanging=false)
    return
  if one side is a leaf and other isn't:
    CALL face_callback(leaf, children, is_hanging=true)
    # the hanging status falls out naturally from the level mismatch!
    return
  # both sides need further refinement:
  split both sides, recurse into matching children pairs

The critical properties:

  • O(n) total work: Each leaf is visited O(1) times per topological feature.
  • split_array provides O(log n) descent per level, amortized O(1) per leaf.
  • Hanging nodes are detected for free during the descent: when one side of a face is a leaf and the other side has children, the face is hanging. No separate hangingnodes() pass needed.
  • Node numbering happens in callbacks: The volume callback assigns interior nodes, the face callback assigns face nodes (and detects hanging), the edge callback assigns edge nodes, the corner callback assigns corner nodes. This is exactly LNodes.
  • Inter-octree boundaries are handled by the top-level face/edge/corner iterations that coordinate descent across multiple trees with orientation transforms.

What's already implemented from IBWG2015 in BWG.jl

The current code has several building blocks from the paper already in place:

Component                         Status        Location            Notes
OctantBWG ordering (Alg 2.1)      Done          BWG.jl:107          Morton-based isless
split_array (Alg 3.3)             Done          BWG.jl:367          Binary search partitioning of leaf arrays
search (Alg 3.1)                  Partial       BWG.jl:390          Framework exists, but the match callback is a stub (has println debug, always returns false)
find_range_boundaries (Alg 4.2)   Done          BWG.jl:261          Recursive boundary identification, but no tests
boundaryset (Fig 4.1)             Done          BWG.jl:214-253      2D and 3D boundary tables
isrelevant (Alg 5.1)              Stub          BWG.jl:296          Always returns true (serial-only placeholder)
ancestor_id (Alg 3.2)             Done          BWG.jl:1289         Generalized child_id for arbitrary levels
descendants                       Done          BWG.jl:1317         First/last descendant computation
Inter-tree transforms             Done          BWG.jl:1508-1734    transform_facet, transform_edge, transform_corner
Topology iterator                 Not started   --                  The core iter_volume/iter_face/iter_edge/iter_corner recursive descent
LNodes                            Not started   --                  Node numbering via iterator callbacks

Why the iterator approach is the right path forward

The current implementation has a fundamental architectural problem: it uses flat iteration over all leaves for every operation (node assignment, inter-octree merging, hanging node detection, facetset reconstruction). This leads to:

  1. O(n) linear scans where O(log n) descent would suffice (the findfirst calls in hangingnodes)
  2. Redundant work: the same topological relationships are rediscovered independently in each phase of creategrid
  3. Complex inter-octree logic duplicated across creategrid Phase 2, hangingnodes, balanceforest!, and reconstruct_facetsets -- each has its own face/edge/corner neighbor iteration logic
  4. No reuse of the sorted structure: leaves are sorted in Morton order, but this is barely exploited (only in split_array, which is implemented but unused in the hot paths)

The IBWG2015 topology iterator would solve all of these simultaneously:

Current Problem                              Iterator Solution
hangingnodes O(n^2) scans                    Hanging status detected for free during face/edge descent
creategrid 5 separate phases                 Single pass: callbacks assign nodes + detect hanging + build cells
Duplicated inter-octree transform logic      Inter-tree face/edge/corner descent handles transforms once
balanceforest! repeated full rebalance       Could use iterator for targeted neighbor checking
reconstruct_facetsets separate pass          Face callback naturally identifies boundary faces
findfirst on unsorted leaf arrays            split_array gives O(log n) access to any sub-range

Argument: Iterator vs. quick fixes

There are two paths forward:

Path A: Quick fixes (Phases 1-3 from the optimization plan above)

  • Replace findfirst with searchsortedfirst in ~10 locations
  • Add Set for parent checks in balancetree
  • Effort: ~1-2 days. Speedup: ~20-50x on the worst cases.
  • Limitation: Still has the architectural problem of multiple passes, duplicated logic, and O(n log n) instead of O(n) for the overall pipeline.

Path B: Implement the IBWG2015 topology iterator

  • Build forest_iterate(forest, volume_cb, face_cb, edge_cb, corner_cb)
  • Rewrite creategrid as a set of callbacks (LNodes pattern)
  • Effort: ~2-4 weeks. Speedup: Asymptotically optimal O(n), plus cleaner architecture.
  • Benefit: The iterator becomes the single traversal primitive for everything -- grid creation, hanging node detection, error estimation, solution transfer, load balancing, visualization. Every future feature benefits.

Recommendation: Do both, in order. Apply the quick fixes (Path A) first to get immediate relief -- the searchsortedfirst changes are mechanical and safe. Then implement the iterator (Path B) as the long-term solution. The quick fixes will also serve as a correctness reference: you can verify the iterator produces identical results.


Detailed Plan for the Topology Iterator Implementation

Step 1: Core forest_iterate skeleton

Implement the recursive descent framework:

struct ForestIterateCallbacks{V,F,E,C}
    volume::V   # (info::VolumeInfo) -> nothing
    face::F     # (info::FaceInfo) -> nothing
    edge::E     # (info::EdgeInfo) -> nothing  (3D only)
    corner::C   # (info::CornerInfo) -> nothing
end

struct VolumeInfo{dim,N,T}
    treeid::Int
    quadrant::OctantBWG{dim,N,T}
    leafid::Int  # index into flattened leaf array
end

struct FaceSide{dim,N,T}
    treeid::Int
    face_local::Int
    is_hanging::Bool
    # if !is_hanging: single octant
    # if is_hanging: 2^(dim-1) smaller octants on the hanging side
    quadrants::Union{
        Tuple{OctantBWG{dim,N,T}},           # full (non-hanging)
        NTuple{M,OctantBWG{dim,N,T}} where M # hanging (2 in 2D, 4 in 3D)
    }
    leafids::Vector{Int}
end

struct FaceInfo{dim,N,T}
    tree_boundary::Bool
    orientation::Int
    sides::Tuple{FaceSide{dim,N,T}, FaceSide{dim,N,T}}
end

# Similarly for EdgeInfo (3D) and CornerInfo

function forest_iterate(forest::ForestBWG{dim}, callbacks) where {dim}
    # Phase 1: Volume + intra-tree interfaces
    leaf_offset = 0
    for (k, tree) in enumerate(forest.cells)
        _iter_volume(root(dim), tree.leaves, 1, length(tree.leaves),
                     k, tree.b, leaf_offset, callbacks)
        leaf_offset += length(tree.leaves)
    end

    # Phase 2: Inter-tree faces
    facet_neighborhood = get_facet_facet_neighborhood(forest)
    for k in 1:length(forest.cells)
        for f in 1:(2*dim)
            # ... setup face iteration between trees k and k'
            # ... call _iter_face with leaves from both trees
        end
    end

    # Phase 3: Inter-tree edges (3D only)
    # Phase 4: Inter-tree corners
end

Step 2: Recursive volume descent with split_array

function _iter_volume(octant::OctantBWG{dim}, leaves, lo, hi, treeid, b,
                      leaf_offset, callbacks) where {dim}
    n = hi - lo + 1
    if n == 0
        return
    end
    if n == 1 && leaves[lo] == octant
        # Leaf reached -- call volume callback
        callbacks.volume(VolumeInfo(treeid, octant, leaf_offset + lo))
        return
    end
    # Split leaves among children using split_array
    children_ranges = _split_ranges(leaves, lo, hi, octant, b)
    child_octants = children(octant, b)

    for (ci, (clo, chi)) in enumerate(children_ranges)
        _iter_volume(child_octants[ci], leaves, clo, chi, treeid, b, leaf_offset, callbacks)
    end

    # Intra-octant face interfaces between children
    # (dim is in scope via the OctantBWG{dim} dispatch above)
    for (fi, (ci, cj)) in enumerate(CHILD_FACE_PAIRS[dim])
        _iter_face(child_octants[ci], child_octants[cj],
                   leaves, children_ranges[ci], children_ranges[cj],
                   treeid, treeid, b, b, fi, false, 0, callbacks)
    end

    # Intra-octant edges (3D) and corners between children
    # ...
end

Step 3: Face descent with hanging detection

# Intra-tree variant: both sides index into the same sorted `leaves` array.
# The inter-tree variant takes a separate leaf array per tree.
function _iter_face(oct_L, oct_R, leaves, (lo_L, hi_L), (lo_R, hi_R),
                    tree_L, tree_R, b_L, b_R, face_idx, tree_boundary, orientation,
                    callbacks)
    n_L = hi_L - lo_L + 1
    n_R = hi_R - lo_R + 1

    is_leaf_L = (n_L == 1 && leaves[lo_L] == oct_L)  # left side is a single leaf
    is_leaf_R = (n_R == 1 && leaves[lo_R] == oct_R)  # right side is a single leaf

    if is_leaf_L && is_leaf_R
        # Conforming face -- both sides are leaves at the same level
        callbacks.face(FaceInfo(tree_boundary, orientation,
            (FaceSide(tree_L, ..., false, ...), FaceSide(tree_R, ..., false, ...))))
        return
    end

    if is_leaf_L && !is_leaf_R
        # HANGING face: left is coarser, right has 2^(dim-1) children on the face.
        # The children on the right that touch the face are exactly the hanging nodes!
        callbacks.face(FaceInfo(tree_boundary, orientation,
            (FaceSide(tree_L, ..., false, ...), FaceSide(tree_R, ..., true, ...))))
        return
    end

    # ... symmetric case for is_leaf_R ...

    # Neither side is a leaf -- descend both sides:
    # split each side's leaf range and recurse into matching child pairs
end

Step 4: LNodes as iterator callbacks

mutable struct LNodesState{dim}
    next_nodeid::Int
    cell_nodes::Vector{Vector{Int}}        # node ids per cell
    hanging_nodes::Dict{Int,Vector{Int}}   # constrained -> constrainers
    node_coordinates::Vector{Vec{dim,Float64}}
end

function lnodes_volume_callback(info::VolumeInfo, state::LNodesState)
    # Assign interior node ids for this leaf
    # (For linear elements, vertices are assigned here)
end

function lnodes_face_callback(info::FaceInfo, state::LNodesState)
    # If conforming: merge shared face nodes between the two sides
    # If hanging: record hanging node constraints
    #   The hanging nodes on the finer side are constrained by
    #   the face nodes on the coarser side
end

function lnodes_edge_callback(info::EdgeInfo, state::LNodesState)
    # Same pattern for edges
end

function lnodes_corner_callback(info::CornerInfo, state::LNodesState)
    # Same pattern for corners
end
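For linear elements, the constraint the face callback has to record is simple: a hanging node takes the average of the coarse-side nodes it hangs on. A minimal sketch, assuming the `LNodesState` fields above (`record_hanging!` is an illustrative helper, not existing API):

```julia
# Record that node `hanging_id` is constrained by `constrainers`
# (2 nodes for an edge midpoint in 2D, 4 for a face center in 3D).
# Equal weights are implied by linear geometry and are resolved later,
# when the constraints are materialized for the solver.
function record_hanging!(state::LNodesState, hanging_id::Int, constrainers::Vector{Int})
    state.hanging_nodes[hanging_id] = constrainers
    return state
end
```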

Step 5: Rewrite creategrid using the iterator

function creategrid(forest::ForestBWG{dim}) where {dim}
    state = LNodesState(...)
    forest_iterate(forest, ForestIterateCallbacks(
        (info) -> lnodes_volume_callback(info, state),
        (info) -> lnodes_face_callback(info, state),
        (info) -> lnodes_edge_callback(info, state),
        (info) -> lnodes_corner_callback(info, state),
    ))
    # Convert state into NonConformingGrid
    return NonConformingGrid(state.cells, state.nodes, ...,
                             conformity_info = state.hanging_nodes)
end
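For orientation, a hypothetical end-to-end use once the iterator lands (the `ForestBWG` constructor and the exact `refine!` signature are assumptions and may differ):

```julia
base  = generate_grid(Quadrilateral, (2, 2))
forest = ForestBWG(base)       # one octree per coarse cell
refine!(forest, 1)             # refine a leaf (illustrative call)
balanceforest!(forest)         # restore 2:1 balance
grid  = creategrid(forest)     # single O(n) traversal via forest_iterate
```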

Summary of implementation status and what to build

ALREADY IMPLEMENTED (building blocks):
  [x] OctantBWG with Morton ordering
  [x] split_array (Algorithm 3.3) -- efficient O(log n) leaf partitioning
  [x] search framework (Algorithm 3.1) -- but match callback is a stub
  [x] find_range_boundaries (Algorithm 4.2) -- untested
  [x] boundaryset (Fig 4.1) -- determines which faces/edges/corners an octant touches
  [x] ancestor_id (Algorithm 3.2)
  [x] descendants, children, parent, siblings
  [x] Inter-tree transforms (transform_facet, transform_edge, transform_corner)
  [x] Orientation computation (compute_face_orientation, compute_edge_orientation)

TO BE IMPLEMENTED:
  [ ] forest_iterate -- top-level orchestration (intra-tree + inter-tree)
  [ ] _iter_volume -- recursive volume descent using split_array
  [ ] _iter_face -- recursive face descent with hanging detection
  [ ] _iter_edge -- recursive edge descent (3D)
  [ ] _iter_corner -- corner callback dispatch
  [ ] Info structs (VolumeInfo, FaceInfo, EdgeInfo, CornerInfo)
  [ ] CHILD_FACE_PAIRS / CHILD_EDGE_PAIRS tables (which children share a face/edge)
  [ ] LNodes callbacks (volume, face, edge, corner)
  [ ] Rewritten creategrid using the iterator
  [ ] Tests comparing old creategrid output with new iterator-based output

Quick-Fix Optimization Plan (Do First)

These are mechanical fixes that give immediate speedup while the iterator is being developed.

Phase 1: Fix hangingnodes() -- Expected 10-50x speedup on creategrid

Strategy: Replace findfirst(x -> x == candidate, tree.leaves) with O(log n) binary search.

Since tree.leaves is maintained in Morton order (sorted), we can use searchsortedfirst:

# BEFORE (O(n)):
neighbor_candidate_idx = findfirst(x -> x == neighbor_candidate, tree.leaves)

# AFTER (O(log n)):
idx = searchsortedfirst(tree.leaves, neighbor_candidate)
neighbor_candidate_idx = (idx <= length(tree.leaves) && tree.leaves[idx] == neighbor_candidate) ? idx : nothing

Phase 2: Fix balance_face/corner/edge -- Expected 5-10x speedup on balanceforest

Strategy: Replace in tree.leaves linear scans with binary search.

function leaf_exists(tree, octant)
    idx = searchsortedfirst(tree.leaves, octant)
    return idx <= length(tree.leaves) && tree.leaves[idx] == octant
end

Phase 3: Fix balancetree() parent check -- Expected 2-5x speedup on balancetree

Strategy: Replace the O(n^2) membership test `p ∉ parent.(T, b)` with a Set:

T = eltype(Q)[]                  # deduplicated output
parent_set = Set{eltype(Q)}()
for x in Q
    p = parent(x, tree.b)
    if p ∉ parent_set            # O(1) lookup instead of an O(n) broadcast scan
        push!(T, x)
        push!(parent_set, p)
    end
end

Phase 4: Dirty tree tracking in balanceforest!()

Track which trees were modified and only re-balance those.
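A minimal sketch of what this could look like (`tree_neighbors` and the `changed` return value of `balancetree_marked!` are assumptions, not existing API):

```julia
function balanceforest_dirty!(forest, dirty::Set{Int})
    while !isempty(dirty)
        k = pop!(dirty)
        changed = balancetree_marked!(forest, k)   # hypothetical: true if tree k was modified
        if changed
            # a local refinement can propagate unbalance across tree boundaries
            for k′ in tree_neighbors(forest, k)    # hypothetical facet-neighbor query
                push!(dirty, k′)
            end
        end
    end
    return forest
end
```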


Expected Impact

Optimization                               Target           Expected speedup
-----------------------------------------  ---------------  ---------------------------
Quick fix: binary search in hangingnodes   creategrid (3D)  10-50x
Quick fix: binary search in balance_*      balanceforest    5-10x
Quick fix: Set-based parent check          balancetree      2-5x
Quick fix: dirty tree tracking             balanceforest    2-3x
Iterator (long-term)                       everything       asymptotically optimal O(n)

Quick fixes estimated overall speedup for 3D: 20-100x (dominated by hangingnodes fix).

Iterator: replaces the entire multi-pass architecture with a single O(n) traversal, eliminates all linear scans, and provides a reusable primitive for future features (error estimation, solution transfer, load balancing, parallel ghost layer, etc.).


How to Run the Benchmark

julia --project --startup-file=no -e '
    using Dates
    include("benchmark/benchmarks-amr.jl")
    run_all_benchmarks()
'

For quick profiling of a specific component:

using Ferrite, Ferrite.AMR
include("benchmark/benchmarks-amr.jl")
forest = create_refined_forest(Val(3), 4, 2; max_level=8)
profile_creategrid(forest)
profile_balancetree(forest)

References

  • [IBWG2015] Isaac, Burstedde, Wilcox, Ghattas. "Recursive Algorithms for Distributed Forests of Octrees." SIAM J. Sci. Comput. 37(5), C497-C531, 2015.
  • [BWG2011] Burstedde, Wilcox, Ghattas. "p4est: Scalable Algorithms for Parallel Adaptive Mesh Refinement on Forests of Octrees." SIAM J. Sci. Comput. 33(3), 2011.
  • [SSB2008] Sundar, Sampath, Biros. "Bottom-up construction and 2:1 balance refinement of linear octrees in parallel." 2008.

@KnutAM
Member

KnutAM commented Feb 18, 2026

AMR Performance Analysis and Optimization Plan

Very cool and detailed analysis! 🤖

I think a roadmap would be nice before merging.

Regarding the roadmap, to me it seems like a key point is having the interface for mesh refinement in place and testing the p4est refinement. Personally, considering the magnitude of this PR, I think the key focus should be adding sufficient tests to check for correctness, including all the corner cases that I understand exist.

And before spending too much time on optimizing the performance (as long as it is usable for testing purposes and refining doesn't take more than, say, 20x the linear solve), I think it could make sense to merge it and document it as experimental, to allow gaining some experience with whether the interface works well.

- `conformity_info::CIT`: a container for conformity information
- `boundary_matrix::SparseMatrixCSC{Bool,Int}`: optional, only needed by `onboundary` to check if a cell is on the boundary, see, e.g. Helmholtz example
"""
mutable struct NonConformingGrid{dim,C<:Ferrite.AbstractCell,T<:Real,CIT} <: Ferrite.AbstractGrid{dim}
Member


As I understand this, compared to the current Grid, this allows

  1. Storing the conformity_info
  2. Dispatching when adding a ConformityConstraint() to give an error for a plain Grid

But if we want this to work more generally (e.g. with different interpolations on different parts of the domain), shouldn't the conformity information belong to the dofhandler instead? We could then have a field in the grid denoting hanging_nodes, with the required information to create the conformity information when adding fields to the dofhandler. Having just hanging_nodes as an additional field seems quite clear to me and avoids an extra grid datastructure, and then the general case is supported in the DofHandler even when the grid is conforming but the added interpolations don't conform.

Just some cases in my head as a reference when thinking:
a) Tie constraint between 3 bilinear elements (possible with `NonConformingGrid`)
o ---- o ---- o
|      |      |
|      o ---- o
|      |      |
o ---- o ---- o

b) Tie constraint between 2 different order elements (not possible with `NonConformingGrid`)
o ---- o -- o -- o
|      |         |
|      o    o    o
|      |         |
o ---- o -- o -- o

c) Tie constraints between 3 different order elements (case 1)
o ---- o -- o -- o
|      |         |
o ---- o    o    o
|      |         |
o ---- o -- o -- o
# Actually invalid - would be nonconforming even if the center (on shared edge) nodes are tied

d) Tie constraints between 3 different order elements (case 2) (possible with `NonConformingGrid`?)
o -- o -- o ------------- o
|         |               |
o    o    o               |
|         |               |
o -- o -- o               |
|         |               |
o    o    o               |
|         |               |
o -- o -- o ------------- o
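To illustrate the split being proposed -- geometric hanging-node info on the grid, constraints built in the dof layer -- a hedged sketch (`node_to_dof` and the `hanging_nodes` field are assumptions; `AffineConstraint` is existing Ferrite API, and the equal weights only cover the linear vertex case):

```julia
# grid.hanging_nodes :: Dict{Int,Vector{Int}}  -- constrained node => constraining nodes
function conformity_constraints(dh, hanging_nodes::Dict{Int,Vector{Int}})
    return [AffineConstraint(
                node_to_dof(dh, n),                                  # constrained dof (hypothetical lookup)
                [(node_to_dof(dh, m), 1 / length(ms)) for m in ms],  # equal weights (linear case)
                0.0)                                                 # zero inhomogeneity
            for (n, ms) in hanging_nodes]
end
```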

@KnutAM
Member

KnutAM commented Feb 18, 2026

And another comment: to make this PR a bit more manageable, maybe the ability to deal with nonconformity could be extracted out and done first? That seems like it would be a usable component on its own. The same goes for any other parts that are useful on their own; that will make reviewing easier down the line IMO.

@koehlerson
Member Author

Very cool and detailed analysis! 🤖

Claude Code is quite helpful for these code monkey tasks. 🤠

I personally want to get the iterators working before merging because they are crucial for performance. By construction you visit each entity once (by exploiting the 2:1 balance and other guaranteed invariants) and get the (geometric) hanging nodes on the fly. I already understand a big part of the coarse idea, but I probably need @termi-official's help to understand the last bits and pieces before writing a first sketch. As noted in the Claude report, a large chunk of the base functionality is already implemented and works as expected. I also think @termi-official should comment a bit on the code design regarding the nonconformity information; from my gut feeling I agree that it should be part of the DofHandler (or probably the AdaptiveDofHandler).
