diff --git a/scripts/install/common/dependency_versions.sh b/scripts/install/common/dependency_versions.sh
index 86f22fe..94b5180 100644
--- a/scripts/install/common/dependency_versions.sh
+++ b/scripts/install/common/dependency_versions.sh
@@ -2,25 +2,25 @@
 # Central version control for all dependencies
 
 # Portability libraries
-export CAMP_VER="v2025.09.2"
-export RAJA_VER="v2025.09.1"
+export CAMP_VER="v2025.12.0"
+export RAJA_VER="v2025.12.2"
 #export UMPIRE_VER="v2025.09.0"
 # For now we need something a little pass the v2025.09.0 release
 # for Umpire as we need a small bug fix for any build with Umpire
-export UMPIRE_VER="54a1909e91ce9604328977974e9b1002bf9f8781"
-export CHAI_VER="v2025.09.1"
+export UMPIRE_VER="v2025.12.0"
+export CHAI_VER="v2025.12.0"
 
 # Material models
 export EXACMECH_REPO="https://github.com/LLNL/ExaCMech.git"
 export EXACMECH_BRANCH="develop"
 
 # FEM infrastructure
-export HYPRE_VER="v2.32.0"
+export HYPRE_VER="v3.1.0"
 export METIS_VER="5.1.0"
 export METIS_URL="https://mfem.github.io/tpls/metis-${METIS_VER}.tar.gz"
 
 export MFEM_REPO="https://github.com/rcarson3/mfem.git"
-export MFEM_BRANCH="exaconstit-dev"
+export MFEM_BRANCH="exaconstit-latest"
 
 # Main application
 export EXACONSTIT_REPO="https://github.com/llnl/ExaConstit.git"
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 24e830a..af8cd1a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -21,6 +21,7 @@ set(EXACONSTIT_HEADERS
     postprocessing/mechanics_lightup.hpp
     sim_state/simulation_state.hpp
     solvers/mechanics_solver.hpp
+    solvers/trust_region_solver.hpp
     utilities/dynamic_function_loader.hpp
     utilities/mechanics_kernels.hpp
     utilities/mechanics_log.hpp
@@ -59,6 +60,7 @@ set(EXACONSTIT_SOURCES
     postprocessing/mechanics_lightup.cpp
     sim_state/simulation_state.cpp
     solvers/mechanics_solver.cpp
+    solvers/trust_region_solver.cpp
     utilities/mechanics_kernels.cpp
     utilities/unified_logger.cpp
     )
diff --git a/src/boundary_conditions/BCData.cpp b/src/boundary_conditions/BCData.cpp
index 3714bc1..334e650 100644
--- a/src/boundary_conditions/BCData.cpp
+++ b/src/boundary_conditions/BCData.cpp
@@ -64,7 +64,7 @@ void BCData::SetScales() {
     }
 }
 
-void BCData::GetComponents(int id, mfem::Array<bool>& component) {
+void BCData::GetComponents(int id, std::array<bool, 3>& component) {
     switch (id) {
     case 0:
         component[0] = false;
diff --git a/src/boundary_conditions/BCData.hpp b/src/boundary_conditions/BCData.hpp
index 075e46b..184cb5e 100644
--- a/src/boundary_conditions/BCData.hpp
+++ b/src/boundary_conditions/BCData.hpp
@@ -5,6 +5,7 @@
 #include "mfem.hpp"
 #include "mfem/linalg/vector.hpp"
 
+#include <array>
 #include <fstream>
 
 /**
@@ -101,6 +102,6 @@ class BCData {
      * - id = 6: (true, false, true)
      * - id = 7: (true, true, true)
      */
-    static void GetComponents(int id, mfem::Array<bool>& component);
+    static void GetComponents(int id, std::array<bool, 3>& component);
 };
 #endif
diff --git a/src/boundary_conditions/BCManager.cpp b/src/boundary_conditions/BCManager.cpp
index 5f0e7db..312a685 100644
--- a/src/boundary_conditions/BCManager.cpp
+++ b/src/boundary_conditions/BCManager.cpp
@@ -13,14 +13,12 @@ void BCManager::UpdateBCData(std::unordered_map<std::string, mfem::Array<int>>&
     ess_bdr["total"] = 0;
     scale = 0.0;
 
-    auto ess_comp = map_ess_comp["total"].find(step)->second;
-    auto ess_id = map_ess_id["total"].find(step)->second;
+    const auto& ess_comp = map_ess_comp["total"].find(step)->second;
+    const auto& ess_id = map_ess_id["total"].find(step)->second;
 
-    mfem::Array<bool> cmp_row;
-    cmp_row.SetSize(3);
+    std::array<bool, 3> cmp_row;
 
     component["total"] = false;
-    cmp_row = false;
 
     for (size_t i = 0; i < ess_id.size(); ++i) {
         // set the active boundary attributes
@@ -48,19 +46,17 @@ void BCManager::UpdateBCData(mfem::Array<int>& ess_bdr,
 
     // The size here is set explicitly
     component.SetSize(ess_bdr.Size(), 3);
-    mfem::Array<bool> cmp_row;
-    cmp_row.SetSize(3);
+    std::array<bool, 3> cmp_row;
 
     component = false;
-    cmp_row = false;
 
     if (map_ess_vel.find(step) == map_ess_vel.end()) {
         return;
     }
 
-    auto ess_vel = map_ess_vel.find(step)->second;
-    auto ess_comp = map_ess_comp["ess_vel"].find(step)->second;
-    auto ess_id = map_ess_id["ess_vel"].find(step)->second;
+    const auto& ess_vel = map_ess_vel.find(step)->second;
+    const auto& ess_comp = map_ess_comp["ess_vel"].find(step)->second;
+    const auto& ess_id = map_ess_id["ess_vel"].find(step)->second;
 
     for (size_t i = 0; i < ess_id.size(); ++i) {
         // set the active boundary attributes
@@ -111,19 +107,17 @@ void BCManager::UpdateBCData(mfem::Array<int>& ess_bdr,
 
     // The size here is set explicitly
     component.SetSize(ess_bdr.Size(), 3);
-    mfem::Array<bool> cmp_row;
-    cmp_row.SetSize(3);
+    std::array<bool, 3> cmp_row;
 
     component = false;
-    cmp_row = false;
 
     if (map_ess_vgrad.find(step) == map_ess_vgrad.end()) {
         return;
     }
 
-    auto ess_vgrad = map_ess_vgrad.find(step)->second;
-    auto ess_comp = map_ess_comp["ess_vgrad"].find(step)->second;
-    auto ess_id = map_ess_id["ess_vgrad"].find(step)->second;
+    const auto& ess_vgrad = map_ess_vgrad.find(step)->second;
+    const auto& ess_comp = map_ess_comp["ess_vgrad"].find(step)->second;
+    const auto& ess_id = map_ess_id["ess_vgrad"].find(step)->second;
 
     for (size_t i = 0; i < ess_vgrad.size(); ++i) {
         data[i] = ess_vgrad.at(i);
diff --git a/src/fem_operators/mechanics_integrators.cpp b/src/fem_operators/mechanics_integrators.cpp
index 9ade98d..2bc9d9d 100644
--- a/src/fem_operators/mechanics_integrators.cpp
+++ b/src/fem_operators/mechanics_integrators.cpp
@@ -667,6 +667,113 @@ void ExaNLFIntegrator::AddMultGradPA(const mfem::Vector& x, mfem::Vector& y) con
     } // End of if statement
 }
 
+// -----------------------------------------------------------------------------
+// ExaNLFIntegrator::AddMultTransposeGradPA
+//
+// Native PA kernel computing y += K^T * x where K = B^T D B is the standard
+// (non-BBar) tangent stiffness. Mirrors AddMultGradPA exactly except for the
+// contraction order against the assembled 4th-order tensor D.
+//
+// Algorithm per element, per quadrature point:
+//   1. Compute physical velocity gradient from input vector and shape function
+//      derivatives:
+//         Gx(i,k) = sum_a Gt(a,i,qpt) * X(a,k,elem)
+//      This is the same operation as the forward kernel since B is independent
+//      of the gradient transposition.
+//
+//   2. Apply the TRANSPOSED D tensor contraction:
+//         T(l,n) = sum_{i,k} D(i,k,l,n,qpt,elem) * Gx(i,k)
+//      whereas the forward kernel does
+//         T(i,k) = sum_{l,n} D(i,k,l,n,qpt,elem) * Gx(l,n)
+//      The difference is *which pair* of D's indices are summed against Gx.
+//      For symmetric C, D has major symmetry D(i,k,l,n) = D(l,n,i,k) and the
+//      two contractions agree; for non-symmetric C they disagree.
+//
+//   3. Apply test-function gradients (same operation as forward kernel):
+//         Y(a,n) += sum_l Gt(a,l,qpt) * T(l,n)
+//
+// All quadrature weights and Jacobian determinants are baked into D from the
+// AssembleGradPA step, so this kernel does not need to reapply them.
+// -----------------------------------------------------------------------------
+void ExaNLFIntegrator::AddMultTransposeGradPA(const mfem::Vector &x,
+                                              mfem::Vector &y) const
+{
+    CALI_CXX_MARK_SCOPE("enlfi_amTGPA");
+    if ((space_dims == 1) || (space_dims == 2)) {
+        MFEM_ABORT("Dimensions of 1 or 2 not supported.");
+    }
+    else {
+        const int dim = 3;
+        const int DIM3 = 3;
+        const int DIM6 = 6;
+
+        std::array<RAJA::idx_t, DIM3> perm3 {{ 2, 1, 0 } };
+        std::array<RAJA::idx_t, DIM6> perm6 {{ 5, 4, 3, 2, 1, 0 } };
+
+        // D tensor from AssembleGradPA: D(elem, qpt, i, k, l, n)
+        // The leading dim being elem matches the ordering used in the forward kernel.
+        RAJA::Layout<DIM6> layout_tensor =
+            RAJA::make_permuted_layout({{ dim, dim, dim, dim, nqpts, nelems } }, perm6);
+        RAJA::View<const double, RAJA::Layout<DIM6, RAJA::Index_type, 0> > D(pa_dmat.Read(),
+                                                                            layout_tensor);
+
+        // Field variables: input/output E-vectors
+        RAJA::Layout<DIM3> layout_field = RAJA::make_permuted_layout({{ nnodes, dim, nelems } }, perm3);
+        RAJA::View<const double, RAJA::Layout<DIM3, RAJA::Index_type, 0> > X(x.Read(), layout_field);
+        RAJA::View<double, RAJA::Layout<DIM3, RAJA::Index_type, 0> > Y(y.ReadWrite(), layout_field);
+
+        // Reference shape function derivatives: Gt(node, dim, qpt)
+        RAJA::Layout<DIM3> layout_grads = RAJA::make_permuted_layout({{ nnodes, dim, nqpts } }, perm3);
+        RAJA::View<const double, RAJA::Layout<DIM3, RAJA::Index_type, 0> > Gt(grad.Read(), layout_grads);
+
+        const int nqpts_ = nqpts;
+        const int dim_ = dim;
+        const int nnodes_ = nnodes;
+
+        mfem::forall(nelems, [=] MFEM_HOST_DEVICE(int i_elems) {
+            for (int j_qpts = 0; j_qpts < nqpts_; j_qpts++) {
+            // Step 1: Compute velocity gradient at this quadrature point
+            //   Gx(i, k) = sum_a Gt(a, i, qpt) * X(a, k, elem)
+            double Gx[3][3];
+            for (int ii = 0; ii < dim_; ii++) {
+                for (int kk = 0; kk < dim_; kk++) {
+                    Gx[ii][kk] = 0.0;
+                    for (int a = 0; a < nnodes_; a++) {
+                        Gx[ii][kk] += Gt(a, ii, j_qpts) * X(a, kk, i_elems);
+                    }
+                }
+            }
+
+            // Step 2: Apply TRANSPOSED D contraction
+            //   T(l, n) = sum_{i,k} D(i, k, l, n, qpt, elem) * Gx(i, k)
+            // Compare to forward kernel:
+            //   T(i, k) = sum_{l,n} D(i, k, l, n, qpt, elem) * Gx(l, n)
+            double T[3][3];
+            for (int ll = 0; ll < dim_; ll++) {
+                for (int nn = 0; nn < dim_; nn++) {
+                    T[ll][nn] = 0.0;
+                    for (int ii = 0; ii < dim_; ii++) {
+                        for (int kk = 0; kk < dim_; kk++) {
+                        T[ll][nn] += D(i_elems, j_qpts, ii, kk, ll, nn) * Gx[ii][kk];
+                        }
+                    }
+                }
+            }
+
+            // Step 3: Apply test-function gradients (same as forward kernel)
+            //   Y(a, n) += sum_l Gt(a, l, qpt) * T(l, n)
+            for (int nn = 0; nn < dim_; nn++) {
+                for (int ll = 0; ll < dim_; ll++) {
+                    for (int a = 0; a < nnodes_; a++) {
+                        Y(a, nn, i_elems) += Gt(a, ll, j_qpts) * T[ll][nn];
+                    }
+                }
+            }
+            } // End of nqpts
+        }); // End of nelems
+    } // End of else (3D path)
+}
+
 // This assembles the diagonal of our LHS which can be used as a preconditioner
 void ExaNLFIntegrator::AssembleGradDiagonalPA(mfem::Vector& diag) const {
     CALI_CXX_MARK_SCOPE("enlfi_AssembleGradDiagonalPA");
@@ -1257,6 +1364,70 @@ void ICExaNLFIntegrator::AssembleElementGrad(const mfem::FiniteElement& el,
     return;
 }
 
+// -----------------------------------------------------------------------------
+// ICExaNLFIntegrator::AssembleGradPA
+//
+// Sets up geometric data and ensures element-averaged derivatives are ready.
+// The B-bar gradient PA does NOT pre-assemble a D tensor (unlike the base
+// class) because the volumetric correction couples element-constant data
+// (volume-averaged derivatives N̄) with per-quadrature-point data (C, adj(J))
+// in a way that does not fold cleanly into a single pre-assembled tensor.
+// Instead, AddMultGradPA / AddMultTransposeGradPA access C directly from the
+// quadrature function and apply the B-bar action on the fly in physical space.
+// -----------------------------------------------------------------------------
+void ICExaNLFIntegrator::AssembleGradPA(const mfem::Vector &/* x */,
+                                        const mfem::FiniteElementSpace &fes)
+{
+    this->AssembleGradPA(fes);
+}
+
+void ICExaNLFIntegrator::AssembleGradPA(const mfem::FiniteElementSpace &fes)
+{
+    CALI_CXX_MARK_SCOPE("icenlfi_assembleGradPA");
+
+    mfem::Mesh *mesh = fes.GetMesh();
+    const mfem::FiniteElement &el = *fes.GetFE(0);
+    space_dims = el.GetDim();
+    const mfem::IntegrationRule *ir =
+        &(mfem::IntRules.Get(el.GetGeomType(), 2 * el.GetOrder() + 1));
+
+    nqpts = ir->GetNPoints();
+    nnodes = el.GetDof();
+    nelems = fes.GetNE();
+
+    if ((space_dims == 1) || (space_dims == 2)) {
+        MFEM_ABORT("Dimensions of 1 or 2 not supported.");
+    }
+
+    // Cache geometric factors (Jacobians at quadrature points)
+    geom = mesh->GetGeometricFactors(*ir, mfem::GeometricFactors::JACOBIANS);
+
+    // Cache reference shape function derivatives
+    if (grad.Size() != (nqpts * space_dims * nnodes)) {
+        grad.SetSize(nqpts * space_dims * nnodes, mfem::Device::GetMemoryType());
+        {
+            mfem::DenseMatrix DSh;
+            const int offset = nnodes * space_dims;
+            double *qpts_dshape_data = grad.HostReadWrite();
+            for (int i = 0; i < nqpts; i++) {
+            const mfem::IntegrationPoint &ip = ir->IntPoint(i);
+            DSh.UseExternalData(&qpts_dshape_data[offset * i], nnodes, space_dims);
+            el.CalcDShape(ip, DSh);
+            }
+        }
+        grad.UseDevice(true);
+    }
+
+    // Element-averaged derivatives N̄(a, k, elem) are computed by AssemblePA().
+    // If they have not been computed yet, force a call now so the gradient PA
+    // kernels can use them. The AssemblePA path is idempotent and safe to call
+    // even if it has been called previously (it re-zeroes and recomputes).
+    if (elem_deriv_shapes.Size() != (nnodes * space_dims * nelems)) {
+        this->AssemblePA(fes);
+    }
+}
+
+
 /// Method defining element assembly.
 /** The result of the element assembly is added and stored in the @a emat
     Vector. */
@@ -1265,6 +1436,7 @@ void ICExaNLFIntegrator::AssembleGradEA(const mfem::Vector& /*x*/,
                                         mfem::Vector& emat) {
     AssembleEA(fes, emat);
 }
+
 void ICExaNLFIntegrator::AssembleEA(const mfem::FiniteElementSpace& fes, mfem::Vector& emat) {
     CALI_CXX_MARK_SCOPE("icenlfi_assembleEA");
     const mfem::FiniteElement& el = *fes.GetFE(0);
@@ -2014,6 +2186,377 @@ void ICExaNLFIntegrator::AssemblePA(const mfem::FiniteElementSpace& fes) {
     } // End of space dims if else
 }
 
+// -----------------------------------------------------------------------------
+// ICExaNLFIntegrator::AddMultGradPA
+//
+// Native B-bar tangent stiffness PA action: y += K̄ * x where
+//   K̄ = ∫ B̄^T C B̄ dΩ
+// and B̄ is the B-bar strain-displacement matrix from Hughes (1980).
+//
+// Because B̄ couples element-constant volume-averaged data with per-qpt data,
+// we work in physical space and access C directly from the simulation state's
+// tangent stiffness quadrature function.
+//
+// Algorithm per element, per quadrature point (q):
+//   1. Hoist tr_bar (element-constant) outside the qpt loop:
+//        tr_bar = sum_{a,k} N̄(a,k) * V(a,k)
+//      This is the volume-averaged trace of the velocity gradient that B̄
+//      uses in place of the per-qpt trace.
+//
+//   2. Compute the adjugate matrix and Jacobian determinant from the cached
+//      Jacobian. Adjugate is used to transform reference derivatives Gt to
+//      physical derivatives:
+//        dN(a,j) = (1/detJ) * sum_k Gt(a,k,q) * adj(j,k)
+//      (Adjugate uses inverse-transpose convention; same as in the standard
+//      ExaNLFIntegrator AssembleGradPA kernel.)
+//
+//   3. Compute physical velocity gradient:
+//        L(i,j) = sum_a dN(a,j) * V(a,i)
+//
+//   4. Compute B-bar trace correction:
+//        Δtr = (tr_bar - tr(L)) / 3
+//      and modified velocity gradient:
+//        L̄(i,j) = L(i,j) + δ_ij * Δtr
+//      which replaces the volumetric trace of L with tr_bar (Hughes' B-bar).
+//
+//   5. Apply material tangent (forward direction):
+//        σ'(j,k) = sum_{l,m} C(j,k,l,m) * L̄(l,m)
+//      C is fetched on the fly from the tangent_stiffness quadrature function.
+//
+//   6. Compute pressure (volumetric) part of σ':
+//        p' = (1/3) * tr(σ')
+//
+//   7. Accumulate into Y with B-bar test side. The test side replaces the
+//      pressure contribution to nodal forces using the volume-averaged
+//      derivatives N̄ in place of the per-qpt dN:
+//        Y(a,k) += [sum_j dN(a,j) σ'(j,k) + (N̄(a,k) - dN(a,k)) p'] * w * detJ
+//      The first term is the standard B^T σ' force, the second redirects the
+//      pressure piece through N̄.
+//
+// Verification properties:
+//   - For symmetric C, the result must equal the forward action of any
+//     symmetric formulation (B̄^T C B̄ is symmetric).
+//   - For a uniform-Jacobian mesh where tr_bar agrees with the per-qpt
+//     trace, Δtr → 0 at every qpt and the result must match the standard
+//     (non-B-bar) result.
+// -----------------------------------------------------------------------------
+void ICExaNLFIntegrator::AddMultGradPA(const mfem::Vector &x,
+                                       mfem::Vector &y) const
+{
+    CALI_CXX_MARK_SCOPE("icenlfi_amGPA");
+    if ((space_dims == 1) || (space_dims == 2)) {
+        MFEM_ABORT("Dimensions of 1 or 2 not supported.");
+    }
+    else {
+        const int dim = 3;
+        const int DIM3 = 3;
+        const int DIM4 = 4;
+        const int DIM6 = 6;
+
+        std::array<RAJA::idx_t, DIM3> perm3 {{ 2, 1, 0 } };
+        std::array<RAJA::idx_t, DIM4> perm4 {{ 3, 2, 1, 0 } };
+        std::array<RAJA::idx_t, DIM6> perm6 {{ 5, 4, 3, 2, 1, 0 } };
+
+        // Input / output E-vectors
+        RAJA::Layout<DIM3> layout_field = RAJA::make_permuted_layout({{ nnodes, dim, nelems } }, perm3);
+        RAJA::View<const double, RAJA::Layout<DIM3, RAJA::Index_type, 0> > X(x.Read(), layout_field);
+        RAJA::View<double, RAJA::Layout<DIM3, RAJA::Index_type, 0> > Y(y.ReadWrite(), layout_field);
+
+        // Reference shape function derivatives Gt(node, dim, qpt)
+        RAJA::Layout<DIM3> layout_grads = RAJA::make_permuted_layout({{ nnodes, dim, nqpts } }, perm3);
+        RAJA::View<const double, RAJA::Layout<DIM3, RAJA::Index_type, 0> > Gt(grad.Read(), layout_grads);
+
+        // Element-averaged derivatives N̄(node, dim, elem)
+        RAJA::View<const double, RAJA::Layout<DIM3, RAJA::Index_type, 0> > Nbar(elem_deriv_shapes.Read(),
+                                                                                layout_field);
+
+        // Mesh Jacobians J(dim, dim, qpt, elem) — column-major mfem convention
+        RAJA::Layout<DIM4> layout_jac = RAJA::make_permuted_layout({{ dim, dim, nqpts, nelems } }, perm4);
+        RAJA::View<const double, RAJA::Layout<DIM4, RAJA::Index_type, 0> > J_data(geom->J.Read(), layout_jac);
+
+        // Material tangent C(j, k, l, m, qpt, elem) from quadrature function
+        auto tangent_qf = m_sim_state->GetQuadratureFunction("tangent_stiffness");
+        RAJA::Layout<DIM6> layout_C = RAJA::make_permuted_layout(
+            {{ dim, dim, dim, dim, nqpts, nelems } }, perm6);
+        RAJA::View<const double, RAJA::Layout<DIM6, RAJA::Index_type, 0> > C(tangent_qf->Read(), layout_C);
+
+        // Integration weights from the tangent stiffness QF integration rule
+        const mfem::IntegrationRule &ir =
+            tangent_qf->GetSpace()->GetIntRule(0);
+        auto W = ir.GetWeights().Read();
+
+        const int nqpts_ = nqpts;
+        const int dim_ = dim;
+        const int nnodes_ = nnodes;
+
+        mfem::forall(nelems, [=] MFEM_HOST_DEVICE(int e) {
+            // Step 1: Hoist tr_bar outside the qpt loop (element-constant)
+            double tr_bar = 0.0;
+            for (int a = 0; a < nnodes_; a++) {
+            for (int k = 0; k < dim_; k++) {
+                tr_bar += Nbar(a, k, e) * X(a, k, e);
+            }
+            }
+
+            for (int q = 0; q < nqpts_; q++) {
+            // Step 2: Compute adjugate and Jacobian determinant
+            const double J11 = J_data(0, 0, q, e), J12 = J_data(1, 0, q, e),
+                            J13 = J_data(2, 0, q, e);
+            const double J21 = J_data(0, 1, q, e), J22 = J_data(1, 1, q, e),
+                            J23 = J_data(2, 1, q, e);
+            const double J31 = J_data(0, 2, q, e), J32 = J_data(1, 2, q, e),
+                            J33 = J_data(2, 2, q, e);
+
+            double adj[9];
+            adj[0] = (J22 * J33) - (J23 * J32); // 0,0
+            adj[1] = (J23 * J31) - (J21 * J33); // 0,1
+            adj[2] = (J21 * J32) - (J22 * J31); // 0,2
+            adj[3] = (J13 * J32) - (J12 * J33); // 1,0
+            adj[4] = (J11 * J33) - (J13 * J31); // 1,1
+            adj[5] = (J12 * J31) - (J11 * J32); // 1,2
+            adj[6] = (J12 * J23) - (J13 * J22); // 2,0
+            adj[7] = (J13 * J21) - (J11 * J23); // 2,1
+            adj[8] = (J11 * J22) - (J12 * J21); // 2,2
+
+            const double detJ = J11 * adj[0] + J21 * adj[3] + J31 * adj[6];
+            const double idetJ = 1.0 / detJ;
+            const double w_detJ = W[q] * detJ;
+
+            // Step 3: Physical velocity gradient L(i,j) = sum_a dN(a,j) * V(a,i)
+            // We compute dN(a, :) on-the-fly from Gt and adj.
+            double L[3][3] = {{ 0.0 } };
+            for (int a = 0; a < nnodes_; a++) {
+                double dNa[3];
+                for (int j = 0; j < dim_; j++) {
+                    dNa[j] = idetJ * (Gt(a, 0, q) * adj[j * 3 + 0] +
+                                    Gt(a, 1, q) * adj[j * 3 + 1] +
+                                    Gt(a, 2, q) * adj[j * 3 + 2]);
+                }
+                for (int i = 0; i < dim_; i++) {
+                    for (int j = 0; j < dim_; j++) {
+                        L[i][j] += dNa[j] * X(a, i, e);
+                    }
+                }
+            }
+
+            // Step 4: B-bar trace correction
+            const double tr_std = L[0][0] + L[1][1] + L[2][2];
+            const double dtr = (tr_bar - tr_std) / 3.0;
+
+            double Lbar[3][3];
+            for (int i = 0; i < dim_; i++) {
+                for (int j = 0; j < dim_; j++) {
+                    Lbar[i][j] = L[i][j];
+                }
+            }
+            Lbar[0][0] += dtr;
+            Lbar[1][1] += dtr;
+            Lbar[2][2] += dtr;
+
+            // Step 5: Apply material tangent — forward contraction
+            //   σ'(j, k) = sum_{l,m} C(j, k, l, m) * L̄(l, m)
+            double sigma[3][3] = {{ 0.0 } };
+            for (int j = 0; j < dim_; j++) {
+                for (int k = 0; k < dim_; k++) {
+                    for (int l = 0; l < dim_; l++) {
+                        for (int m = 0; m < dim_; m++) {
+                        sigma[j][k] += C(j, k, l, m, q, e) * Lbar[l][m];
+                        }
+                    }
+                }
+            }
+
+            // Step 6: Pressure (volumetric) part of σ'
+            const double p = (sigma[0][0] + sigma[1][1] + sigma[2][2]) / 3.0;
+
+            // Step 7: Accumulate forces with B-bar test side
+            //   Y(a, k) += [sum_j dN(a,j) σ'(j,k) + (N̄(a,k) - dN(a,k)) p] * w * detJ
+            for (int a = 0; a < nnodes_; a++) {
+                double dNa[3];
+                for (int j = 0; j < dim_; j++) {
+                    dNa[j] = idetJ * (Gt(a, 0, q) * adj[j * 3 + 0] +
+                                    Gt(a, 1, q) * adj[j * 3 + 1] +
+                                    Gt(a, 2, q) * adj[j * 3 + 2]);
+                }
+                for (int k = 0; k < dim_; k++) {
+                    double f_std = 0.0;
+                    for (int j = 0; j < dim_; j++) {
+                        f_std += dNa[j] * sigma[j][k];
+                    }
+                    double f_bbar = (Nbar(a, k, e) - dNa[k]) * p;
+                    Y(a, k, e) += (f_std + f_bbar) * w_detJ;
+                }
+            }
+            } // End of qpts
+        }); // End of nelems
+    } // End of else (3D path)
+}
+
+
+// -----------------------------------------------------------------------------
+// ICExaNLFIntegrator::AddMultTransposeGradPA
+//
+// Native transposed B-bar tangent stiffness PA action: y += K̄^T * x.
+//
+// This is structurally IDENTICAL to AddMultGradPA except for one line: the
+// material tangent contraction uses C(l,m,j,k) instead of C(j,k,l,m). The
+// B-bar geometry (N̄, dN, trace correction, pressure redirection) is the
+// same on both sides of K̄ = B̄^T C B̄ because:
+//   (B̄^T C B̄)^T = B̄^T C^T B̄
+// — only the middle factor C transposes; the outer B̄^T and B̄ remain in
+// place.
+//
+// For symmetric C, this kernel produces results identical to AddMultGradPA
+// (a useful verification check). For non-symmetric C (crystal plasticity
+// with non-associated flow or non-symmetric Schmid coupling) it produces
+// genuinely different results, as required for correct trust-region
+// Cauchy point computation.
+// -----------------------------------------------------------------------------
+void ICExaNLFIntegrator::AddMultTransposeGradPA(const mfem::Vector &x,
+                                                mfem::Vector &y) const
+{
+    CALI_CXX_MARK_SCOPE("icenlfi_amTGPA");
+    if ((space_dims == 1) || (space_dims == 2)) {
+        MFEM_ABORT("Dimensions of 1 or 2 not supported.");
+    }
+    else {
+        const int dim = 3;
+        const int DIM3 = 3;
+        const int DIM4 = 4;
+        const int DIM6 = 6;
+
+        std::array<RAJA::idx_t, DIM3> perm3 {{ 2, 1, 0 } };
+        std::array<RAJA::idx_t, DIM4> perm4 {{ 3, 2, 1, 0 } };
+        std::array<RAJA::idx_t, DIM6> perm6 {{ 5, 4, 3, 2, 1, 0 } };
+
+        RAJA::Layout<DIM3> layout_field = RAJA::make_permuted_layout({{ nnodes, dim, nelems } }, perm3);
+        RAJA::View<const double, RAJA::Layout<DIM3, RAJA::Index_type, 0> > X(x.Read(), layout_field);
+        RAJA::View<double, RAJA::Layout<DIM3, RAJA::Index_type, 0> > Y(y.ReadWrite(), layout_field);
+
+        RAJA::Layout<DIM3> layout_grads = RAJA::make_permuted_layout({{ nnodes, dim, nqpts } }, perm3);
+        RAJA::View<const double, RAJA::Layout<DIM3, RAJA::Index_type, 0> > Gt(grad.Read(), layout_grads);
+
+        RAJA::View<const double, RAJA::Layout<DIM3, RAJA::Index_type, 0> > Nbar(elem_deriv_shapes.Read(),
+                                                                                layout_field);
+
+        RAJA::Layout<DIM4> layout_jac = RAJA::make_permuted_layout({{ dim, dim, nqpts, nelems } }, perm4);
+        RAJA::View<const double, RAJA::Layout<DIM4, RAJA::Index_type, 0> > J_data(geom->J.Read(), layout_jac);
+
+        auto tangent_qf = m_sim_state->GetQuadratureFunction("tangent_stiffness");
+        RAJA::Layout<DIM6> layout_C = RAJA::make_permuted_layout(
+            {{ dim, dim, dim, dim, nqpts, nelems } }, perm6);
+        RAJA::View<const double, RAJA::Layout<DIM6, RAJA::Index_type, 0> > C(tangent_qf->Read(), layout_C);
+
+        const mfem::IntegrationRule &ir =
+            tangent_qf->GetSpace()->GetIntRule(0);
+        auto W = ir.GetWeights().Read();
+
+        const int nqpts_ = nqpts;
+        const int dim_ = dim;
+        const int nnodes_ = nnodes;
+
+        mfem::forall(nelems, [=] MFEM_HOST_DEVICE(int e) {
+            // Step 1: Hoist tr_bar (element-constant)
+            double tr_bar = 0.0;
+            for (int a = 0; a < nnodes_; a++) {
+            for (int k = 0; k < dim_; k++) {
+                tr_bar += Nbar(a, k, e) * X(a, k, e);
+            }
+            }
+
+            for (int q = 0; q < nqpts_; q++) {
+            // Step 2: Adjugate and Jacobian determinant
+            const double J11 = J_data(0, 0, q, e), J12 = J_data(1, 0, q, e),
+                            J13 = J_data(2, 0, q, e);
+            const double J21 = J_data(0, 1, q, e), J22 = J_data(1, 1, q, e),
+                            J23 = J_data(2, 1, q, e);
+            const double J31 = J_data(0, 2, q, e), J32 = J_data(1, 2, q, e),
+                            J33 = J_data(2, 2, q, e);
+
+            double adj[9];
+            adj[0] = (J22 * J33) - (J23 * J32);
+            adj[1] = (J23 * J31) - (J21 * J33);
+            adj[2] = (J21 * J32) - (J22 * J31);
+            adj[3] = (J13 * J32) - (J12 * J33);
+            adj[4] = (J11 * J33) - (J13 * J31);
+            adj[5] = (J12 * J31) - (J11 * J32);
+            adj[6] = (J12 * J23) - (J13 * J22);
+            adj[7] = (J13 * J21) - (J11 * J23);
+            adj[8] = (J11 * J22) - (J12 * J21);
+
+            const double detJ = J11 * adj[0] + J21 * adj[3] + J31 * adj[6];
+            const double idetJ = 1.0 / detJ;
+            const double w_detJ = W[q] * detJ;
+
+            // Step 3: Physical velocity gradient
+            double L[3][3] = {{ 0.0 } };
+            for (int a = 0; a < nnodes_; a++) {
+                double dNa[3];
+                for (int j = 0; j < dim_; j++) {
+                    dNa[j] = idetJ * (Gt(a, 0, q) * adj[j * 3 + 0] +
+                                    Gt(a, 1, q) * adj[j * 3 + 1] +
+                                    Gt(a, 2, q) * adj[j * 3 + 2]);
+                }
+                for (int i = 0; i < dim_; i++) {
+                    for (int j = 0; j < dim_; j++) {
+                        L[i][j] += dNa[j] * X(a, i, e);
+                    }
+                }
+            }
+
+            // Step 4: B-bar trace correction
+            const double tr_std = L[0][0] + L[1][1] + L[2][2];
+            const double dtr = (tr_bar - tr_std) / 3.0;
+
+            double Lbar[3][3];
+            for (int i = 0; i < dim_; i++) {
+                for (int j = 0; j < dim_; j++) {
+                    Lbar[i][j] = L[i][j];
+                }
+            }
+            Lbar[0][0] += dtr;
+            Lbar[1][1] += dtr;
+            Lbar[2][2] += dtr;
+
+            // Step 5: TRANSPOSED material tangent contraction
+            //   σ'(j, k) = sum_{l,m} C(l, m, j, k) * L̄(l, m)
+            // (Compare to forward: C(j, k, l, m) * L̄(l, m))
+            double sigma[3][3] = {{ 0.0 } };
+            for (int j = 0; j < dim_; j++) {
+                for (int k = 0; k < dim_; k++) {
+                    for (int l = 0; l < dim_; l++) {
+                        for (int m = 0; m < dim_; m++) {
+                        sigma[j][k] += C(l, m, j, k, q, e) * Lbar[l][m];
+                        }
+                    }
+                }
+            }
+
+            // Step 6: Pressure
+            const double p = (sigma[0][0] + sigma[1][1] + sigma[2][2]) / 3.0;
+
+            // Step 7: Accumulate with B-bar test side (same as forward kernel)
+            for (int a = 0; a < nnodes_; a++) {
+                double dNa[3];
+                for (int j = 0; j < dim_; j++) {
+                    dNa[j] = idetJ * (Gt(a, 0, q) * adj[j * 3 + 0] +
+                                    Gt(a, 1, q) * adj[j * 3 + 1] +
+                                    Gt(a, 2, q) * adj[j * 3 + 2]);
+                }
+                for (int k = 0; k < dim_; k++) {
+                    double f_std = 0.0;
+                    for (int j = 0; j < dim_; j++) {
+                        f_std += dNa[j] * sigma[j][k];
+                    }
+                    double f_bbar = (Nbar(a, k, e) - dNa[k]) * p;
+                    Y(a, k, e) += (f_std + f_bbar) * w_detJ;
+                }
+            }
+            } // End of qpts
+        }); // End of nelems
+    } // End of else (3D path)
+}
+
 // Here we're applying the following action operation using the assembled "D" 2nd order
 // tensor found above:
 // y_{ik} = \nabla_{ij}\phi^T_{\epsilon} D_{jk}
diff --git a/src/fem_operators/mechanics_integrators.hpp b/src/fem_operators/mechanics_integrators.hpp
index fb7d4f7..0a761ec 100644
--- a/src/fem_operators/mechanics_integrators.hpp
+++ b/src/fem_operators/mechanics_integrators.hpp
@@ -351,6 +351,35 @@ class ExaNLFIntegrator : public mfem::NonlinearFormIntegrator {
      */
     virtual void AddMultGradPA(const mfem::Vector& x, mfem::Vector& y) const override;
 
+    /**
+     * @brief Apply transposed gradient action via partial assembly.
+     *
+     * @param x Input vector for transposed Jacobian-vector product
+     * @param y Output vector for accumulated result
+     *
+     * Native PA kernel computing y += K^T * x where K = B^T D B is the
+     * tangent stiffness operator. The only computational difference from
+     * AddMultGradPA is the contraction order with the assembled 4th-order
+     * tensor D:
+     *
+     *   Forward (AddMultGradPA):
+     *     T(i,k) = D(i,k,l,n,qpt,elem) * Gx(l,n)   — contract last pair
+     *     Y(a,k) += Gt(a,i,qpt) * T(i,k)
+     *
+     *   Transpose (this method):
+     *     T(l,n) = D(i,k,l,n,qpt,elem) * Gx(i,k)   — contract first pair
+     *     Y(a,n) += Gt(a,l,qpt) * T(l,n)
+     *
+     * For symmetric material tangent C, the two operations are identical.
+     * For non-symmetric C (crystal plasticity), they differ. The transpose
+     * is required for trust-region dogleg solver Cauchy point computation
+     * where the merit function gradient is g = J^T * r, not J * r.
+     *
+     * @note GPU-compatible via mfem::forall
+     * @note Requires prior AssembleGradPA() call for the D tensor
+     */
+    virtual void AddMultTransposeGradPA(const mfem::Vector &x, mfem::Vector &y) const override;
+
     using mfem::NonlinearFormIntegrator::AssemblePA;
     /**
      * @brief Initialize partial assembly data structures for residual operations.
@@ -723,10 +752,82 @@ class ICExaNLFIntegrator : public ExaNLFIntegrator {
                                      const mfem::Vector& /*elfun*/,
                                      mfem::DenseMatrix& elmat) override;
 
-    // This method doesn't easily extend to PA formulation, so we're punting on
-    // it for now.
-    using ExaNLFIntegrator::AddMultGradPA;
-    using ExaNLFIntegrator::AssembleGradPA;
+    /**
+     * @brief Initialize partial assembly data structures for B-bar gradient operations.
+     *
+     * @param fes Finite element space providing mesh and element information
+     *
+     * Sets up the geometric data needed by AddMultGradPA() and
+     * AddMultTransposeGradPA() for the B-bar tangent stiffness operator.
+     *
+     * Unlike the base class AssembleGradPA() which pre-assembles a 4th-order
+     * tensor D, the B-bar version stores only the geometric data (Jacobians,
+     * reference shape function derivatives, and element-averaged derivatives)
+     * and applies the material tangent C on-the-fly inside the kernel. This
+     * is because the B-bar correction couples element-constant data (the
+     * volume-averaged derivatives) with quadrature-point-local data (C and
+     * adj(J)) in a way that doesn't fold cleanly into a single pre-assembled
+     * tensor.
+     *
+     * Setup steps:
+     *   1. Cache space_dims, nqpts, nnodes, nelems from the FES
+     *   2. Get geometric factors (Jacobians at quadrature points) from the mesh
+     *   3. Compute and cache reference shape function derivatives Gt(a, k, qpt)
+     *   4. Ensure element-averaged derivatives N̄(a, k, elem) are available
+     *      (calling AssemblePA() if not yet computed)
+     *
+     * @note Must be called before AddMultGradPA() or AddMultTransposeGradPA()
+     * @note Material tangent C is accessed directly from the simulation state
+     *       quadrature function during the AddMult kernels
+     */
+    virtual void AssembleGradPA(const mfem::FiniteElementSpace &fes) override;
+
+    /// State-ful overload that ignores the state vector @a x.
+    virtual void AssembleGradPA(const mfem::Vector &x, const mfem::FiniteElementSpace &fes) override;
+
+    /**
+     * @brief Apply partial-assembly B-bar tangent stiffness action.
+     *
+     * @param x Input E-vector (nodal velocities)
+     * @param y Output E-vector (accumulated)
+     *
+     * Computes y += K̄ * x where K̄ = ∫ B̄^T C B̄ dΩ is the B-bar tangent.
+     *
+     * Algorithm per element, per quadrature point:
+     *   1. Compute adj(J) and detJ from the cached Jacobian
+     *   2. Compute physical derivatives dN(a,j) on-the-fly from Gt and adj(J)
+     *   3. Compute physical velocity gradient L(i,j) = dN(a,j) V(a,i)
+     *   4. Compute B-bar trace correction Δtr = (tr_bar - tr(L)) / 3
+     *      where tr_bar = N̄(a,k) V(a,k) is element-constant (hoisted)
+     *   5. Modified velocity gradient L̄ = L + δ_ij * Δtr
+     *   6. Apply C: σ'(j,k) = C(j,k,l,m) * L̄(l,m)
+     *   7. Pressure correction p' = (1/3) tr(σ')
+     *   8. Accumulate into Y: standard force + B-bar pressure redirection
+     *      Y(a,k) += [Σ_j dN(a,j) σ'(j,k) + (N̄(a,k) - dN(a,k)) p'] * w * detJ
+     *
+     * @note GPU-compatible via mfem::forall
+     * @note Requires prior AssembleGradPA() call
+     */
+    virtual void AddMultGradPA(const mfem::Vector &x, mfem::Vector &y) const override;
+
+    /**
+     * @brief Apply transposed B-bar tangent stiffness action.
+     *
+     * @param x Input E-vector
+     * @param y Output E-vector (accumulated)
+     *
+     * Computes y += K̄^T * x. Identical to AddMultGradPA except the C
+     * contraction order is swapped:
+     *   Forward:   σ'(j,k) = C(j,k,l,m) * L̄(l,m)
+     *   Transpose: σ'(j,k) = C(l,m,j,k) * L̄(l,m)
+     *
+     * The B-bar geometry (N̄, dN, trace correction, pressure redirection)
+     * is identical for both directions because B̄ appears on both the
+     * trial and test sides of K̄ = B̄^T C B̄, and (B̄^T C B̄)^T = B̄^T C^T B̄.
+     *
+     * @note For symmetric C, this produces identical results to AddMultGradPA
+     */
+    virtual void AddMultTransposeGradPA(const mfem::Vector &x, mfem::Vector &y) const override;
 
     /**
      * @brief Initialize partial assembly data structures for B-bar residual operations.
diff --git a/src/fem_operators/mechanics_operator.cpp b/src/fem_operators/mechanics_operator.cpp
index b95cd74..93b1ebe 100644
--- a/src/fem_operators/mechanics_operator.cpp
+++ b/src/fem_operators/mechanics_operator.cpp
@@ -13,6 +13,15 @@
 #include <iostream>
 #include <stdexcept>
 
+namespace {
+void GetTrueDofsParallel(const mfem::ParGridFunction& gf, mfem::Vector& true_dofs) {
+    // used to do something like:
+    // gf.GetTrueDofs(true_dofs);
+    // but looks like there are issues with that on the GPUs with newer versions of MFEM
+    gf.ParallelAverage(true_dofs);
+}
+} // namespace
+
 NonlinearMechOperator::NonlinearMechOperator(mfem::Array<int>& ess_bdr,
                                              mfem::Array2D<bool>& ess_bdr_comp,
                                              std::shared_ptr<SimulationState> sim_state)
@@ -259,7 +268,7 @@ void NonlinearMechOperator::CalculateDeformationGradient(mfem::QuadratureFunctio
 
     mfem::Vector x_true(fe_space->TrueVSize(), mfem::Device::GetMemoryType());
 
-    x_cur->GetTrueDofs(x_true);
+    GetTrueDofsParallel(*x_cur, x_true);
     // Takes in k vector and transforms into into our E-vector array
     P->Mult(x_true, px);
     elem_restrict_lex->Mult(px, el_x);
diff --git a/src/mechanics_driver.cpp b/src/mechanics_driver.cpp
index 0e9520e..4f3efca 100644
--- a/src/mechanics_driver.cpp
+++ b/src/mechanics_driver.cpp
@@ -211,7 +211,17 @@ int main(int argc, char* argv[]) {
          */
         mfem::Device device;
         if (toml_opt.solvers.rtmodel == RTModel::GPU) {
+#if defined(MFEM_USE_UMPIRE)
+            device.SetMemoryTypes(mfem::MemoryType::HOST_UMPIRE, mfem::MemoryType::DEVICE_UMPIRE);
+#else
             device.SetMemoryTypes(mfem::MemoryType::HOST_64, mfem::MemoryType::DEVICE);
+#endif
+        } else {
+#if defined(MFEM_USE_UMPIRE)
+            device.SetMemoryTypes(mfem::MemoryType::HOST_UMPIRE, mfem::MemoryType::DEVICE_UMPIRE);
+#else
+            device.SetMemoryTypes(mfem::MemoryType::HOST_64, mfem::MemoryType::DEVICE);
+#endif
         }
         device.Configure(device_config.c_str());
 
diff --git a/src/mfem_expt/partial_qspace.cpp b/src/mfem_expt/partial_qspace.cpp
index 2e0261f..3230313 100644
--- a/src/mfem_expt/partial_qspace.cpp
+++ b/src/mfem_expt/partial_qspace.cpp
@@ -43,6 +43,8 @@ const mfem::Vector& PartialQuadratureSpace::GetGeometricFactorWeights() const {
 void PartialQuadratureSpace::ConstructOffsets() {
     // Set up offsets based on our partial element set
     const int num_partial_elem = local2global.Size();
+    ne = num_partial_elem;
+    full_offset_cache.SetSize(0);
     offsets.SetSize(num_partial_elem + 1);
     int offset = 0;
     for (int i = 0; i < num_partial_elem; i++) {
diff --git a/src/options/option_enum.cpp b/src/options/option_enum.cpp
index 6ae4b99..8749ad7 100644
--- a/src/options/option_enum.cpp
+++ b/src/options/option_enum.cpp
@@ -106,12 +106,15 @@ LinearSolverType string_to_linear_solver_type(const std::string& str) {
 
 /**
  * @brief Convert string to NonlinearSolverType enum
- * @param str String representation of nonlinear solver type ("NR", "NRLS")
+ * @param str String representation of nonlinear solver type ("NR", "NRLS", "TRDOG")
  * @return Corresponding NonlinearSolverType enum value
  */
 NonlinearSolverType string_to_nonlinear_solver_type(const std::string& str) {
     static const std::map<std::string, NonlinearSolverType> mapping = {
-        {"NR", NonlinearSolverType::NR}, {"NRLS", NonlinearSolverType::NRLS}};
+        {"NR",    NonlinearSolverType::NR},
+        {"NRLS",  NonlinearSolverType::NRLS},
+        {"TRDOG", NonlinearSolverType::TRDOG}
+    };
 
     return string_to_enum(str, mapping, NonlinearSolverType::NOTYPE, "nonlinear solver");
 }
diff --git a/src/options/option_parser_v2.cpp b/src/options/option_parser_v2.cpp
index efac46e..cb29e98 100644
--- a/src/options/option_parser_v2.cpp
+++ b/src/options/option_parser_v2.cpp
@@ -790,6 +790,9 @@ void ExaOptions::print_solver_options() const {
     case NonlinearSolverType::NRLS:
         std::cout << "Newton-Raphson with line search\n";
         break;
+    case NonlinearSolverType::TRDOG:
+        std::cout << "Trust-region dogleg (SNLS port)\n";
+        break;
     default:
         std::cout << "Unknown\n";
         break;
@@ -798,6 +801,43 @@ void ExaOptions::print_solver_options() const {
     std::cout << "    Maximum iterations: " << solvers.nonlinear_solver.iter << "\n";
     std::cout << "    Relative tolerance: " << solvers.nonlinear_solver.rel_tol << "\n";
     std::cout << "    Absolute tolerance: " << solvers.nonlinear_solver.abs_tol << "\n";
+
+    // Trust-region parameters: print if either the solver is TRDOG or the user
+    // supplied a [trust_region] sub-table. The latter case is informational —
+    // it lets the user spot misconfigurations where they set TR options without
+    // selecting the TRDOG solver.
+    const bool is_trdog = (solvers.nonlinear_solver.nl_solver == NonlinearSolverType::TRDOG);
+    const bool tr_supplied = solvers.nonlinear_solver.trust_region.has_value();
+
+    if (is_trdog || tr_supplied) {
+        std::cout << "\n    Trust-region parameters";
+        if (is_trdog && !tr_supplied) {
+            std::cout << " (using defaults)";
+        }
+        else if (!is_trdog && tr_supplied) {
+            std::cout << " (WARNING: supplied but solver is not TRDOG)";
+        }
+        std::cout << ":\n";
+
+        // Use the supplied options if present, otherwise default-construct
+        // a TrustRegionOptions to print the defaults
+        const TrustRegionOptions tr_opts = tr_supplied
+            ? solvers.nonlinear_solver.trust_region.value()
+            : TrustRegionOptions{};
+
+        std::cout << "      delta_init      = " << tr_opts.delta_init      << "\n";
+        std::cout << "      delta_min       = " << tr_opts.delta_min       << "\n";
+        std::cout << "      delta_max       = " << tr_opts.delta_max       << "\n";
+        std::cout << "      xi_lg           = " << tr_opts.xi_lg           << "\n";
+        std::cout << "      xi_ug           = " << tr_opts.xi_ug           << "\n";
+        std::cout << "      xi_lo           = " << tr_opts.xi_lo           << "\n";
+        std::cout << "      xi_uo           = " << tr_opts.xi_uo           << "\n";
+        std::cout << "      xi_inc          = " << tr_opts.xi_inc          << "\n";
+        std::cout << "      xi_dec          = " << tr_opts.xi_dec          << "\n";
+        std::cout << "      xi_forced_inc   = " << tr_opts.xi_forced_inc   << "\n";
+        std::cout << "      reject_increase = "
+                  << (tr_opts.reject_increase ? "true" : "false") << "\n";
+    }
 }
 
 void ExaOptions::print_material_options() const {
diff --git a/src/options/option_parser_v2.hpp b/src/options/option_parser_v2.hpp
index d38fac7..9f77d13 100644
--- a/src/options/option_parser_v2.hpp
+++ b/src/options/option_parser_v2.hpp
@@ -97,9 +97,10 @@ enum class LinearSolverType {
  * @brief Enumeration for nonlinear solver types
  */
 enum class NonlinearSolverType {
-    NR,    /**< Newton-Raphson method */
-    NRLS,  /**< Newton-Raphson with line search */
-    NOTYPE /**< Uninitialized or invalid nonlinear solver type */
+    NR,     /**< Newton-Raphson method */
+    NRLS,   /**< Newton-Raphson with line search */
+    TRDOG,  /**< Trust-region dogleg method (ported from SNLS) */
+    NOTYPE  /**< Uninitialized or invalid nonlinear solver type */
 };
 
 /**
@@ -623,6 +624,103 @@ struct LinearSolverOptions {
     static LinearSolverOptions from_toml(const toml::value& toml_input);
 };
 
+/**
+ * @brief Trust-region dogleg solver configuration
+ *
+ * @details Controls the trust-region radius management and dogleg step
+ * computation for the ExaTrustRegionSolver. Parameters are ported from
+ * SNLS's TrDeltaControl with sane defaults suitable for solid mechanics
+ * applications. Power users can tune these for difficult crystal plasticity
+ * problems.
+ *
+ * The trust-region radius delta is updated based on the ratio
+ *     rho = actual_residual_change / predicted_residual_change
+ * where predicted change comes from the linearized model at the current iterate.
+ *
+ * Acceptance/rejection bands:
+ *   - "Good" band [xi_lg, xi_ug]: increase delta when rho falls here
+ *   - "OK"  band [xi_lo, xi_uo]: keep delta when rho falls here (outside good)
+ *   - Outside [xi_lo, xi_uo]: decrease delta
+ *
+ * TOML configuration example:
+ * @code
+ * [Solvers.NR.trust_region]
+ *     delta_init      = 1.0
+ *     delta_min       = 1e-12
+ *     delta_max       = 1e4
+ *     xi_lg           = 0.75
+ *     xi_ug           = 1.4
+ *     xi_lo           = 0.35
+ *     xi_uo           = 5.0
+ *     xi_inc          = 1.5
+ *     xi_dec          = 0.25
+ *     xi_forced_inc   = 1.2
+ *     reject_increase = true
+ * @endcode
+ */
+struct TrustRegionOptions {
+    /**
+     * @brief Initial trust-region radius
+     */
+    double delta_init = 1.0;
+
+    /**
+     * @brief Minimum allowed trust-region radius. Solver fails if delta drops below this.
+     */
+    double delta_min = 1e-12;
+
+    /**
+     * @brief Maximum allowed trust-region radius
+     */
+    double delta_max = 1e4;
+
+    /**
+     * @brief Lower bound of the "good" rho band (increase delta when rho > xi_lg)
+     */
+    double xi_lg = 0.75;
+
+    /**
+     * @brief Upper bound of the "good" rho band
+     */
+    double xi_ug = 1.4;
+
+    /**
+     * @brief Lower bound of the "ok" rho band (decrease delta when rho < xi_lo)
+     */
+    double xi_lo = 0.35;
+
+    /**
+     * @brief Upper bound of the "ok" rho band (decrease delta when rho > xi_uo)
+     */
+    double xi_uo = 5.0;
+
+    /**
+     * @brief Factor used to increase delta when a step is accepted in the "good" band
+     */
+    double xi_inc = 1.5;
+
+    /**
+     * @brief Factor used to decrease delta when a step quality is outside the "ok" band
+     */
+    double xi_dec = 0.25;
+
+    /**
+     * @brief Forced-increase factor when the predicted residual change is exactly zero
+     */
+    double xi_forced_inc = 1.2;
+
+    /**
+     * @brief Whether to reject steps that increase the residual norm
+     */
+    bool reject_increase = true;
+
+    // Validation
+    bool validate() const;
+
+    // Conversion from toml
+    static TrustRegionOptions from_toml(const toml::value& toml_input);
+};
+
 /**
  * @brief Nonlinear solver configuration
  */
@@ -647,6 +745,14 @@ struct NonlinearSolverOptions {
      */
     NonlinearSolverType nl_solver = NonlinearSolverType::NR;
 
+    /**
+     * @brief Trust-region configuration (only used when nl_solver == TRDOG).
+     *
+     * If left empty, default TrustRegionOptions values are used. Users with
+     * difficult convergence problems should provide custom values.
+     */
+    std::optional<TrustRegionOptions> trust_region;
+
     // Validation
     bool validate() const;
 
diff --git a/src/options/option_solvers.cpp b/src/options/option_solvers.cpp
index b5f8af7..817b64c 100644
--- a/src/options/option_solvers.cpp
+++ b/src/options/option_solvers.cpp
@@ -39,6 +39,63 @@ LinearSolverOptions LinearSolverOptions::from_toml(const toml::value& toml_input
     return options;
 }
 
+/**
+ * @brief Parse trust-region options from a TOML sub-table.
+ *
+ * Each field is optional — if not present in the TOML, the struct's default
+ * value is preserved. This lets users override only the parameters they need
+ * to tune.
+ */
+TrustRegionOptions TrustRegionOptions::from_toml(const toml::value& toml_input) {
+    TrustRegionOptions options;
+
+    if (toml_input.contains("delta_init")) {
+        options.delta_init = toml::find<double>(toml_input, "delta_init");
+    }
+
+    if (toml_input.contains("delta_min")) {
+        options.delta_min = toml::find<double>(toml_input, "delta_min");
+    }
+
+    if (toml_input.contains("delta_max")) {
+        options.delta_max = toml::find<double>(toml_input, "delta_max");
+    }
+
+    if (toml_input.contains("xi_lg")) {
+        options.xi_lg = toml::find<double>(toml_input, "xi_lg");
+    }
+
+    if (toml_input.contains("xi_ug")) {
+        options.xi_ug = toml::find<double>(toml_input, "xi_ug");
+    }
+
+    if (toml_input.contains("xi_lo")) {
+        options.xi_lo = toml::find<double>(toml_input, "xi_lo");
+    }
+
+    if (toml_input.contains("xi_uo")) {
+        options.xi_uo = toml::find<double>(toml_input, "xi_uo");
+    }
+
+    if (toml_input.contains("xi_inc")) {
+        options.xi_inc = toml::find<double>(toml_input, "xi_inc");
+    }
+
+    if (toml_input.contains("xi_dec")) {
+        options.xi_dec = toml::find<double>(toml_input, "xi_dec");
+    }
+
+    if (toml_input.contains("xi_forced_inc")) {
+        options.xi_forced_inc = toml::find<double>(toml_input, "xi_forced_inc");
+    }
+
+    if (toml_input.contains("reject_increase")) {
+        options.reject_increase = toml::find<bool>(toml_input, "reject_increase");
+    }
+
+    return options;
+}
+
 NonlinearSolverOptions NonlinearSolverOptions::from_toml(const toml::value& toml_input) {
     NonlinearSolverOptions options;
 
@@ -59,6 +116,14 @@ NonlinearSolverOptions NonlinearSolverOptions::from_toml(const toml::value& toml
             toml::find<std::string>(toml_input, "nl_solver"));
     }
 
+    // Parse the optional trust-region sub-table when using the dogleg solver.
+    // We always parse the table if present (regardless of nl_solver) so that
+    // options validation can flag inconsistent configurations later.
+    if (toml_input.contains("trust_region")) {
+        options.trust_region = TrustRegionOptions::from_toml(
+            toml::find(toml_input, "trust_region"));
+    }
+
     return options;
 }
 
@@ -123,6 +188,75 @@ bool LinearSolverOptions::validate() const {
     return true;
 }
 
+/**
+ * @brief Validate trust-region option ranges and consistency.
+ *
+ * Step-by-step verification:
+ *   1. Trust-region radius bounds: delta_min must be positive and delta_max
+ *      must exceed delta_min
+ *   2. Initial radius must lie within [delta_min, delta_max]
+ *   3. The "good" rho band [xi_lg, xi_ug] must lie inside the "ok" band
+ *      [xi_lo, xi_uo] — otherwise the radius update logic is inconsistent
+ *   4. Increase factors must be > 1 and decrease factor must be in (0, 1)
+ *
+ * Each failure is reported with WARNING_0_OPT pointing to the offending field.
+ */
+bool TrustRegionOptions::validate() const {
+    if (delta_min <= 0.0) {
+        WARNING_0_OPT("Error: TrustRegion table provided a non-positive delta_min");
+        return false;
+    }
+
+    if (delta_max <= delta_min) {
+        WARNING_0_OPT("Error: TrustRegion table provided delta_max <= delta_min");
+        return false;
+    }
+
+    if (delta_init < delta_min || delta_init > delta_max) {
+        WARNING_0_OPT("Error: TrustRegion table provided delta_init outside [delta_min, delta_max]");
+        return false;
+    }
+
+    if (xi_lg <= xi_lo) {
+        WARNING_0_OPT("Error: TrustRegion table requires xi_lg > xi_lo "
+                      "(good band must lie inside ok band)");
+        return false;
+    }
+
+    if (xi_ug >= xi_uo) {
+        WARNING_0_OPT("Error: TrustRegion table requires xi_ug < xi_uo "
+                      "(good band must lie inside ok band)");
+        return false;
+    }
+
+    if (xi_lg >= xi_ug) {
+        WARNING_0_OPT("Error: TrustRegion table requires xi_lg < xi_ug");
+        return false;
+    }
+
+    if (xi_lo >= xi_uo) {
+        WARNING_0_OPT("Error: TrustRegion table requires xi_lo < xi_uo");
+        return false;
+    }
+
+    if (xi_inc <= 1.0) {
+        WARNING_0_OPT("Error: TrustRegion table requires xi_inc > 1.0");
+        return false;
+    }
+
+    if (xi_dec <= 0.0 || xi_dec >= 1.0) {
+        WARNING_0_OPT("Error: TrustRegion table requires xi_dec in (0, 1)");
+        return false;
+    }
+
+    if (xi_forced_inc <= 1.0) {
+        WARNING_0_OPT("Error: TrustRegion table requires xi_forced_inc > 1.0");
+        return false;
+    }
+
+    return true;
+}
+
 bool NonlinearSolverOptions::validate() const {
     if (iter < 1) {
         WARNING_0_OPT("Error: NonLinearSolver table did not provide a positive iteration count");
@@ -139,13 +273,23 @@ bool NonlinearSolverOptions::validate() const {
         return false;
     }
 
-    if (nl_solver != NonlinearSolverType::NR && nl_solver != NonlinearSolverType::NRLS) {
-        WARNING_0_OPT("Error: NonLinearSolver table did not provide a valid nl_solver option (`NR` "
-                      "or `NRLS`)");
+    if (nl_solver != NonlinearSolverType::NR &&
+        nl_solver != NonlinearSolverType::NRLS &&
+        nl_solver != NonlinearSolverType::TRDOG) {
+        WARNING_0_OPT("Error: NonLinearSolver table did not provide a valid nl_solver option "
+                      "(`NR`, `NRLS`, or `TRDOG`)");
         return false;
     }
 
-    // Implement validation logic
+    // If trust-region parameters were supplied, verify they are self-consistent.
+    // We allow a TRDOG solver without a [trust_region] sub-table — the defaults
+    // are applied in that case.
+    if (trust_region.has_value()) {
+        if (!trust_region->validate()) {
+            return false;
+        }
+    }
+
     return true;
 }
 
diff --git a/src/postprocessing/postprocessing_driver.cpp b/src/postprocessing/postprocessing_driver.cpp
index 8212eb2..138e828 100644
--- a/src/postprocessing/postprocessing_driver.cpp
+++ b/src/postprocessing/postprocessing_driver.cpp
@@ -531,17 +531,15 @@ void PostProcessingDriver::UpdateFields([[maybe_unused]] const int step,
 void PostProcessingDriver::Update(const int step, const double time) {
     CALI_CXX_MARK_SCOPE("postprocessing_update");
     UpdateFields(step, time);
-    // Check if we should output volume averages at this step
-    if (ShouldOutputAtStep(step)) {
-        PrintVolValues(time, m_aggregation_mode);
-        ClearVolumeAverageCache();
-    }
 
     // Update data collections for visualization
-    if (m_enable_visualization) {
+    if (ShouldOutputAtStep(step) && m_enable_visualization) {
         UpdateDataCollections(step, time);
     }
 
+    PrintVolValues(time, m_aggregation_mode);
+    ClearVolumeAverageCache();
+
     if (m_light_up_instances.size() > 0) {
         UpdateLightUpAnalysis();
     }
@@ -1281,7 +1279,7 @@ void PostProcessingDriver::CalcElementAvg(mfem::expt::PartialQuadratureFunction*
 
     // KEY DIFFERENCE: Get the local-to-global element mapping for partial space
     auto l2g = pqs->GetLocal2Global().Read();    // Maps local element index to global element index
-    auto loc_offsets = pqs->getOffsets().Read(); // Offsets for local data layout
+    auto loc_offsets = pqs->Offsets(mfem::QSpaceOffsetStorage::COMPRESSED).Read(); // Offsets for local data layout
     // auto global_offsets = (pqs->GetGlobalOffset().Size() > 1) ?
     //                        pqs->GetGlobalOffset().Read() : loc_offsets; // Offsets for global
     //                        data layout
@@ -1393,6 +1391,9 @@ void PostProcessingDriver::InitializeGridFunctions() {
                     const auto gf_name = GetGridFunctionName(reg.display_name, reg_int);
                     // Determine vector dimension from quadrature function
                     const int vdim = reg.region_length[region];
+                    if (vdim < 1) {
+                        continue;
+                    }
                     max_vdim = (vdim > max_vdim) ? vdim : max_vdim;
                     auto fe_space = GetParFiniteElementSpace(reg_int, vdim);
                     m_map_gfs.emplace(gf_name,
@@ -1467,18 +1468,31 @@ void PostProcessingDriver::InitializeDataCollections(ExaOptions& options) {
         return input.substr(0, pos);
     };
 
+    auto has_registered_fields = [this](const std::string& display_region_postfix) {
+        for (const auto& [key, value] : m_map_gfs) {
+            (void)value;
+            if (key.find(display_region_postfix) != std::string::npos) {
+                return true;
+            }
+        }
+        return false;
+    };
+
     if (m_aggregation_mode == AggregationMode::PER_REGION ||
         m_aggregation_mode == AggregationMode::BOTH) {
         for (int region = 0; region < static_cast<int>(m_num_regions); ++region) {
             auto mesh = m_map_submesh[region];
             std::string region_postfix = "region_" + std::to_string(region + 1);
             std::string display_region_postfix = " " + m_sim_state->GetRegionDisplayName(region);
+            if (!has_registered_fields(display_region_postfix)) {
+                continue;
+            }
             fs::path output_dir = output_dir_base / region_postfix;
             fs::path output_dir_vizs = output_dir / m_file_manager->GetBaseFilename();
-            if (m_sim_state->IsRegionActive(region)) {
-                auto region_comm = m_sim_state->GetRegionCommunicator(region);
-                m_file_manager->EnsureDirectoryExists(output_dir, region_comm);
-            }
+            // The subsequent DataCollection::Save() is a parallel operation on the submesh's
+            // communicator, which is still the parent MPI communicator. Prepare directories on
+            // that same communicator so all participating ranks observe the same path state.
+            m_file_manager->EnsureDirectoryExists(output_dir, MPI_COMM_WORLD);
             std::vector<std::string> dcs_keys;
             if (options.visualization.visit) {
                 std::string key = visit_key + region_postfix;
@@ -1534,6 +1548,9 @@ void PostProcessingDriver::InitializeDataCollections(ExaOptions& options) {
 
         std::string region_postfix = "global";
         std::string display_region_postfix = " " + m_sim_state->GetRegionDisplayName(-1);
+        if (!has_registered_fields(display_region_postfix)) {
+            return;
+        }
         fs::path output_dir = output_dir_base / region_postfix;
         fs::path output_dir_vizs = output_dir / m_file_manager->GetBaseFilename();
         m_file_manager->EnsureDirectoryExists(output_dir);
diff --git a/src/postprocessing/postprocessing_file_manager.hpp b/src/postprocessing/postprocessing_file_manager.hpp
index f070029..3784f31 100644
--- a/src/postprocessing/postprocessing_file_manager.hpp
+++ b/src/postprocessing/postprocessing_file_manager.hpp
@@ -186,7 +186,7 @@ class PostProcessingFileManager {
         auto filepath = GetVolumeAverageFilePath(calc_type, region, region_name);
 
         bool file_exists = fs::exists(filepath);
-        auto file = CreateOutputFile(filepath, true);
+        auto file = CreateOutputFile(filepath, true, comm);
 
         if (file && file->is_open()) {
             if (!file_exists) {
@@ -452,6 +452,7 @@ inline bool PostProcessingFileManager::EnsureDirectoryExists(fs::path& output_di
     int rank;
     MPI_Comm_rank(comm, &rank);
     bool success = false;
+    std::string path_str;
     if (rank == 0) {
         try {
             // Use weakly_canonical to resolve as much as possible
@@ -474,6 +475,7 @@ inline bool PostProcessingFileManager::EnsureDirectoryExists(fs::path& output_di
                 } else {
                     std::cout << "Using existing directory: " << canonical_path << std::endl;
                     output_dir = canonical_path;
+                    path_str = canonical_path.string();
                     success = true;
                 }
             } else {
@@ -482,6 +484,8 @@ inline bool PostProcessingFileManager::EnsureDirectoryExists(fs::path& output_di
                 success = fs::create_directories(canonical_path);
                 if (success) {
                     output_dir = canonical_path;
+                    path_str = canonical_path.string();
+
                 } else {
                     std::cerr << "Warning: Failed to create output directory: " << canonical_path
                               << std::endl;
@@ -513,15 +517,17 @@ inline bool PostProcessingFileManager::EnsureDirectoryExists(fs::path& output_di
     }
 
     // Broadcast the potentially updated output_dir to all ranks
-    std::string path_str = output_dir.string();
     int dir_length = static_cast<int>(path_str.length());
     MPI_Bcast(&dir_length, 1, MPI_INT, 0, comm);
-    path_str.resize(static_cast<size_t>(dir_length));
-    MPI_Bcast(&path_str[0], dir_length, MPI_CHAR, 0, comm);
-    output_dir = path_str;
+    if (dir_length > 0) {
+        path_str.resize(static_cast<size_t>(dir_length));
+        MPI_Bcast(path_str.data(), dir_length, MPI_CHAR, 0, comm);
+        output_dir = path_str;
+    }
 
     bool success_t = false;
-    MPI_Allreduce(&success, &success_t, 1, MPI_C_BOOL, MPI_LOR, comm);
+    MPI_Bcast(&success, 1, MPI_C_BOOL, 0, comm);
+    success_t = success;
     return success_t;
 }
 
diff --git a/src/solvers/trust_region_solver.cpp b/src/solvers/trust_region_solver.cpp
new file mode 100644
index 0000000..77fd9fe
--- /dev/null
+++ b/src/solvers/trust_region_solver.cpp
@@ -0,0 +1,350 @@
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and
+// other ExaConstit Project Developers. See the top-level LICENSE file for details.
+//
+// SPDX-License-Identifier: MIT
+
+#include "solvers/trust_region_solver.hpp"
+
+#include "utilities/mechanics_log.hpp"
+#include "utilities/unified_logger.hpp"
+
+#include "mfem.hpp"
+#include "mfem/general/globals.hpp"
+#include "mfem/linalg/linalg.hpp"
+
+#include <algorithm>
+#include <cmath>
+#include <iomanip>
+#include <iostream>
+
+/**
+ * @brief Compute the Powell dogleg step inside the trust region.
+ *
+ * @details Step-by-step algorithm:
+ *
+ * 1. **Full Newton step inside trust region**:
+ *    If ||s_N|| <= delta, take the full Newton step. The predicted residual
+ *    is zero (the linear model F + J*s_N = 0 is exactly satisfied).
+ *
+ * 2. **Cauchy point outside trust region**:
+ *    Compute the Cauchy point parameters:
+ *       - alpha = ||g||^2 / ||J*g||^2   (optimal scaling along steepest descent)
+ *       - ||s_sd_opt|| = alpha * ||g||  (norm of the optimal Cauchy step)
+ *    If ||s_sd_opt|| >= delta, the optimal Cauchy point is outside the trust
+ *    region. Step along the steepest descent direction to the boundary:
+ *       delx = -delta * g / ||g||
+ *    The predicted residual norm is computed from the linear model evaluated
+ *    at this truncated Cauchy step.
+ *
+ * 3. **Dogleg interpolation (second leg)**:
+ *    Otherwise, interpolate along the line segment from the Cauchy point to
+ *    the Newton point, finding the parameter beta in [0, 1] such that the
+ *    interpolated step lies on the trust-region boundary. The intersection
+ *    is found by solving a quadratic:
+ *       delx(beta) = beta * s_N - (1 - beta) * alpha * g
+ *       ||delx(beta)||^2 = delta^2
+ *    yielding qa*beta^2 - 2*qb*beta + qc = 0 where:
+ *       qa = ||p||^2,   qb = alpha * (p . g),   qc = ||s_sd_opt||^2 - delta^2
+ *       and p = s_N + alpha * g.
+ *    Beta is taken from the larger root and clamped to [0, 1] for safety.
+ */
+void ExaTrustRegionSolver::Dogleg(double delta, double res_0, double nr_norm,
+                                  double Jg_2, const mfem::Vector &grad,
+                                  const mfem::Vector &nrStep, mfem::Vector &delx,
+                                  double &pred_resid, bool &use_nr) const
+{
+   use_nr = false;
+
+   // --- Case 1: Full Newton step fits inside the trust region ---
+   if (nr_norm <= delta) {
+      use_nr = true;
+      delx = nrStep;
+      pred_resid = 0.0;
+
+      if (print_level > 0) {
+         mfem::out << "TR dogleg: taking full Newton step (||s_N|| = "
+                   << nr_norm << " <= delta = " << delta << ")\n";
+      }
+      return;
+   }
+
+   // Cauchy point parameters using MPI-aware dot products
+   const double norm2_grad = Dot(grad, grad);
+   const double norm_grad = std::sqrt(norm2_grad);
+
+   const double alpha = (Jg_2 > 0.0) ? (norm2_grad / Jg_2) : 1.0;
+   const double norm_grad_inv = (norm_grad > 0.0) ? (1.0 / norm_grad) : 1.0;
+   const double norm_s_sd_opt = alpha * norm_grad;
+
+   // --- Case 2: Cauchy point is outside the trust region ---
+   // Take a step along the steepest descent direction to the trust-region boundary
+   if (norm_s_sd_opt >= delta) {
+      // delx = -delta * (grad / ||grad||)
+      const double factor = -delta * norm_grad_inv;
+      delx = grad;
+      delx *= factor;
+
+      // Predicted residual from linear model at the truncated Cauchy step
+      const double val = -(delta * norm_grad) +
+                         0.5 * delta * delta * Jg_2 *
+                         (norm_grad_inv * norm_grad_inv);
+      pred_resid = std::sqrt(std::max(2.0 * val + res_0 * res_0, 0.0));
+
+      if (print_level > 0) {
+         mfem::out << "TR dogleg: stepping along first leg (steepest descent)\n";
+      }
+   }
+   // --- Case 3: Cauchy inside, Newton outside; interpolate along the second leg ---
+   else {
+      // Reuse delx as workspace for p = nrStep + alpha * grad
+      mfem::Vector &p = delx;
+      add(nrStep, alpha, grad, p);
+
+      // Quadratic coefficients for the trust-region boundary intersection
+      double qa = Dot(p, p);
+      double qb = Dot(p, grad) * alpha;
+      double qc = norm_s_sd_opt * norm_s_sd_opt - delta * delta;
+
+      double discriminant = qb * qb - qa * qc;
+      double beta = (qa > 0.0)
+         ? (qb + std::sqrt(std::max(discriminant, 0.0))) / qa
+         : 0.0;
+
+      // Clamp beta to [0, 1] to handle any roundoff at the boundary
+      beta = std::max(0.0, std::min(1.0, beta));
+
+      // delx = beta * nrStep - (1 - beta) * alpha * grad
+      const double omb = 1.0 - beta;
+      const double omba = omb * alpha;
+      add(beta, nrStep, -omba, grad, delx);
+
+      // Predicted residual from linear model at the dogleg step
+      const double res_cauchy = (Jg_2 > 0.0)
+         ? std::sqrt(std::max(res_0 * res_0 - alpha * norm2_grad, 0.0))
+         : res_0;
+      pred_resid = omb * res_cauchy;
+
+      if (print_level > 0) {
+         mfem::out << "TR dogleg: stepping along second leg (beta = "
+                   << beta << ")\n";
+      }
+   }
+}
+
+/**
+ * @brief Trust-region dogleg Newton iteration implementation.
+ *
+ * @details Step-by-step algorithm for solving F(x) = b:
+ *
+ * **Initial setup**:
+ *   1. Validate that operator (oper_mech), preconditioner (prec_mech), and
+ *      delta_ctrl are properly configured
+ *   2. Allocate all device-aware working vectors (nrStep, grad, delx, Jg_temp,
+ *      x_prev) once before the iteration loop
+ *   3. Evaluate initial residual r = F(x) - b and compute its norm
+ *   4. Set the convergence threshold norm_max = max(rel_tol * res, abs_tol)
+ *   5. Initialize trust-region radius delta from delta_ctrl.deltaInit
+ *
+ * **Main iteration loop** (until convergence or max_iter):
+ *   1. If the previous step was *not* rejected, recompute Newton machinery:
+ *      a. Get Jacobian J = oper_mech->GetGradient(x). The material state is
+ *         consistent with x because Mult(x, r) was just evaluated.
+ *      b. Compute steepest descent: grad = J^T * r (gradient of f = 0.5 ||F||^2)
+ *      c. Compute Jg_2 = ||J * grad||^2 for the optimal Cauchy step length
+ *      d. Solve the Newton system J*c = r via the Krylov solver (prec_mech),
+ *         then negate: nrStep = -c. The negation matches SNLS convention where
+ *         the Newton update is x += nrStep (whereas ExaNewtonSolver uses x -= c).
+ *      e. Compute nr_norm = ||nrStep||
+ *      If the previous step *was* rejected, all of this data is still valid
+ *      from the last accepted iteration and we just recompute the dogleg with
+ *      the smaller delta.
+ *   2. Save x_prev = x for potential rollback on rejection
+ *   3. Compute the dogleg step delx via Dogleg() helper
+ *   4. Apply the trial step: x = x_prev + delx
+ *   5. Evaluate residual at the trial point: r = F(x) - b
+ *   6. Check convergence: if ||r|| <= norm_max, accept and exit
+ *   7. Update delta via delta_ctrl.UpdateDelta() based on actual vs predicted
+ *      reduction. This may also flag the step for rejection.
+ *   8. If rejected: restore x = x_prev, restore residual norm, set reject_prev.
+ *      The material state inside the model handles itself analogously to the
+ *      ExaNewtonLSSolver line-search behavior — when Mult() is called again at
+ *      the next trial point, the model recomputes from the beginning-step state.
+ *
+ * **Performance Profiling**:
+ *   - "TR_dogleg_solver" scope for overall trust-region solver performance
+ *   - "TR_newton_setup" scope for J^T*r and J*g computations
+ *   - "TR_gradient_transpose" scope for the J^T*r call specifically
+ *   - "TR_newton_solve" scope for the Krylov inner solve
+ *   - "TR_trial_eval" scope for residual evaluations at trial points
+ *   - "krylov_solver" scope for the actual Krylov solver call
+ *
+ * @note All scalar quantities (norms, dot products) use MFEM's MPI-aware
+ *       Norm() and Dot() functions through the IterativeSolver base class
+ */
+void ExaTrustRegionSolver::Mult(const mfem::Vector &b, mfem::Vector &x) const
+{
+   CALI_CXX_MARK_SCOPE("TR_dogleg_solver");
+   MFEM_ASSERT_0(oper_mech, "the Operator is not set (use SetOperator).");
+   MFEM_ASSERT_0(prec_mech, "the Solver is not set (use SetSolver).");
+   MFEM_ASSERT(delta_ctrl.Validate(), "TrDeltaControl parameters are invalid.");
+
+   const bool have_b = (b.Size() == Height());
+
+   // --- Allocate working vectors once, reused across iterations ---
+   mfem::Vector nrStep(width, mfem::Device::GetMemoryType());
+   mfem::Vector grad(width, mfem::Device::GetMemoryType());
+   mfem::Vector delx(width, mfem::Device::GetMemoryType());
+   mfem::Vector Jg_temp(width, mfem::Device::GetMemoryType());
+   mfem::Vector x_prev(width, mfem::Device::GetMemoryType());
+
+   nrStep.UseDevice(true);
+   grad.UseDevice(true);
+   delx.UseDevice(true);
+   Jg_temp.UseDevice(true);
+   x_prev.UseDevice(true);
+
+   // --- Initial residual evaluation: r = F(x) - b ---
+   oper_mech->Mult(x, r);
+   if (have_b) { r -= b; }
+
+   double res = Norm(r);
+   double res_0 = res;
+   const double norm_max = std::max(rel_tol * res, abs_tol);
+
+   if (print_level >= 0) {
+      mfem::out << "TR dogleg: initial ||r|| = " << res << "\n";
+   }
+
+   if (res <= norm_max) {
+      converged = true;
+      final_iter = 0;
+      final_norm = res;
+      return;
+   }
+
+   // --- Initialize trust-region state ---
+   double delta = delta_ctrl.deltaInit;
+   double rho = 0.0;
+   bool reject_prev = false;
+
+   // Persisted across iterations when a step is not rejected
+   double Jg_2 = 0.0;
+   double nr_norm = 0.0;
+
+   int it = 0;
+   converged = false;
+
+   // --- Main iteration loop ---
+   while (it < max_iter) {
+      it++;
+
+      // If the previous step was not rejected, recompute Newton direction
+      // and steepest descent direction at the current x. The Jacobian data
+      // is current because oper_mech->Mult(x, r) was just called.
+      if (!reject_prev) {
+         CALI_CXX_MARK_SCOPE("TR_newton_setup");
+
+         mfem::Operator &J = oper_mech->GetGradient(x);
+
+         // Steepest descent direction: grad = J^T * r
+         // This is the gradient of the merit function f(x) = 0.5 * ||F(x)||^2
+         {
+            CALI_CXX_MARK_SCOPE("TR_gradient_transpose");
+            J.MultTranspose(r, grad);
+         }
+
+         // Compute ||J * grad||^2 for the optimal Cauchy step length
+         //    alpha_cauchy = ||grad||^2 / ||J*grad||^2
+         {
+            J.Mult(grad, Jg_temp);
+            Jg_2 = Dot(Jg_temp, Jg_temp);
+         }
+
+         // Solve Newton system: J * c = r, then nrStep = -c
+         // CGSolver follows the same convention as ExaNewtonSolver where the
+         // Krylov solve produces c such that the Newton update would be x -= c.
+         // For the dogleg we need nrStep = -J^{-1}*r, so we negate after the solve.
+         {
+            CALI_CXX_MARK_SCOPE("TR_newton_solve");
+            c = 0.0;
+            this->CGSolver(J, r, c);
+            nrStep = c;
+            nrStep.Neg();
+         }
+
+         nr_norm = Norm(nrStep);
+      }
+
+      // Save state for potential step rejection
+      x_prev = x;
+
+      // Compute the dogleg step
+      double pred_resid = 0.0;
+      bool use_nr = false;
+      Dogleg(delta, res_0, nr_norm, Jg_2, grad, nrStep,
+             delx, pred_resid, use_nr);
+
+      // Apply the trial step: x = x_prev + delx
+      x = x_prev;
+      x += delx;
+
+      // Evaluate residual at the trial point
+      reject_prev = false;
+      {
+         CALI_CXX_MARK_SCOPE("TR_trial_eval");
+         oper_mech->Mult(x, r);
+         if (have_b) { r -= b; }
+      }
+
+      res = Norm(r);
+
+      if (print_level >= 0) {
+         mfem::out << "TR dogleg: iter " << it
+                   << ", ||r|| = " << res
+                   << ", delta = " << delta
+                   << (use_nr ? " [NR]" : " [DL]")
+                   << "\n";
+      }
+
+      // Check convergence
+      if (res <= norm_max) {
+         converged = true;
+         break;
+      }
+
+      // Update delta from actual vs predicted reduction. May flag for rejection.
+      bool delta_ok = delta_ctrl.UpdateDelta(
+         delta, res, res_0, pred_resid, reject_prev,
+         use_nr, nr_norm, rho, print_level);
+
+      if (!delta_ok) {
+         if (print_level >= 0) {
+            mfem::out << "TR dogleg: delta control failure at iter " << it << "\n";
+         }
+         converged = false;
+         break;
+      }
+
+      // If the step is rejected, revert x and residual.
+      // On the next iteration, reject_prev == true so we skip the Newton solve
+      // and recompute the dogleg with the updated (smaller) delta. The Jacobian,
+      // grad, nrStep, and Jg_2 are still valid from the last accepted state.
+      if (reject_prev) {
+         if (print_level > 0) {
+            mfem::out << "TR dogleg: rejecting step, reverting to previous state\n";
+         }
+         x = x_prev;
+         res = res_0;
+      }
+
+      res_0 = res;
+   }
+
+   final_iter = it;
+   final_norm = res;
+
+   if (!converged && print_level >= 0) {
+      mfem::out << "TR dogleg: failed to converge in " << it
+                << " iterations, final ||r|| = " << res << "\n";
+   }
+}
\ No newline at end of file
diff --git a/src/solvers/trust_region_solver.hpp b/src/solvers/trust_region_solver.hpp
new file mode 100644
index 0000000..43950d3
--- /dev/null
+++ b/src/solvers/trust_region_solver.hpp
@@ -0,0 +1,357 @@
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and
+// other ExaConstit Project Developers. See the top-level LICENSE file for details.
+//
+// SPDX-License-Identifier: MIT
+#pragma once
+
+#include "solvers/mechanics_solver.hpp"
+
+#include "mfem.hpp"
+#include "mfem/linalg/solvers.hpp"
+
+#include <cmath>
+#include <algorithm>
+#include <memory>
+
+/**
+ * @brief Trust-region radius control parameters for the dogleg solver.
+ *
+ * @details Ported from SNLS's TrDeltaControl. Controls how the trust-region
+ * radius delta is updated based on the ratio rho = actual_reduction / predicted_reduction.
+ *
+ * The update logic:
+ * - If rho is in the "good" band [xiLG, xiUG] and the step reduced the residual,
+ *   increase delta (unless the full Newton step was taken)
+ * - If rho is outside the "ok" band [xiLO, xiUO], decrease delta
+ * - If the predicted change is zero and delta is not at max, force a small increase
+ * - If the residual actually increased, reject the step
+ *
+ * @ingroup ExaConstit_solvers
+ */
+struct TrDeltaControl
+{
+   /// @brief Lower bound of the "good" rho interval (increase delta when rho > xiLG)
+   double xiLG = 0.75;
+   /// @brief Upper bound of the "good" rho interval
+   double xiUG = 1.4;
+   /// @brief Factor by which to increase delta
+   double xiIncDelta = 1.5;
+   /// @brief Lower bound of the "ok" rho interval (decrease delta when rho < xiLO)
+   double xiLO = 0.35;
+   /// @brief Upper bound of the "ok" rho interval (decrease delta when rho > xiUO)
+   double xiUO = 5.0;
+   /// @brief Factor by which to decrease delta
+   double xiDecDelta = 0.25;
+   /// @brief Forced increase factor when predicted change is zero
+   double xiForcedIncDelta = 1.2;
+   /// @brief Initial trust-region radius
+   double deltaInit = 1.0;
+   /// @brief Minimum allowed trust-region radius (solver fails if hit)
+   double deltaMin = 1e-12;
+   /// @brief Maximum allowed trust-region radius
+   double deltaMax = 1e4;
+   /// @brief Whether to reject steps that increase the residual
+   bool rejectResIncrease = true;
+
+   /**
+    * @brief Validate that the control parameters are self-consistent.
+    *
+    * @return true if all parameter relationships are valid, false otherwise
+    *
+    * Verifies the following invariants:
+    * - deltaMin > 0 and deltaMax > deltaMin
+    * - The "good" rho band [xiLG, xiUG] sits inside the "ok" band [xiLO, xiUO]
+    * - The increase factor (xiIncDelta) is greater than 1
+    * - The decrease factor (xiDecDelta) is in (0, 1)
+    * - The forced-increase factor is greater than 1
+    */
+   bool Validate() const
+   {
+      return (deltaMin > 0.0) &&
+             (deltaMax > deltaMin) &&
+             (xiLG > xiLO) &&
+             (xiUG < xiUO) &&
+             (xiIncDelta > 1.0) &&
+             (xiDecDelta > 0.0 && xiDecDelta < 1.0) &&
+             (xiForcedIncDelta > 1.0);
+   }
+
+   /**
+    * @brief Decrease the trust-region radius after a rejected/poor step.
+    *
+    * @param[in,out] delta Current radius, modified on output
+    * @param[in] norm_full Norm of the full Newton step
+    * @param[in] took_full Whether the full Newton step was used at the last iteration
+    * @param[in] print_level Verbosity level for output
+    * @return true if delta is still above deltaMin, false if solver should fail
+    *
+    * @details If the full Newton step was taken, uses a geometric mean blend of
+    * the current delta and the Newton step norm scaled by xiDecDelta. Otherwise
+    * just multiplies delta by xiDecDelta. Returns false (and sets delta to deltaMin)
+    * if the resulting delta drops below the minimum allowed value.
+    */
+   bool DecrDelta(double &delta, double norm_full, bool took_full,
+                  int print_level = 0) const
+   {
+      if (took_full) {
+         double tempa = delta * xiDecDelta;
+         double tempb = norm_full * xiDecDelta;
+         delta = std::sqrt(tempa * tempb);
+      }
+      else {
+         delta *= xiDecDelta;
+      }
+
+      if (delta < deltaMin) {
+         delta = deltaMin;
+         if (print_level >= 0) {
+            mfem::out << "TR: delta at minimum " << delta << "\n";
+         }
+         return false;
+      }
+
+      if (print_level > 0) {
+         mfem::out << "TR: decreased delta to " << delta << "\n";
+      }
+      return true;
+   }
+
+   /**
+    * @brief Increase the trust-region radius after a successful step.
+    *
+    * @param[in,out] delta Current radius, modified on output
+    * @param[in] print_level Verbosity level for output
+    *
+    * @details Multiplies delta by xiIncDelta and clamps at deltaMax.
+    */
+   void IncrDelta(double &delta, int print_level = 0) const
+   {
+      delta *= xiIncDelta;
+      if (delta > deltaMax) {
+         delta = deltaMax;
+         if (print_level > 0) {
+            mfem::out << "TR: delta at maximum " << delta << "\n";
+         }
+      }
+      else if (print_level > 0) {
+         mfem::out << "TR: increased delta to " << delta << "\n";
+      }
+   }
+
+   /**
+    * @brief Update trust-region radius based on actual vs predicted residual change.
+    *
+    * @param[in,out] delta Trust-region radius, modified on output
+    * @param[in] res New residual norm (after the candidate step)
+    * @param[in] res_0 Previous residual norm (before the candidate step)
+    * @param[in] pred_resid Predicted residual norm from the dogleg model
+    * @param[out] reject Whether the step should be rejected (residual increased)
+    * @param[in] took_full Whether the full Newton step was taken
+    * @param[in] norm_full Norm of the full Newton step
+    * @param[out] rho Actual / predicted reduction ratio (output for diagnostics)
+    * @param[in] print_level Verbosity level for output
+    * @return true if the delta update succeeded, false if the solver should fail
+    *
+    * @details Algorithm (ported from SNLS TrDeltaControl::updateDelta):
+    *   1. Compute actual_change = res - res_0 and pred_change = pred_resid - res_0
+    *   2. If pred_change is exactly zero, force delta larger (or fail if at max)
+    *   3. Otherwise compute rho = actual_change / pred_change
+    *   4. If rho is in the "good" band [xiLG, xiUG] and the residual decreased,
+    *      increase delta (unless the full Newton step was already taken)
+    *   5. If rho is outside the "ok" band [xiLO, xiUO], decrease delta
+    *   6. If the residual increased and rejectResIncrease is set, mark for rejection
+    */
+   bool UpdateDelta(double &delta, double res, double res_0,
+                    double pred_resid, bool &reject, bool took_full,
+                    double norm_full, double &rho,
+                    int print_level = 0) const
+   {
+      bool success = true;
+      double actual_change = res - res_0;
+      double pred_change = pred_resid - res_0;
+
+      if (pred_change == 0.0) {
+         if (delta >= deltaMax) {
+            if (print_level >= 0) {
+               mfem::out << "TR: predicted change is zero and delta at max\n";
+            }
+            success = false;
+         }
+         else {
+            if (print_level > 0) {
+               mfem::out << "TR: predicted change is zero, forcing delta larger\n";
+            }
+            delta = std::min(delta * xiForcedIncDelta, deltaMax);
+         }
+      }
+      else {
+         rho = actual_change / pred_change;
+         if (print_level > 0) {
+            mfem::out << "TR: rho = " << rho << "\n";
+         }
+
+         if ((rho > xiLG) && (actual_change < 0.0) && (rho < xiUG)) {
+            // Step is in the "good" band and residual actually decreased
+            if (!took_full) {
+               IncrDelta(delta, print_level);
+            }
+         }
+         else if ((rho < xiLO) || (rho > xiUO)) {
+            // Step quality is outside the acceptable band; shrink delta
+            success = DecrDelta(delta, norm_full, took_full, print_level);
+         }
+      }
+
+      reject = false;
+      // Do not make this >=, may have res and res_0 both zero and that is ok
+      if ((actual_change > 0.0) && rejectResIncrease) {
+         reject = true;
+      }
+
+      return success;
+   }
+};
+
+/**
+ * @brief Trust-region dogleg solver for nonlinear solid mechanics problems.
+ *
+ * @details This class implements a Powell-dogleg trust-region method for solving
+ * nonlinear systems F(x) = b. It extends ExaNewtonSolver and reuses the same
+ * Krylov solver infrastructure (prec_mech) for computing the Newton direction.
+ *
+ * The trust-region method augments standard Newton with a globalization strategy
+ * that interpolates between the steepest descent direction and the full Newton
+ * step, constrained to a trust-region radius delta. Step quality is monitored
+ * via the ratio rho = actual_reduction / predicted_reduction, and delta is
+ * adjusted up or down accordingly.
+ *
+ * This is a direct port of SNLS's SNLSTrDlDenseG solver, lifted from the
+ * material-point dense system to the global FE system.
+ *
+ * Algorithm at each iteration:
+ *   1. Compute steepest descent direction g = J^T * r (gradient of merit f = 0.5 ||F||^2)
+ *   2. Compute ||J*g||^2 for the optimal Cauchy step length
+ *   3. Solve J * c = r for the full Newton direction (using prec_mech Krylov solver)
+ *   4. Compute the dogleg step within the trust region
+ *   5. Evaluate the residual at the trial point
+ *   6. Accept or reject based on the rho ratio; update delta accordingly
+ *
+ * Requirements:
+ * - The gradient operator must support MultTranspose (for J^T*r computation).
+ *   This means the assembly mode must be EA, FA, or PA with the native PA
+ *   transpose kernels enabled.
+ *
+ * @ingroup ExaConstit_solvers
+ */
+class ExaTrustRegionSolver : public ExaNewtonSolver
+{
+   public:
+      /**
+       * @brief Default constructor
+       *
+       * @details Creates an ExaTrustRegionSolver instance for single-processor
+       * execution. The operator and linear solver must be set separately using
+       * SetOperator() and SetSolver(), and the trust-region control parameters
+       * may be customized via SetTrustRegionControl().
+       */
+      ExaTrustRegionSolver() { }
+
+#ifdef MFEM_USE_MPI
+      /**
+       * @brief MPI constructor
+       *
+       * @param _comm MPI communicator for parallel execution
+       *
+       * @details Creates an ExaTrustRegionSolver instance for parallel execution
+       * using the specified MPI communicator. All trust-region scalar quantities
+       * (norms, dot products) use MPI-aware reductions through MFEM's Dot/Norm.
+       */
+      ExaTrustRegionSolver(MPI_Comm _comm) : ExaNewtonSolver(_comm) { }
+#endif
+
+      /** @brief Use parent class SetOperator methods */
+      using ExaNewtonSolver::SetOperator;
+
+      /** @brief Use parent class SetSolver methods */
+      using ExaNewtonSolver::SetSolver;
+
+      /** @brief Use parent class CGSolver method (Krylov solve wrapper) */
+      using ExaNewtonSolver::CGSolver;
+
+      /**
+       * @brief Set trust-region control parameters.
+       *
+       * @param ctrl TrDeltaControl struct with all tuning parameters
+       *
+       * @details Replaces the internal control parameters with a user-supplied
+       * configuration. Typically called after construction (and before Mult())
+       * to wire up parameters parsed from the TOML configuration file.
+       */
+      void SetTrustRegionControl(const TrDeltaControl &ctrl)
+      {
+         delta_ctrl = ctrl;
+      }
+
+      /**
+       * @brief Get a mutable reference to the trust-region control parameters.
+       * @return Reference to the internal TrDeltaControl
+       */
+      TrDeltaControl& GetTrustRegionControl() { return delta_ctrl; }
+
+      /**
+       * @brief Get a const reference to the trust-region control parameters.
+       * @return Const reference to the internal TrDeltaControl
+       */
+      const TrDeltaControl& GetTrustRegionControl() const { return delta_ctrl; }
+
+      /**
+       * @brief Solve the nonlinear system F(x) = b using trust-region dogleg method.
+       *
+       * @param b Right-hand side vector (if b.Size() != Height(), assumes b = 0)
+       * @param x Solution vector (input: initial guess, output: converged solution)
+       *
+       * @details Implements the trust-region dogleg algorithm. See class-level
+       * documentation for the algorithm description. The Newton direction is
+       * computed by the Krylov solver wired in via SetSolver(); J^T*r is
+       * computed by calling MultTranspose() on the gradient operator.
+       *
+       * @pre SetOperator() and SetSolver() must be called before Mult()
+       * @pre The gradient operator must support MultTranspose (EA/FA mode, or
+       *      PA mode with native transpose kernels)
+       *
+       * @post final_iter contains the number of iterations performed
+       * @post final_norm contains the final residual norm
+       * @post converged flag indicates whether the solver converged
+       */
+      virtual void Mult(const mfem::Vector &b, mfem::Vector &x) const;
+
+   private:
+      /**
+       * @brief Compute the dogleg step given the current trust-region radius.
+       *
+       * @param[in] delta Trust-region radius
+       * @param[in] res_0 Current residual norm
+       * @param[in] nr_norm Norm of the full Newton step
+       * @param[in] Jg_2 ||J*g||^2 where g is the steepest descent direction
+       * @param[in] grad Steepest descent direction g = J^T * r
+       * @param[in] nrStep Full Newton step
+       * @param[out] delx The computed dogleg step
+       * @param[out] pred_resid Predicted residual norm after the step
+       * @param[out] use_nr Whether the full Newton step was taken
+       *
+       * @details Ported from SNLS's dogleg() kernel. The dogleg path interpolates
+       * between the steepest descent direction (Cauchy point) and the full Newton
+       * step. Three cases are handled:
+       *   - Newton step inside delta: take full Newton step
+       *   - Cauchy point outside delta: step along steepest descent to boundary
+       *   - Cauchy inside, Newton outside: solve quadratic for the dogleg leg
+       *     intersection with the trust-region boundary
+       */
+      void Dogleg(double delta, double res_0, double nr_norm,
+                  double Jg_2, const mfem::Vector &grad,
+                  const mfem::Vector &nrStep, mfem::Vector &delx,
+                  double &pred_resid, bool &use_nr) const;
+
+      /// @brief Trust-region control parameters (mutable to allow tuning)
+      mutable TrDeltaControl delta_ctrl;
+};
\ No newline at end of file
diff --git a/src/system_driver.cpp b/src/system_driver.cpp
index 15f4e2b..d7a6934 100644
--- a/src/system_driver.cpp
+++ b/src/system_driver.cpp
@@ -3,6 +3,7 @@
 
 #include "boundary_conditions/BCData.hpp"
 #include "boundary_conditions/BCManager.hpp"
+#include "solvers/trust_region_solver.hpp"
 #include "utilities/mechanics_kernels.hpp"
 #include "utilities/mechanics_log.hpp"
 #include "utilities/unified_logger.hpp"
@@ -45,6 +46,13 @@ void DirBdrFunc(int attr_id, mfem::Vector& y) {
 
 namespace {
 
+void GetTrueDofsParallel(const mfem::ParGridFunction& gf, mfem::Vector& true_dofs) {
+    // used to do something like:
+    // gf.GetTrueDofs(true_dofs);
+    // but looks like there are issues with that on the GPUs with newer versions of MFEM
+    gf.ParallelAverage(true_dofs);
+}
+
 /**
  * @brief Helper function to find mesh bounding box for velocity gradient calculations
  *
@@ -290,29 +298,11 @@ SystemDriver::SystemDriver(std::shared_ptr<SimulationState> sim_state)
     } else {
         if (linear_solvers.preconditioner == PreconditionerType::AMG) {
             auto prec_amg = std::make_shared<mfem::HypreBoomerAMG>();
-            HYPRE_Solver h_amg = static_cast<HYPRE_Solver>(*prec_amg);
-            HYPRE_Real st_val = 0.90;
-            HYPRE_Real rt_val = -10.0;
-            // HYPRE_Real om_val = 1.0;
-            //
-            [[maybe_unused]] int ml = HYPRE_BoomerAMGSetMaxLevels(h_amg, 30);
-            ml = HYPRE_BoomerAMGSetCoarsenType(h_amg, 0);
-            ml = HYPRE_BoomerAMGSetMeasureType(h_amg, 0);
-            ml = HYPRE_BoomerAMGSetStrongThreshold(h_amg, st_val);
-            ml = HYPRE_BoomerAMGSetNumSweeps(h_amg, 3);
-            ml = HYPRE_BoomerAMGSetRelaxType(h_amg, 8);
-            // int rwt = HYPRE_BoomerAMGSetRelaxWt(h_amg, rt_val);
-            // int ro = HYPRE_BoomerAMGSetOuterWt(h_amg, om_val);
-            // Dimensionality of our problem
-            ml = HYPRE_BoomerAMGSetNumFunctions(h_amg, 3);
-            ml = HYPRE_BoomerAMGSetSmoothType(h_amg, 6);
-            ml = HYPRE_BoomerAMGSetSmoothNumLevels(h_amg, 3);
-            ml = HYPRE_BoomerAMGSetSmoothNumSweeps(h_amg, 3);
-            ml = HYPRE_BoomerAMGSetVariant(h_amg, 0);
-            ml = HYPRE_BoomerAMGSetOverlap(h_amg, 0);
-            ml = HYPRE_BoomerAMGSetDomainType(h_amg, 1);
-            ml = HYPRE_BoomerAMGSetSchwarzRlxWeight(h_amg, rt_val);
-
+            const int problem_dim = m_sim_state->GetMesh()->SpaceDimension();
+            const bool order_bynodes = (fe_space->GetOrdering() == mfem::Ordering::byNODES);
+            // Use MFEM's supported systems-AMG configuration so Hypre sees
+            // the correct vector-valued DOF ordering on newer MFEM/Hypre builds.
+            prec_amg->SetSystemsOptions(problem_dim, order_bynodes);
             prec_amg->SetPrintLevel(linear_solvers.print_level);
             J_prec = prec_amg;
         } else if (linear_solvers.preconditioner == PreconditionerType::ILU) {
@@ -358,10 +348,47 @@ SystemDriver::SystemDriver(std::shared_ptr<SimulationState> sim_state)
     if (nonlinear_solver.nl_solver == NonlinearSolverType::NR) {
         newton_solver = std::make_unique<ExaNewtonSolver>(
             m_sim_state->GetMeshParFiniteElementSpace()->GetComm());
-    } else if (nonlinear_solver.nl_solver == NonlinearSolverType::NRLS) {
+    }
+    else if (nonlinear_solver.nl_solver == NonlinearSolverType::NRLS) {
         newton_solver = std::make_unique<ExaNewtonLSSolver>(
             m_sim_state->GetMeshParFiniteElementSpace()->GetComm());
     }
+    else if (nonlinear_solver.nl_solver == NonlinearSolverType::TRDOG) {
+        // Build the trust-region dogleg solver and configure delta-control
+        // parameters from the parsed TOML options. If the user did not supply
+        // a [trust_region] sub-table, the solver's internal defaults (matching
+        // SNLS's TrDeltaControl defaults) are used.
+        auto tr_solver = std::make_unique<ExaTrustRegionSolver>(
+            m_sim_state->GetMeshParFiniteElementSpace()->GetComm());
+
+        if (nonlinear_solver.trust_region.has_value()) {
+            const auto& tr_opts = nonlinear_solver.trust_region.value();
+            TrDeltaControl ctrl;
+            ctrl.deltaInit         = tr_opts.delta_init;
+            ctrl.deltaMin          = tr_opts.delta_min;
+            ctrl.deltaMax          = tr_opts.delta_max;
+            ctrl.xiLG              = tr_opts.xi_lg;
+            ctrl.xiUG              = tr_opts.xi_ug;
+            ctrl.xiLO              = tr_opts.xi_lo;
+            ctrl.xiUO              = tr_opts.xi_uo;
+            ctrl.xiIncDelta        = tr_opts.xi_inc;
+            ctrl.xiDecDelta        = tr_opts.xi_dec;
+            ctrl.xiForcedIncDelta  = tr_opts.xi_forced_inc;
+            ctrl.rejectResIncrease = tr_opts.reject_increase;
+            tr_solver->SetTrustRegionControl(ctrl);
+        }
+
+        newton_solver = std::move(tr_solver);
+
+        // Sanity check: TRDOG requires gradient transpose support (J^T*r). For
+        // PA mode, this requires the native PA transpose kernels in the
+        // integrator. EA and FULL always support transpose. We warn rather than
+        // hard-fail here because PA support exists once the kernels are wired.
+        if (options.solvers.assembly == AssemblyType::PA) {
+            mfem::out << "Note: TRDOG with PA assembly requires native PA transpose "
+                      << "kernels in the gradient operator.\n";
+        }
+    }
 
     // Set the newton solve parameters
     newton_solver->iterative_mode = true;
@@ -498,7 +525,7 @@ void SystemDriver::UpdateVelocity() {
                                                         // pulled off the
                                                         // VectorFunctionRestrictedCoefficient
         // populate the solution vector, v_sol, with the true dofs entries in v_cur.
-        velocity->GetTrueDofs(*vel_tdofs);
+        GetTrueDofsParallel(*velocity, *vel_tdofs);
     }
 
     if (ess_bdr["ess_vgrad"].Sum() > 0) {
@@ -587,7 +614,7 @@ void SystemDriver::UpdateVelocity() {
             mfem::Vector vel_tdof_tmp(*vel_tdofs);
             vel_tdof_tmp.UseDevice(true);
             vel_tdof_tmp = 0.0;
-            velocity->GetTrueDofs(vel_tdof_tmp);
+            GetTrueDofsParallel(*velocity, vel_tdof_tmp);
 
             mfem::Array<int> ess_tdofs(mech_operator->GetEssentialTrueDofs());
             if (!mono_def_flag) {
diff --git a/src/utilities/mechanics_kernels.hpp b/src/utilities/mechanics_kernels.hpp
index e7d139a..bcb21cf 100644
--- a/src/utilities/mechanics_kernels.hpp
+++ b/src/utilities/mechanics_kernels.hpp
@@ -542,7 +542,7 @@ double ComputeVolAvgTensorFilterFromPartial(const mfem::expt::PartialQuadratureF
 
     // Get the local-to-global element mapping and data layout info
     auto l2g = pqs->GetLocal2Global().Read();    // Maps local element index to global element index
-    auto loc_offsets = pqs->getOffsets().Read(); // Offsets for local data layout
+    auto loc_offsets = pqs->Offsets(mfem::QSpaceOffsetStorage::COMPRESSED).Read(); // Offsets for local data layout
     auto global_offsets = (pqs->GetGlobalOffset().Size() > 1)
                               ? pqs->GetGlobalOffset().Read()
                               : loc_offsets; // Offsets for global data layout
@@ -763,7 +763,7 @@ double ComputeVolAvgTensorFromPartial(const mfem::expt::PartialQuadratureFunctio
 
     // Get the local-to-global element mapping and data layout info
     auto l2g = pqs->GetLocal2Global().Read();    // Maps local element index to global element index
-    auto loc_offsets = pqs->getOffsets().Read(); // Offsets for local data layout
+    auto loc_offsets = pqs->Offsets(mfem::QSpaceOffsetStorage::COMPRESSED).Read(); // Offsets for local data layout
     auto global_offsets = (pqs->GetGlobalOffset().Size() > 1)
                               ? pqs->GetGlobalOffset().Read()
                               : loc_offsets; // Offsets for global data layout