diff --git a/scripts/install/common/dependency_versions.sh b/scripts/install/common/dependency_versions.sh index 86f22fe..94b5180 100644 --- a/scripts/install/common/dependency_versions.sh +++ b/scripts/install/common/dependency_versions.sh @@ -2,25 +2,25 @@ # Central version control for all dependencies # Portability libraries -export CAMP_VER="v2025.09.2" -export RAJA_VER="v2025.09.1" +export CAMP_VER="v2025.12.0" +export RAJA_VER="v2025.12.2" #export UMPIRE_VER="v2025.09.0" # For now we need something a little pass the v2025.09.0 release # for Umpire as we need a small bug fix for any build with Umpire -export UMPIRE_VER="54a1909e91ce9604328977974e9b1002bf9f8781" -export CHAI_VER="v2025.09.1" +export UMPIRE_VER="v2025.12.0" +export CHAI_VER="v2025.12.0" # Material models export EXACMECH_REPO="https://github.com/LLNL/ExaCMech.git" export EXACMECH_BRANCH="develop" # FEM infrastructure -export HYPRE_VER="v2.32.0" +export HYPRE_VER="v3.1.0" export METIS_VER="5.1.0" export METIS_URL="https://mfem.github.io/tpls/metis-${METIS_VER}.tar.gz" export MFEM_REPO="https://github.com/rcarson3/mfem.git" -export MFEM_BRANCH="exaconstit-dev" +export MFEM_BRANCH="exaconstit-latest" # Main application export EXACONSTIT_REPO="https://github.com/llnl/ExaConstit.git" diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 24e830a..af8cd1a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -21,6 +21,7 @@ set(EXACONSTIT_HEADERS postprocessing/mechanics_lightup.hpp sim_state/simulation_state.hpp solvers/mechanics_solver.hpp + solvers/trust_region_solver.hpp utilities/dynamic_function_loader.hpp utilities/mechanics_kernels.hpp utilities/mechanics_log.hpp @@ -59,6 +60,7 @@ set(EXACONSTIT_SOURCES postprocessing/mechanics_lightup.cpp sim_state/simulation_state.cpp solvers/mechanics_solver.cpp + solvers/trust_region_solver.cpp utilities/mechanics_kernels.cpp utilities/unified_logger.cpp ) diff --git a/src/boundary_conditions/BCData.cpp b/src/boundary_conditions/BCData.cpp index 3714bc1..334e650 100644 --- a/src/boundary_conditions/BCData.cpp +++ b/src/boundary_conditions/BCData.cpp @@ -64,7 +64,7 @@ void BCData::SetScales() { } } -void BCData::GetComponents(int id, mfem::Array& component) { +void BCData::GetComponents(int id, std::array& component) { switch (id) { case 0: component[0] = false; diff --git a/src/boundary_conditions/BCData.hpp b/src/boundary_conditions/BCData.hpp index 075e46b..184cb5e 100644 --- a/src/boundary_conditions/BCData.hpp +++ b/src/boundary_conditions/BCData.hpp @@ -5,6 +5,7 @@ #include "mfem.hpp" #include "mfem/linalg/vector.hpp" +#include #include /** @@ -101,6 +102,6 @@ class BCData { * - id = 6: (true, false, true) * - id = 7: (true, true, true) */ - static void GetComponents(int id, mfem::Array& component); + static void GetComponents(int id, std::array& component); }; #endif diff --git a/src/boundary_conditions/BCManager.cpp b/src/boundary_conditions/BCManager.cpp index 5f0e7db..312a685 100644 --- a/src/boundary_conditions/BCManager.cpp +++ b/src/boundary_conditions/BCManager.cpp @@ -13,14 +13,12 @@ void BCManager::UpdateBCData(std::unordered_map>& ess_bdr["total"] = 0; scale = 0.0; - auto ess_comp = map_ess_comp["total"].find(step)->second; - auto ess_id = map_ess_id["total"].find(step)->second; + const auto& ess_comp = map_ess_comp["total"].find(step)->second; + const auto& ess_id = map_ess_id["total"].find(step)->second; - mfem::Array cmp_row; - cmp_row.SetSize(3); + std::array cmp_row; component["total"] = false; - cmp_row = false; for (size_t i = 0; i < ess_id.size(); ++i) { // set the active boundary attributes @@ -48,19 +46,17 @@ void BCManager::UpdateBCData(mfem::Array& ess_bdr, // The size here is set explicitly component.SetSize(ess_bdr.Size(), 3); - mfem::Array cmp_row; - cmp_row.SetSize(3); + std::array cmp_row; component = false; - cmp_row = false; if (map_ess_vel.find(step) == map_ess_vel.end()) { return; } - auto ess_vel = map_ess_vel.find(step)->second; - auto ess_comp = map_ess_comp["ess_vel"].find(step)->second; - auto ess_id = map_ess_id["ess_vel"].find(step)->second; + const auto& ess_vel = map_ess_vel.find(step)->second; + const auto& ess_comp = map_ess_comp["ess_vel"].find(step)->second; + const auto& ess_id = map_ess_id["ess_vel"].find(step)->second; for (size_t i = 0; i < ess_id.size(); ++i) { // set the active boundary attributes @@ -111,19 +107,17 @@ void BCManager::UpdateBCData(mfem::Array& ess_bdr, // The size here is set explicitly component.SetSize(ess_bdr.Size(), 3); - mfem::Array cmp_row; - cmp_row.SetSize(3); + std::array cmp_row; component = false; - cmp_row = false; if (map_ess_vgrad.find(step) == map_ess_vgrad.end()) { return; } - auto ess_vgrad = map_ess_vgrad.find(step)->second; - auto ess_comp = map_ess_comp["ess_vgrad"].find(step)->second; - auto ess_id = map_ess_id["ess_vgrad"].find(step)->second; + const auto& ess_vgrad = map_ess_vgrad.find(step)->second; + const auto& ess_comp = map_ess_comp["ess_vgrad"].find(step)->second; + const auto& ess_id = map_ess_id["ess_vgrad"].find(step)->second; for (size_t i = 0; i < ess_vgrad.size(); ++i) { data[i] = ess_vgrad.at(i); diff --git a/src/fem_operators/mechanics_integrators.cpp b/src/fem_operators/mechanics_integrators.cpp index 9ade98d..2bc9d9d 100644 --- a/src/fem_operators/mechanics_integrators.cpp +++ b/src/fem_operators/mechanics_integrators.cpp @@ -667,6 +667,113 @@ void ExaNLFIntegrator::AddMultGradPA(const mfem::Vector& x, mfem::Vector& y) con } // End of if statement } +// ----------------------------------------------------------------------------- +// ExaNLFIntegrator::AddMultTransposeGradPA +// +// Native PA kernel computing y += K^T * x where K = B^T D B is the standard +// (non-BBar) tangent stiffness. Mirrors AddMultGradPA exactly except for the +// contraction order against the assembled 4th-order tensor D. +// +// Algorithm per element, per quadrature point: +// 1. Compute physical velocity gradient from input vector and shape function +// derivatives: +// Gx(i,k) = sum_a Gt(a,i,qpt) * X(a,k,elem) +// This is the same operation as the forward kernel since B is independent +// of the gradient transposition. +// +// 2. Apply the TRANSPOSED D tensor contraction: +// T(l,n) = sum_{i,k} D(i,k,l,n,qpt,elem) * Gx(i,k) +// whereas the forward kernel does +// T(i,k) = sum_{l,n} D(i,k,l,n,qpt,elem) * Gx(l,n) +// The difference is *which pair* of D's indices are summed against Gx. +// For symmetric C, D has major symmetry D(i,k,l,n) = D(l,n,i,k) and the +// two contractions agree; for non-symmetric C they disagree. +// +// 3. Apply test-function gradients (same operation as forward kernel): +// Y(a,n) += sum_l Gt(a,l,qpt) * T(l,n) +// +// All quadrature weights and Jacobian determinants are baked into D from the +// AssembleGradPA step, so this kernel does not need to reapply them. +// ----------------------------------------------------------------------------- +void ExaNLFIntegrator::AddMultTransposeGradPA(const mfem::Vector &x, + mfem::Vector &y) const +{ + CALI_CXX_MARK_SCOPE("enlfi_amTGPA"); + if ((space_dims == 1) || (space_dims == 2)) { + MFEM_ABORT("Dimensions of 1 or 2 not supported."); + } + else { + const int dim = 3; + const int DIM3 = 3; + const int DIM6 = 6; + + std::array perm3 {{ 2, 1, 0 } }; + std::array perm6 {{ 5, 4, 3, 2, 1, 0 } }; + + // D tensor from AssembleGradPA: D(elem, qpt, i, k, l, n) + // The leading dim being elem matches the ordering used in the forward kernel. + RAJA::Layout layout_tensor = + RAJA::make_permuted_layout({{ dim, dim, dim, dim, nqpts, nelems } }, perm6); + RAJA::View > D(pa_dmat.Read(), + layout_tensor); + + // Field variables: input/output E-vectors + RAJA::Layout layout_field = RAJA::make_permuted_layout({{ nnodes, dim, nelems } }, perm3); + RAJA::View > X(x.Read(), layout_field); + RAJA::View > Y(y.ReadWrite(), layout_field); + + // Reference shape function derivatives: Gt(node, dim, qpt) + RAJA::Layout layout_grads = RAJA::make_permuted_layout({{ nnodes, dim, nqpts } }, perm3); + RAJA::View > Gt(grad.Read(), layout_grads); + + const int nqpts_ = nqpts; + const int dim_ = dim; + const int nnodes_ = nnodes; + + mfem::forall(nelems, [=] MFEM_HOST_DEVICE(int i_elems) { + for (int j_qpts = 0; j_qpts < nqpts_; j_qpts++) { + // Step 1: Compute velocity gradient at this quadrature point + // Gx(i, k) = sum_a Gt(a, i, qpt) * X(a, k, elem) + double Gx[3][3]; + for (int ii = 0; ii < dim_; ii++) { + for (int kk = 0; kk < dim_; kk++) { + Gx[ii][kk] = 0.0; + for (int a = 0; a < nnodes_; a++) { + Gx[ii][kk] += Gt(a, ii, j_qpts) * X(a, kk, i_elems); + } + } + } + + // Step 2: Apply TRANSPOSED D contraction + // T(l, n) = sum_{i,k} D(i, k, l, n, qpt, elem) * Gx(i, k) + // Compare to forward kernel: + // T(i, k) = sum_{l,n} D(i, k, l, n, qpt, elem) * Gx(l, n) + double T[3][3]; + for (int ll = 0; ll < dim_; ll++) { + for (int nn = 0; nn < dim_; nn++) { + T[ll][nn] = 0.0; + for (int ii = 0; ii < dim_; ii++) { + for (int kk = 0; kk < dim_; kk++) { + T[ll][nn] += D(i_elems, j_qpts, ii, kk, ll, nn) * Gx[ii][kk]; + } + } + } + } + + // Step 3: Apply test-function gradients (same as forward kernel) + // Y(a, n) += sum_l Gt(a, l, qpt) * T(l, n) + for (int nn = 0; nn < dim_; nn++) { + for (int ll = 0; ll < dim_; ll++) { + for (int a = 0; a < nnodes_; a++) { + Y(a, nn, i_elems) += Gt(a, ll, j_qpts) * T[ll][nn]; + } + } + } + } // End of nqpts + }); // End of nelems + } // End of else (3D path) +} + // This assembles the diagonal of our LHS which can be used as a preconditioner void ExaNLFIntegrator::AssembleGradDiagonalPA(mfem::Vector& diag) const { CALI_CXX_MARK_SCOPE("enlfi_AssembleGradDiagonalPA"); @@ -1257,6 +1364,70 @@ void ICExaNLFIntegrator::AssembleElementGrad(const mfem::FiniteElement& el, return; } +// ----------------------------------------------------------------------------- +// ICExaNLFIntegrator::AssembleGradPA +// +// Sets up geometric data and ensures element-averaged derivatives are ready. +// The B-bar gradient PA does NOT pre-assemble a D tensor (unlike the base +// class) because the volumetric correction couples element-constant data +// (volume-averaged derivatives N̄) with per-quadrature-point data (C, adj(J)) +// in a way that does not fold cleanly into a single pre-assembled tensor. +// Instead, AddMultGradPA / AddMultTransposeGradPA access C directly from the +// quadrature function and apply the B-bar action on the fly in physical space. +// ----------------------------------------------------------------------------- +void ICExaNLFIntegrator::AssembleGradPA(const mfem::Vector &/* x */, + const mfem::FiniteElementSpace &fes) +{ + this->AssembleGradPA(fes); +} + +void ICExaNLFIntegrator::AssembleGradPA(const mfem::FiniteElementSpace &fes) +{ + CALI_CXX_MARK_SCOPE("icenlfi_assembleGradPA"); + + mfem::Mesh *mesh = fes.GetMesh(); + const mfem::FiniteElement &el = *fes.GetFE(0); + space_dims = el.GetDim(); + const mfem::IntegrationRule *ir = + &(mfem::IntRules.Get(el.GetGeomType(), 2 * el.GetOrder() + 1)); + + nqpts = ir->GetNPoints(); + nnodes = el.GetDof(); + nelems = fes.GetNE(); + + if ((space_dims == 1) || (space_dims == 2)) { + MFEM_ABORT("Dimensions of 1 or 2 not supported."); + } + + // Cache geometric factors (Jacobians at quadrature points) + geom = mesh->GetGeometricFactors(*ir, mfem::GeometricFactors::JACOBIANS); + + // Cache reference shape function derivatives + if (grad.Size() != (nqpts * space_dims * nnodes)) { + grad.SetSize(nqpts * space_dims * nnodes, mfem::Device::GetMemoryType()); + { + mfem::DenseMatrix DSh; + const int offset = nnodes * space_dims; + double *qpts_dshape_data = grad.HostReadWrite(); + for (int i = 0; i < nqpts; i++) { + const mfem::IntegrationPoint &ip = ir->IntPoint(i); + DSh.UseExternalData(&qpts_dshape_data[offset * i], nnodes, space_dims); + el.CalcDShape(ip, DSh); + } + } + grad.UseDevice(true); + } + + // Element-averaged derivatives N̄(a, k, elem) are computed by AssemblePA(). + // If they have not been computed yet, force a call now so the gradient PA + // kernels can use them. The AssemblePA path is idempotent and safe to call + // even if it has been called previously (it re-zeroes and recomputes). + if (elem_deriv_shapes.Size() != (nnodes * space_dims * nelems)) { + this->AssemblePA(fes); + } +} + + /// Method defining element assembly. /** The result of the element assembly is added and stored in the @a emat Vector. */ @@ -1265,6 +1436,7 @@ void ICExaNLFIntegrator::AssembleGradEA(const mfem::Vector& /*x*/, mfem::Vector& emat) { AssembleEA(fes, emat); } + void ICExaNLFIntegrator::AssembleEA(const mfem::FiniteElementSpace& fes, mfem::Vector& emat) { CALI_CXX_MARK_SCOPE("icenlfi_assembleEA"); const mfem::FiniteElement& el = *fes.GetFE(0); @@ -2014,6 +2186,377 @@ void ICExaNLFIntegrator::AssemblePA(const mfem::FiniteElementSpace& fes) { } // End of space dims if else } +// ----------------------------------------------------------------------------- +// ICExaNLFIntegrator::AddMultGradPA +// +// Native B-bar tangent stiffness PA action: y += K̄ * x where +// K̄ = ∫ B̄^T C B̄ dΩ +// and B̄ is the B-bar strain-displacement matrix from Hughes (1980). +// +// Because B̄ couples element-constant volume-averaged data with per-qpt data, +// we work in physical space and access C directly from the simulation state's +// tangent stiffness quadrature function. +// +// Algorithm per element, per quadrature point (q): +// 1. Hoist tr_bar (element-constant) outside the qpt loop: +// tr_bar = sum_{a,k} N̄(a,k) * V(a,k) +// This is the volume-averaged trace of the velocity gradient that B̄ +// uses in place of the per-qpt trace. +// +// 2. Compute the adjugate matrix and Jacobian determinant from the cached +// Jacobian. Adjugate is used to transform reference derivatives Gt to +// physical derivatives: +// dN(a,j) = (1/detJ) * sum_k Gt(a,k,q) * adj(j,k) +// (Adjugate uses inverse-transpose convention; same as in the standard +// ExaNLFIntegrator AssembleGradPA kernel.) +// +// 3. Compute physical velocity gradient: +// L(i,j) = sum_a dN(a,j) * V(a,i) +// +// 4. Compute B-bar trace correction: +// Δtr = (tr_bar - tr(L)) / 3 +// and modified velocity gradient: +// L̄(i,j) = L(i,j) + δ_ij * Δtr +// which replaces the volumetric trace of L with tr_bar (Hughes' B-bar). +// +// 5. Apply material tangent (forward direction): +// σ'(j,k) = sum_{l,m} C(j,k,l,m) * L̄(l,m) +// C is fetched on the fly from the tangent_stiffness quadrature function. +// +// 6. Compute pressure (volumetric) part of σ': +// p' = (1/3) * tr(σ') +// +// 7. Accumulate into Y with B-bar test side. The test side replaces the +// pressure contribution to nodal forces using the volume-averaged +// derivatives N̄ in place of the per-qpt dN: +// Y(a,k) += [sum_j dN(a,j) σ'(j,k) + (N̄(a,k) - dN(a,k)) p'] * w * detJ +// The first term is the standard B^T σ' force, the second redirects the +// pressure piece through N̄. +// +// Verification properties: +// - For symmetric C, the result must equal the forward action of any +// symmetric formulation (B̄^T C B̄ is symmetric). +// - For a uniform-Jacobian mesh where tr_bar agrees with the per-qpt +// trace, Δtr → 0 at every qpt and the result must match the standard +// (non-B-bar) result. +// ----------------------------------------------------------------------------- +void ICExaNLFIntegrator::AddMultGradPA(const mfem::Vector &x, + mfem::Vector &y) const +{ + CALI_CXX_MARK_SCOPE("icenlfi_amGPA"); + if ((space_dims == 1) || (space_dims == 2)) { + MFEM_ABORT("Dimensions of 1 or 2 not supported."); + } + else { + const int dim = 3; + const int DIM3 = 3; + const int DIM4 = 4; + const int DIM6 = 6; + + std::array perm3 {{ 2, 1, 0 } }; + std::array perm4 {{ 3, 2, 1, 0 } }; + std::array perm6 {{ 5, 4, 3, 2, 1, 0 } }; + + // Input / output E-vectors + RAJA::Layout layout_field = RAJA::make_permuted_layout({{ nnodes, dim, nelems } }, perm3); + RAJA::View > X(x.Read(), layout_field); + RAJA::View > Y(y.ReadWrite(), layout_field); + + // Reference shape function derivatives Gt(node, dim, qpt) + RAJA::Layout layout_grads = RAJA::make_permuted_layout({{ nnodes, dim, nqpts } }, perm3); + RAJA::View > Gt(grad.Read(), layout_grads); + + // Element-averaged derivatives N̄(node, dim, elem) + RAJA::View > Nbar(elem_deriv_shapes.Read(), + layout_field); + + // Mesh Jacobians J(dim, dim, qpt, elem) — column-major mfem convention + RAJA::Layout layout_jac = RAJA::make_permuted_layout({{ dim, dim, nqpts, nelems } }, perm4); + RAJA::View > J_data(geom->J.Read(), layout_jac); + + // Material tangent C(j, k, l, m, qpt, elem) from quadrature function + auto tangent_qf = m_sim_state->GetQuadratureFunction("tangent_stiffness"); + RAJA::Layout layout_C = RAJA::make_permuted_layout( + {{ dim, dim, dim, dim, nqpts, nelems } }, perm6); + RAJA::View > C(tangent_qf->Read(), layout_C); + + // Integration weights from the tangent stiffness QF integration rule + const mfem::IntegrationRule &ir = + tangent_qf->GetSpace()->GetIntRule(0); + auto W = ir.GetWeights().Read(); + + const int nqpts_ = nqpts; + const int dim_ = dim; + const int nnodes_ = nnodes; + + mfem::forall(nelems, [=] MFEM_HOST_DEVICE(int e) { + // Step 1: Hoist tr_bar outside the qpt loop (element-constant) + double tr_bar = 0.0; + for (int a = 0; a < nnodes_; a++) { + for (int k = 0; k < dim_; k++) { + tr_bar += Nbar(a, k, e) * X(a, k, e); + } + } + + for (int q = 0; q < nqpts_; q++) { + // Step 2: Compute adjugate and Jacobian determinant + const double J11 = J_data(0, 0, q, e), J12 = J_data(1, 0, q, e), + J13 = J_data(2, 0, q, e); + const double J21 = J_data(0, 1, q, e), J22 = J_data(1, 1, q, e), + J23 = J_data(2, 1, q, e); + const double J31 = J_data(0, 2, q, e), J32 = J_data(1, 2, q, e), + J33 = J_data(2, 2, q, e); + + double adj[9]; + adj[0] = (J22 * J33) - (J23 * J32); // 0,0 + adj[1] = (J23 * J31) - (J21 * J33); // 0,1 + adj[2] = (J21 * J32) - (J22 * J31); // 0,2 + adj[3] = (J13 * J32) - (J12 * J33); // 1,0 + adj[4] = (J11 * J33) - (J13 * J31); // 1,1 + adj[5] = (J12 * J31) - (J11 * J32); // 1,2 + adj[6] = (J12 * J23) - (J13 * J22); // 2,0 + adj[7] = (J13 * J21) - (J11 * J23); // 2,1 + adj[8] = (J11 * J22) - (J12 * J21); // 2,2 + + const double detJ = J11 * adj[0] + J21 * adj[3] + J31 * adj[6]; + const double idetJ = 1.0 / detJ; + const double w_detJ = W[q] * detJ; + + // Step 3: Physical velocity gradient L(i,j) = sum_a dN(a,j) * V(a,i) + // We compute dN(a, :) on-the-fly from Gt and adj. + double L[3][3] = {{ 0.0 } }; + for (int a = 0; a < nnodes_; a++) { + double dNa[3]; + for (int j = 0; j < dim_; j++) { + dNa[j] = idetJ * (Gt(a, 0, q) * adj[j * 3 + 0] + + Gt(a, 1, q) * adj[j * 3 + 1] + + Gt(a, 2, q) * adj[j * 3 + 2]); + } + for (int i = 0; i < dim_; i++) { + for (int j = 0; j < dim_; j++) { + L[i][j] += dNa[j] * X(a, i, e); + } + } + } + + // Step 4: B-bar trace correction + const double tr_std = L[0][0] + L[1][1] + L[2][2]; + const double dtr = (tr_bar - tr_std) / 3.0; + + double Lbar[3][3]; + for (int i = 0; i < dim_; i++) { + for (int j = 0; j < dim_; j++) { + Lbar[i][j] = L[i][j]; + } + } + Lbar[0][0] += dtr; + Lbar[1][1] += dtr; + Lbar[2][2] += dtr; + + // Step 5: Apply material tangent — forward contraction + // σ'(j, k) = sum_{l,m} C(j, k, l, m) * L̄(l, m) + double sigma[3][3] = {{ 0.0 } }; + for (int j = 0; j < dim_; j++) { + for (int k = 0; k < dim_; k++) { + for (int l = 0; l < dim_; l++) { + for (int m = 0; m < dim_; m++) { + sigma[j][k] += C(j, k, l, m, q, e) * Lbar[l][m]; + } + } + } + } + + // Step 6: Pressure (volumetric) part of σ' + const double p = (sigma[0][0] + sigma[1][1] + sigma[2][2]) / 3.0; + + // Step 7: Accumulate forces with B-bar test side + // Y(a, k) += [sum_j dN(a,j) σ'(j,k) + (N̄(a,k) - dN(a,k)) p] * w * detJ + for (int a = 0; a < nnodes_; a++) { + double dNa[3]; + for (int j = 0; j < dim_; j++) { + dNa[j] = idetJ * (Gt(a, 0, q) * adj[j * 3 + 0] + + Gt(a, 1, q) * adj[j * 3 + 1] + + Gt(a, 2, q) * adj[j * 3 + 2]); + } + for (int k = 0; k < dim_; k++) { + double f_std = 0.0; + for (int j = 0; j < dim_; j++) { + f_std += dNa[j] * sigma[j][k]; + } + double f_bbar = (Nbar(a, k, e) - dNa[k]) * p; + Y(a, k, e) += (f_std + f_bbar) * w_detJ; + } + } + } // End of qpts + }); // End of nelems + } // End of else (3D path) +} + + +// ----------------------------------------------------------------------------- +// ICExaNLFIntegrator::AddMultTransposeGradPA +// +// Native transposed B-bar tangent stiffness PA action: y += K̄^T * x. +// +// This is structurally IDENTICAL to AddMultGradPA except for one line: the +// material tangent contraction uses C(l,m,j,k) instead of C(j,k,l,m). The +// B-bar geometry (N̄, dN, trace correction, pressure redirection) is the +// same on both sides of K̄ = B̄^T C B̄ because: +// (B̄^T C B̄)^T = B̄^T C^T B̄ +// — only the middle factor C transposes; the outer B̄^T and B̄ remain in +// place. +// +// For symmetric C, this kernel produces results identical to AddMultGradPA +// (a useful verification check). For non-symmetric C (crystal plasticity +// with non-associated flow or non-symmetric Schmid coupling) it produces +// genuinely different results, as required for correct trust-region +// Cauchy point computation. +// ----------------------------------------------------------------------------- +void ICExaNLFIntegrator::AddMultTransposeGradPA(const mfem::Vector &x, + mfem::Vector &y) const +{ + CALI_CXX_MARK_SCOPE("icenlfi_amTGPA"); + if ((space_dims == 1) || (space_dims == 2)) { + MFEM_ABORT("Dimensions of 1 or 2 not supported."); + } + else { + const int dim = 3; + const int DIM3 = 3; + const int DIM4 = 4; + const int DIM6 = 6; + + std::array perm3 {{ 2, 1, 0 } }; + std::array perm4 {{ 3, 2, 1, 0 } }; + std::array perm6 {{ 5, 4, 3, 2, 1, 0 } }; + + RAJA::Layout layout_field = RAJA::make_permuted_layout({{ nnodes, dim, nelems } }, perm3); + RAJA::View > X(x.Read(), layout_field); + RAJA::View > Y(y.ReadWrite(), layout_field); + + RAJA::Layout layout_grads = RAJA::make_permuted_layout({{ nnodes, dim, nqpts } }, perm3); + RAJA::View > Gt(grad.Read(), layout_grads); + + RAJA::View > Nbar(elem_deriv_shapes.Read(), + layout_field); + + RAJA::Layout layout_jac = RAJA::make_permuted_layout({{ dim, dim, nqpts, nelems } }, perm4); + RAJA::View > J_data(geom->J.Read(), layout_jac); + + auto tangent_qf = m_sim_state->GetQuadratureFunction("tangent_stiffness"); + RAJA::Layout layout_C = RAJA::make_permuted_layout( + {{ dim, dim, dim, dim, nqpts, nelems } }, perm6); + RAJA::View > C(tangent_qf->Read(), layout_C); + + const mfem::IntegrationRule &ir = + tangent_qf->GetSpace()->GetIntRule(0); + auto W = ir.GetWeights().Read(); + + const int nqpts_ = nqpts; + const int dim_ = dim; + const int nnodes_ = nnodes; + + mfem::forall(nelems, [=] MFEM_HOST_DEVICE(int e) { + // Step 1: Hoist tr_bar (element-constant) + double tr_bar = 0.0; + for (int a = 0; a < nnodes_; a++) { + for (int k = 0; k < dim_; k++) { + tr_bar += Nbar(a, k, e) * X(a, k, e); + } + } + + for (int q = 0; q < nqpts_; q++) { + // Step 2: Adjugate and Jacobian determinant + const double J11 = J_data(0, 0, q, e), J12 = J_data(1, 0, q, e), + J13 = J_data(2, 0, q, e); + const double J21 = J_data(0, 1, q, e), J22 = J_data(1, 1, q, e), + J23 = J_data(2, 1, q, e); + const double J31 = J_data(0, 2, q, e), J32 = J_data(1, 2, q, e), + J33 = J_data(2, 2, q, e); + + double adj[9]; + adj[0] = (J22 * J33) - (J23 * J32); + adj[1] = (J23 * J31) - (J21 * J33); + adj[2] = (J21 * J32) - (J22 * J31); + adj[3] = (J13 * J32) - (J12 * J33); + adj[4] = (J11 * J33) - (J13 * J31); + adj[5] = (J12 * J31) - (J11 * J32); + adj[6] = (J12 * J23) - (J13 * J22); + adj[7] = (J13 * J21) - (J11 * J23); + adj[8] = (J11 * J22) - (J12 * J21); + + const double detJ = J11 * adj[0] + J21 * adj[3] + J31 * adj[6]; + const double idetJ = 1.0 / detJ; + const double w_detJ = W[q] * detJ; + + // Step 3: Physical velocity gradient + double L[3][3] = {{ 0.0 } }; + for (int a = 0; a < nnodes_; a++) { + double dNa[3]; + for (int j = 0; j < dim_; j++) { + dNa[j] = idetJ * (Gt(a, 0, q) * adj[j * 3 + 0] + + Gt(a, 1, q) * adj[j * 3 + 1] + + Gt(a, 2, q) * adj[j * 3 + 2]); + } + for (int i = 0; i < dim_; i++) { + for (int j = 0; j < dim_; j++) { + L[i][j] += dNa[j] * X(a, i, e); + } + } + } + + // Step 4: B-bar trace correction + const double tr_std = L[0][0] + L[1][1] + L[2][2]; + const double dtr = (tr_bar - tr_std) / 3.0; + + double Lbar[3][3]; + for (int i = 0; i < dim_; i++) { + for (int j = 0; j < dim_; j++) { + Lbar[i][j] = L[i][j]; + } + } + Lbar[0][0] += dtr; + Lbar[1][1] += dtr; + Lbar[2][2] += dtr; + + // Step 5: TRANSPOSED material tangent contraction + // σ'(j, k) = sum_{l,m} C(l, m, j, k) * L̄(l, m) + // (Compare to forward: C(j, k, l, m) * L̄(l, m)) + double sigma[3][3] = {{ 0.0 } }; + for (int j = 0; j < dim_; j++) { + for (int k = 0; k < dim_; k++) { + for (int l = 0; l < dim_; l++) { + for (int m = 0; m < dim_; m++) { + sigma[j][k] += C(l, m, j, k, q, e) * Lbar[l][m]; + } + } + } + } + + // Step 6: Pressure + const double p = (sigma[0][0] + sigma[1][1] + sigma[2][2]) / 3.0; + + // Step 7: Accumulate with B-bar test side (same as forward kernel) + for (int a = 0; a < nnodes_; a++) { + double dNa[3]; + for (int j = 0; j < dim_; j++) { + dNa[j] = idetJ * (Gt(a, 0, q) * adj[j * 3 + 0] + + Gt(a, 1, q) * adj[j * 3 + 1] + + Gt(a, 2, q) * adj[j * 3 + 2]); + } + for (int k = 0; k < dim_; k++) { + double f_std = 0.0; + for (int j = 0; j < dim_; j++) { + f_std += dNa[j] * sigma[j][k]; + } + double f_bbar = (Nbar(a, k, e) - dNa[k]) * p; + Y(a, k, e) += (f_std + f_bbar) * w_detJ; + } + } + } // End of qpts + }); // End of nelems + } // End of else (3D path) +} + // Here we're applying the following action operation using the assembled "D" 2nd order // tensor found above: // y_{ik} = \nabla_{ij}\phi^T_{\epsilon} D_{jk} diff --git a/src/fem_operators/mechanics_integrators.hpp b/src/fem_operators/mechanics_integrators.hpp index fb7d4f7..0a761ec 100644 --- a/src/fem_operators/mechanics_integrators.hpp +++ b/src/fem_operators/mechanics_integrators.hpp @@ -351,6 +351,35 @@ class ExaNLFIntegrator : public mfem::NonlinearFormIntegrator { */ virtual void AddMultGradPA(const mfem::Vector& x, mfem::Vector& y) const override; + /** + * @brief Apply transposed gradient action via partial assembly. + * + * @param x Input vector for transposed Jacobian-vector product + * @param y Output vector for accumulated result + * + * Native PA kernel computing y += K^T * x where K = B^T D B is the + * tangent stiffness operator. The only computational difference from + * AddMultGradPA is the contraction order with the assembled 4th-order + * tensor D: + * + * Forward (AddMultGradPA): + * T(i,k) = D(i,k,l,n,qpt,elem) * Gx(l,n) — contract last pair + * Y(a,k) += Gt(a,i,qpt) * T(i,k) + * + * Transpose (this method): + * T(l,n) = D(i,k,l,n,qpt,elem) * Gx(i,k) — contract first pair + * Y(a,n) += Gt(a,l,qpt) * T(l,n) + * + * For symmetric material tangent C, the two operations are identical. + * For non-symmetric C (crystal plasticity), they differ. The transpose + * is required for trust-region dogleg solver Cauchy point computation + * where the merit function gradient is g = J^T * r, not J * r. + * + * @note GPU-compatible via mfem::forall + * @note Requires prior AssembleGradPA() call for the D tensor + */ + virtual void AddMultTransposeGradPA(const mfem::Vector &x, mfem::Vector &y) const override; + using mfem::NonlinearFormIntegrator::AssemblePA; /** * @brief Initialize partial assembly data structures for residual operations. @@ -723,10 +752,82 @@ class ICExaNLFIntegrator : public ExaNLFIntegrator { const mfem::Vector& /*elfun*/, mfem::DenseMatrix& elmat) override; - // This method doesn't easily extend to PA formulation, so we're punting on - // it for now. - using ExaNLFIntegrator::AddMultGradPA; - using ExaNLFIntegrator::AssembleGradPA; + /** + * @brief Initialize partial assembly data structures for B-bar gradient operations. + * + * @param fes Finite element space providing mesh and element information + * + * Sets up the geometric data needed by AddMultGradPA() and + * AddMultTransposeGradPA() for the B-bar tangent stiffness operator. + * + * Unlike the base class AssembleGradPA() which pre-assembles a 4th-order + * tensor D, the B-bar version stores only the geometric data (Jacobians, + * reference shape function derivatives, and element-averaged derivatives) + * and applies the material tangent C on-the-fly inside the kernel. This + * is because the B-bar correction couples element-constant data (the + * volume-averaged derivatives) with quadrature-point-local data (C and + * adj(J)) in a way that doesn't fold cleanly into a single pre-assembled + * tensor. + * + * Setup steps: + * 1. Cache space_dims, nqpts, nnodes, nelems from the FES + * 2. Get geometric factors (Jacobians at quadrature points) from the mesh + * 3. Compute and cache reference shape function derivatives Gt(a, k, qpt) + * 4. Ensure element-averaged derivatives N̄(a, k, elem) are available + * (calling AssemblePA() if not yet computed) + * + * @note Must be called before AddMultGradPA() or AddMultTransposeGradPA() + * @note Material tangent C is accessed directly from the simulation state + * quadrature function during the AddMult kernels + */ + virtual void AssembleGradPA(const mfem::FiniteElementSpace &fes) override; + + /// State-ful overload that ignores the state vector @a x. + virtual void AssembleGradPA(const mfem::Vector &x, const mfem::FiniteElementSpace &fes) override; + + /** + * @brief Apply partial-assembly B-bar tangent stiffness action. + * + * @param x Input E-vector (nodal velocities) + * @param y Output E-vector (accumulated) + * + * Computes y += K̄ * x where K̄ = ∫ B̄^T C B̄ dΩ is the B-bar tangent. + * + * Algorithm per element, per quadrature point: + * 1. Compute adj(J) and detJ from the cached Jacobian + * 2. Compute physical derivatives dN(a,j) on-the-fly from Gt and adj(J) + * 3. Compute physical velocity gradient L(i,j) = dN(a,j) V(a,i) + * 4. Compute B-bar trace correction Δtr = (tr_bar - tr(L)) / 3 + * where tr_bar = N̄(a,k) V(a,k) is element-constant (hoisted) + * 5. Modified velocity gradient L̄ = L + δ_ij * Δtr + * 6. Apply C: σ'(j,k) = C(j,k,l,m) * L̄(l,m) + * 7. Pressure correction p' = (1/3) tr(σ') + * 8. Accumulate into Y: standard force + B-bar pressure redirection + * Y(a,k) += [Σ_j dN(a,j) σ'(j,k) + (N̄(a,k) - dN(a,k)) p'] * w * detJ + * + * @note GPU-compatible via mfem::forall + * @note Requires prior AssembleGradPA() call + */ + virtual void AddMultGradPA(const mfem::Vector &x, mfem::Vector &y) const override; + + /** + * @brief Apply transposed B-bar tangent stiffness action. + * + * @param x Input E-vector + * @param y Output E-vector (accumulated) + * + * Computes y += K̄^T * x. Identical to AddMultGradPA except the C + * contraction order is swapped: + * Forward: σ'(j,k) = C(j,k,l,m) * L̄(l,m) + * Transpose: σ'(j,k) = C(l,m,j,k) * L̄(l,m) + * + * The B-bar geometry (N̄, dN, trace correction, pressure redirection) + * is identical for both directions because B̄ appears on both the + * trial and test sides of K̄ = B̄^T C B̄, and (B̄^T C B̄)^T = B̄^T C^T B̄. + * + * @note For symmetric C, this produces identical results to AddMultGradPA + */ + virtual void AddMultTransposeGradPA(const mfem::Vector &x, mfem::Vector &y) const override; /** * @brief Initialize partial assembly data structures for B-bar residual operations. diff --git a/src/fem_operators/mechanics_operator.cpp b/src/fem_operators/mechanics_operator.cpp index b95cd74..93b1ebe 100644 --- a/src/fem_operators/mechanics_operator.cpp +++ b/src/fem_operators/mechanics_operator.cpp @@ -13,6 +13,15 @@ #include #include +namespace { +void GetTrueDofsParallel(const mfem::ParGridFunction& gf, mfem::Vector& true_dofs) { + // used to do something like: + // gf.GetTrueDofs(true_dofs); + // but looks like there are issues with that on the GPUs with newer versions of MFEM + gf.ParallelAverage(true_dofs); +} +} // namespace + NonlinearMechOperator::NonlinearMechOperator(mfem::Array& ess_bdr, mfem::Array2D& ess_bdr_comp, std::shared_ptr sim_state) @@ -259,7 +268,7 @@ void NonlinearMechOperator::CalculateDeformationGradient(mfem::QuadratureFunctio mfem::Vector x_true(fe_space->TrueVSize(), mfem::Device::GetMemoryType()); - x_cur->GetTrueDofs(x_true); + GetTrueDofsParallel(*x_cur, x_true); // Takes in k vector and transforms into into our E-vector array P->Mult(x_true, px); elem_restrict_lex->Mult(px, el_x); diff --git a/src/mechanics_driver.cpp b/src/mechanics_driver.cpp index 0e9520e..4f3efca 100644 --- a/src/mechanics_driver.cpp +++ b/src/mechanics_driver.cpp @@ -211,7 +211,17 @@ int main(int argc, char* argv[]) { */ mfem::Device device; if (toml_opt.solvers.rtmodel == RTModel::GPU) { +#if defined(MFEM_USE_UMPIRE) + device.SetMemoryTypes(mfem::MemoryType::HOST_UMPIRE, mfem::MemoryType::DEVICE_UMPIRE); +#else device.SetMemoryTypes(mfem::MemoryType::HOST_64, mfem::MemoryType::DEVICE); +#endif + } else { +#if defined(MFEM_USE_UMPIRE) + device.SetMemoryTypes(mfem::MemoryType::HOST_UMPIRE, mfem::MemoryType::DEVICE_UMPIRE); +#else + device.SetMemoryTypes(mfem::MemoryType::HOST_64, mfem::MemoryType::DEVICE); +#endif } device.Configure(device_config.c_str()); diff --git a/src/mfem_expt/partial_qspace.cpp b/src/mfem_expt/partial_qspace.cpp index 2e0261f..3230313 100644 --- a/src/mfem_expt/partial_qspace.cpp +++ b/src/mfem_expt/partial_qspace.cpp @@ -43,6 +43,8 @@ const mfem::Vector& PartialQuadratureSpace::GetGeometricFactorWeights() const { void PartialQuadratureSpace::ConstructOffsets() { // Set up offsets based on our partial element set const int num_partial_elem = local2global.Size(); + ne = num_partial_elem; + full_offset_cache.SetSize(0); offsets.SetSize(num_partial_elem + 1); int offset = 0; for (int i = 0; i < num_partial_elem; i++) { diff --git a/src/options/option_enum.cpp b/src/options/option_enum.cpp index 6ae4b99..8749ad7 100644 --- a/src/options/option_enum.cpp +++ b/src/options/option_enum.cpp @@ -106,12 +106,15 @@ LinearSolverType string_to_linear_solver_type(const std::string& str) { /** * @brief Convert string to NonlinearSolverType enum - * @param str String representation of nonlinear solver type ("NR", "NRLS") + * @param str String representation of nonlinear solver type ("NR", "NRLS", "TRDOG") * @return Corresponding NonlinearSolverType enum value */ NonlinearSolverType string_to_nonlinear_solver_type(const std::string& str) { static const std::map mapping = { - {"NR", NonlinearSolverType::NR}, {"NRLS", NonlinearSolverType::NRLS}}; + {"NR", NonlinearSolverType::NR}, + {"NRLS", NonlinearSolverType::NRLS}, + {"TRDOG", NonlinearSolverType::TRDOG} + }; return string_to_enum(str, mapping, NonlinearSolverType::NOTYPE, "nonlinear solver"); } diff --git a/src/options/option_parser_v2.cpp b/src/options/option_parser_v2.cpp index efac46e..cb29e98 100644 --- a/src/options/option_parser_v2.cpp +++ b/src/options/option_parser_v2.cpp @@ -790,6 +790,9 @@ void ExaOptions::print_solver_options() const { case NonlinearSolverType::NRLS: std::cout << "Newton-Raphson with line search\n"; break; + case NonlinearSolverType::TRDOG: + std::cout << "Trust-region dogleg (SNLS port)\n"; + break; default: std::cout << "Unknown\n"; break; @@ -798,6 +801,43 @@ void ExaOptions::print_solver_options() const { std::cout << " Maximum iterations: " << solvers.nonlinear_solver.iter << "\n"; std::cout << " Relative tolerance: " << solvers.nonlinear_solver.rel_tol << "\n"; std::cout << " Absolute tolerance: " << solvers.nonlinear_solver.abs_tol << "\n"; + + // Trust-region parameters: print if either the solver is TRDOG or the user + // supplied a [trust_region] sub-table. The latter case is informational — + // it lets the user spot misconfigurations where they set TR options without + // selecting the TRDOG solver. + const bool is_trdog = (solvers.nonlinear_solver.nl_solver == NonlinearSolverType::TRDOG); + const bool tr_supplied = solvers.nonlinear_solver.trust_region.has_value(); + + if (is_trdog || tr_supplied) { + std::cout << "\n Trust-region parameters"; + if (is_trdog && !tr_supplied) { + std::cout << " (using defaults)"; + } + else if (!is_trdog && tr_supplied) { + std::cout << " (WARNING: supplied but solver is not TRDOG)"; + } + std::cout << ":\n"; + + // Use the supplied options if present, otherwise default-construct + // a TrustRegionOptions to print the defaults + const TrustRegionOptions tr_opts = tr_supplied + ? solvers.nonlinear_solver.trust_region.value() + : TrustRegionOptions{}; + + std::cout << " delta_init = " << tr_opts.delta_init << "\n"; + std::cout << " delta_min = " << tr_opts.delta_min << "\n"; + std::cout << " delta_max = " << tr_opts.delta_max << "\n"; + std::cout << " xi_lg = " << tr_opts.xi_lg << "\n"; + std::cout << " xi_ug = " << tr_opts.xi_ug << "\n"; + std::cout << " xi_lo = " << tr_opts.xi_lo << "\n"; + std::cout << " xi_uo = " << tr_opts.xi_uo << "\n"; + std::cout << " xi_inc = " << tr_opts.xi_inc << "\n"; + std::cout << " xi_dec = " << tr_opts.xi_dec << "\n"; + std::cout << " xi_forced_inc = " << tr_opts.xi_forced_inc << "\n"; + std::cout << " reject_increase = " + << (tr_opts.reject_increase ? "true" : "false") << "\n"; + } } void ExaOptions::print_material_options() const { diff --git a/src/options/option_parser_v2.hpp b/src/options/option_parser_v2.hpp index d38fac7..9f77d13 100644 --- a/src/options/option_parser_v2.hpp +++ b/src/options/option_parser_v2.hpp @@ -97,9 +97,10 @@ enum class LinearSolverType { * @brief Enumeration for nonlinear solver types */ enum class NonlinearSolverType { - NR, /**< Newton-Raphson method */ - NRLS, /**< Newton-Raphson with line search */ - NOTYPE /**< Uninitialized or invalid nonlinear solver type */ + NR, /**< Newton-Raphson method */ + NRLS, /**< Newton-Raphson with line search */ + TRDOG, /**< Trust-region dogleg method (ported from SNLS) */ + NOTYPE /**< Uninitialized or invalid nonlinear solver type */ }; /** @@ -623,6 +624,103 @@ struct LinearSolverOptions { static LinearSolverOptions from_toml(const toml::value& toml_input); }; +/** + * @brief Trust-region dogleg solver configuration + * + * @details Controls the trust-region radius management and dogleg step + * computation for the ExaTrustRegionSolver. Parameters are ported from + * SNLS's TrDeltaControl with sane defaults suitable for solid mechanics + * applications. Power users can tune these for difficult crystal plasticity + * problems. + * + * The trust-region radius delta is updated based on the ratio + * rho = actual_residual_change / predicted_residual_change + * where predicted change comes from the linearized model at the current iterate. + * + * Acceptance/rejection bands: + * - "Good" band [xi_lg, xi_ug]: increase delta when rho falls here + * - "OK" band [xi_lo, xi_uo]: keep delta when rho falls here (outside good) + * - Outside [xi_lo, xi_uo]: decrease delta + * + * TOML configuration example: + * @code + * [Solvers.NR.trust_region] + * delta_init = 1.0 + * delta_min = 1e-12 + * delta_max = 1e4 + * xi_lg = 0.75 + * xi_ug = 1.4 + * xi_lo = 0.35 + * xi_uo = 5.0 + * xi_inc = 1.5 + * xi_dec = 0.25 + * xi_forced_inc = 1.2 + * reject_increase = true + * @endcode + */ +struct TrustRegionOptions { + /** + * @brief Initial trust-region radius + */ + double delta_init = 1.0; + + /** + * @brief Minimum allowed trust-region radius. Solver fails if delta drops below this. + */ + double delta_min = 1e-12; + + /** + * @brief Maximum allowed trust-region radius + */ + double delta_max = 1e4; + + /** + * @brief Lower bound of the "good" rho band (increase delta when rho > xi_lg) + */ + double xi_lg = 0.75; + + /** + * @brief Upper bound of the "good" rho band + */ + double xi_ug = 1.4; + + /** + * @brief Lower bound of the "ok" rho band (decrease delta when rho < xi_lo) + */ + double xi_lo = 0.35; + + /** + * @brief Upper bound of the "ok" rho band (decrease delta when rho > xi_uo) + */ + double xi_uo = 5.0; + + /** + * @brief Factor used to increase delta when a step is accepted in the "good" band + */ + double xi_inc = 1.5; + + /** + * @brief Factor used to decrease delta when a step quality is outside the "ok" band + */ + double xi_dec = 0.25; + + /** + * @brief Forced-increase factor when the predicted residual change is exactly zero + */ + double xi_forced_inc = 1.2; + + /** + * @brief Whether to reject steps that increase the residual norm + */ + bool reject_increase = true; + + // Validation + bool validate() const; + + // Conversion from toml + static TrustRegionOptions from_toml(const toml::value& toml_input); +}; + /** * @brief Nonlinear solver configuration */ @@ -647,6 +745,14 @@ struct NonlinearSolverOptions { */ NonlinearSolverType nl_solver = NonlinearSolverType::NR; + /** + * @brief Trust-region configuration (only used when nl_solver == TRDOG). + * + * If left empty, default TrustRegionOptions values are used. Users with + * difficult convergence problems should provide custom values. + */ + std::optional trust_region; + // Validation bool validate() const; diff --git a/src/options/option_solvers.cpp b/src/options/option_solvers.cpp index b5f8af7..817b64c 100644 --- a/src/options/option_solvers.cpp +++ b/src/options/option_solvers.cpp @@ -39,6 +39,63 @@ LinearSolverOptions LinearSolverOptions::from_toml(const toml::value& toml_input return options; } +/** + * @brief Parse trust-region options from a TOML sub-table. + * + * Each field is optional — if not present in the TOML, the struct's default + * value is preserved. This lets users override only the parameters they need + * to tune. + */ +TrustRegionOptions TrustRegionOptions::from_toml(const toml::value& toml_input) { + TrustRegionOptions options; + + if (toml_input.contains("delta_init")) { + options.delta_init = toml::find(toml_input, "delta_init"); + } + + if (toml_input.contains("delta_min")) { + options.delta_min = toml::find(toml_input, "delta_min"); + } + + if (toml_input.contains("delta_max")) { + options.delta_max = toml::find(toml_input, "delta_max"); + } + + if (toml_input.contains("xi_lg")) { + options.xi_lg = toml::find(toml_input, "xi_lg"); + } + + if (toml_input.contains("xi_ug")) { + options.xi_ug = toml::find(toml_input, "xi_ug"); + } + + if (toml_input.contains("xi_lo")) { + options.xi_lo = toml::find(toml_input, "xi_lo"); + } + + if (toml_input.contains("xi_uo")) { + options.xi_uo = toml::find(toml_input, "xi_uo"); + } + + if (toml_input.contains("xi_inc")) { + options.xi_inc = toml::find(toml_input, "xi_inc"); + } + + if (toml_input.contains("xi_dec")) { + options.xi_dec = toml::find(toml_input, "xi_dec"); + } + + if (toml_input.contains("xi_forced_inc")) { + options.xi_forced_inc = toml::find(toml_input, "xi_forced_inc"); + } + + if (toml_input.contains("reject_increase")) { + options.reject_increase = toml::find(toml_input, "reject_increase"); + } + + return options; +} + NonlinearSolverOptions NonlinearSolverOptions::from_toml(const toml::value& toml_input) { NonlinearSolverOptions options; @@ -59,6 +116,14 @@ NonlinearSolverOptions NonlinearSolverOptions::from_toml(const toml::value& toml toml::find(toml_input, "nl_solver")); } + // Parse the optional trust-region sub-table when using the dogleg solver. + // We always parse the table if present (regardless of nl_solver) so that + // options validation can flag inconsistent configurations later. + if (toml_input.contains("trust_region")) { + options.trust_region = TrustRegionOptions::from_toml( + toml::find(toml_input, "trust_region")); + } + return options; } @@ -123,6 +188,75 @@ bool LinearSolverOptions::validate() const { return true; } +/** + * @brief Validate trust-region option ranges and consistency. + * + * Step-by-step verification: + * 1. Trust-region radius bounds: delta_min must be positive and delta_max + * must exceed delta_min + * 2. Initial radius must lie within [delta_min, delta_max] + * 3. The "good" rho band [xi_lg, xi_ug] must lie inside the "ok" band + * [xi_lo, xi_uo] — otherwise the radius update logic is inconsistent + * 4. Increase factors must be > 1 and decrease factor must be in (0, 1) + * + * Each failure is reported with WARNING_0_OPT pointing to the offending field. + */ +bool TrustRegionOptions::validate() const { + if (delta_min <= 0.0) { + WARNING_0_OPT("Error: TrustRegion table provided a non-positive delta_min"); + return false; + } + + if (delta_max <= delta_min) { + WARNING_0_OPT("Error: TrustRegion table provided delta_max <= delta_min"); + return false; + } + + if (delta_init < delta_min || delta_init > delta_max) { + WARNING_0_OPT("Error: TrustRegion table provided delta_init outside [delta_min, delta_max]"); + return false; + } + + if (xi_lg <= xi_lo) { + WARNING_0_OPT("Error: TrustRegion table requires xi_lg > xi_lo " + "(good band must lie inside ok band)"); + return false; + } + + if (xi_ug >= xi_uo) { + WARNING_0_OPT("Error: TrustRegion table requires xi_ug < xi_uo " + "(good band must lie inside ok band)"); + return false; + } + + if (xi_lg >= xi_ug) { + WARNING_0_OPT("Error: TrustRegion table requires xi_lg < xi_ug"); + return false; + } + + if (xi_lo >= xi_uo) { + WARNING_0_OPT("Error: TrustRegion table requires xi_lo < xi_uo"); + return false; + } + + if (xi_inc <= 1.0) { + WARNING_0_OPT("Error: TrustRegion table requires xi_inc > 1.0"); + return false; + } + + if (xi_dec <= 0.0 || xi_dec >= 1.0) { + WARNING_0_OPT("Error: TrustRegion table requires xi_dec in (0, 1)"); + return false; + } + + if (xi_forced_inc <= 1.0) { + WARNING_0_OPT("Error: TrustRegion table requires xi_forced_inc > 1.0"); + return false; + } + + return true; +} + bool NonlinearSolverOptions::validate() const { if (iter < 1) { WARNING_0_OPT("Error: NonLinearSolver table did not provide a positive iteration count"); @@ -139,13 +273,23 @@ bool NonlinearSolverOptions::validate() const { return false; } - if (nl_solver != NonlinearSolverType::NR && nl_solver != NonlinearSolverType::NRLS) { - WARNING_0_OPT("Error: NonLinearSolver table did not provide a valid nl_solver option (`NR` " - "or `NRLS`)"); + if (nl_solver != NonlinearSolverType::NR && + nl_solver != NonlinearSolverType::NRLS && + nl_solver != NonlinearSolverType::TRDOG) { + WARNING_0_OPT("Error: NonLinearSolver table did not provide a valid nl_solver option " + "(`NR`, `NRLS`, or `TRDOG`)"); return false; } - // Implement validation logic + // If trust-region parameters were supplied, verify they are self-consistent. + // We allow a TRDOG solver without a [trust_region] sub-table — the defaults + // are applied in that case. + if (trust_region.has_value()) { + if (!trust_region->validate()) { + return false; + } + } + return true; } diff --git a/src/postprocessing/postprocessing_driver.cpp b/src/postprocessing/postprocessing_driver.cpp index 8212eb2..138e828 100644 --- a/src/postprocessing/postprocessing_driver.cpp +++ b/src/postprocessing/postprocessing_driver.cpp @@ -531,17 +531,15 @@ void PostProcessingDriver::UpdateFields([[maybe_unused]] const int step, void PostProcessingDriver::Update(const int step, const double time) { CALI_CXX_MARK_SCOPE("postprocessing_update"); UpdateFields(step, time); - // Check if we should output volume averages at this step - if (ShouldOutputAtStep(step)) { - PrintVolValues(time, m_aggregation_mode); - ClearVolumeAverageCache(); - } // Update data collections for visualization - if (m_enable_visualization) { + if (ShouldOutputAtStep(step) && m_enable_visualization) { UpdateDataCollections(step, time); } + PrintVolValues(time, m_aggregation_mode); + ClearVolumeAverageCache(); + if (m_light_up_instances.size() > 0) { UpdateLightUpAnalysis(); } @@ -1281,7 +1279,7 @@ void PostProcessingDriver::CalcElementAvg(mfem::expt::PartialQuadratureFunction* // KEY DIFFERENCE: Get the local-to-global element mapping for partial space auto l2g = pqs->GetLocal2Global().Read(); // Maps local element index to global element index - auto loc_offsets = pqs->getOffsets().Read(); // Offsets for local data layout + auto loc_offsets = pqs->Offsets(mfem::QSpaceOffsetStorage::COMPRESSED).Read(); // Offsets for local data layout // auto global_offsets = (pqs->GetGlobalOffset().Size() > 1) ? // pqs->GetGlobalOffset().Read() : loc_offsets; // Offsets for global // data layout @@ -1393,6 +1391,9 @@ void PostProcessingDriver::InitializeGridFunctions() { const auto gf_name = GetGridFunctionName(reg.display_name, reg_int); // Determine vector dimension from quadrature function const int vdim = reg.region_length[region]; + if (vdim < 1) { + continue; + } max_vdim = (vdim > max_vdim) ? vdim : max_vdim; auto fe_space = GetParFiniteElementSpace(reg_int, vdim); m_map_gfs.emplace(gf_name, @@ -1467,18 +1468,31 @@ void PostProcessingDriver::InitializeDataCollections(ExaOptions& options) { return input.substr(0, pos); }; + auto has_registered_fields = [this](const std::string& display_region_postfix) { + for (const auto& [key, value] : m_map_gfs) { + (void)value; + if (key.find(display_region_postfix) != std::string::npos) { + return true; + } + } + return false; + }; + if (m_aggregation_mode == AggregationMode::PER_REGION || m_aggregation_mode == AggregationMode::BOTH) { for (int region = 0; region < static_cast(m_num_regions); ++region) { auto mesh = m_map_submesh[region]; std::string region_postfix = "region_" + std::to_string(region + 1); std::string display_region_postfix = " " + m_sim_state->GetRegionDisplayName(region); + if (!has_registered_fields(display_region_postfix)) { + continue; + } fs::path output_dir = output_dir_base / region_postfix; fs::path output_dir_vizs = output_dir / m_file_manager->GetBaseFilename(); - if (m_sim_state->IsRegionActive(region)) { - auto region_comm = m_sim_state->GetRegionCommunicator(region); - m_file_manager->EnsureDirectoryExists(output_dir, region_comm); - } + // The subsequent DataCollection::Save() is a parallel operation on the submesh's + // communicator, which is still the parent MPI communicator. Prepare directories on + // that same communicator so all participating ranks observe the same path state. + m_file_manager->EnsureDirectoryExists(output_dir, MPI_COMM_WORLD); std::vector dcs_keys; if (options.visualization.visit) { std::string key = visit_key + region_postfix; @@ -1534,6 +1548,9 @@ void PostProcessingDriver::InitializeDataCollections(ExaOptions& options) { std::string region_postfix = "global"; std::string display_region_postfix = " " + m_sim_state->GetRegionDisplayName(-1); + if (!has_registered_fields(display_region_postfix)) { + return; + } fs::path output_dir = output_dir_base / region_postfix; fs::path output_dir_vizs = output_dir / m_file_manager->GetBaseFilename(); m_file_manager->EnsureDirectoryExists(output_dir); diff --git a/src/postprocessing/postprocessing_file_manager.hpp b/src/postprocessing/postprocessing_file_manager.hpp index f070029..3784f31 100644 --- a/src/postprocessing/postprocessing_file_manager.hpp +++ b/src/postprocessing/postprocessing_file_manager.hpp @@ -186,7 +186,7 @@ class PostProcessingFileManager { auto filepath = GetVolumeAverageFilePath(calc_type, region, region_name); bool file_exists = fs::exists(filepath); - auto file = CreateOutputFile(filepath, true); + auto file = CreateOutputFile(filepath, true, comm); if (file && file->is_open()) { if (!file_exists) { @@ -452,6 +452,7 @@ inline bool PostProcessingFileManager::EnsureDirectoryExists(fs::path& output_di int rank; MPI_Comm_rank(comm, &rank); bool success = false; + std::string path_str; if (rank == 0) { try { // Use weakly_canonical to resolve as much as possible @@ -474,6 +475,7 @@ inline bool PostProcessingFileManager::EnsureDirectoryExists(fs::path& output_di } else { std::cout << "Using existing directory: " << canonical_path << std::endl; output_dir = canonical_path; + path_str = canonical_path.string(); success = true; } } else { @@ -482,6 +484,8 @@ inline bool PostProcessingFileManager::EnsureDirectoryExists(fs::path& output_di success = fs::create_directories(canonical_path); if (success) { output_dir = canonical_path; + path_str = canonical_path.string(); + } else { std::cerr << "Warning: Failed to create output directory: " << canonical_path << std::endl; @@ -513,15 +517,17 @@ inline bool PostProcessingFileManager::EnsureDirectoryExists(fs::path& output_di } // Broadcast the potentially updated output_dir to all ranks - std::string path_str = output_dir.string(); int dir_length = static_cast(path_str.length()); MPI_Bcast(&dir_length, 1, MPI_INT, 0, comm); - path_str.resize(static_cast(dir_length)); - MPI_Bcast(&path_str[0], dir_length, MPI_CHAR, 0, comm); - output_dir = path_str; + if (dir_length > 0) { + path_str.resize(static_cast(dir_length)); + MPI_Bcast(path_str.data(), dir_length, MPI_CHAR, 0, comm); + output_dir = path_str; + } bool success_t = false; - MPI_Allreduce(&success, &success_t, 1, MPI_C_BOOL, MPI_LOR, comm); + MPI_Bcast(&success, 1, MPI_C_BOOL, 0, comm); + success_t = success; return success_t; } diff --git a/src/solvers/trust_region_solver.cpp b/src/solvers/trust_region_solver.cpp new file mode 100644 index 0000000..77fd9fe --- /dev/null +++ b/src/solvers/trust_region_solver.cpp @@ -0,0 +1,350 @@ +// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and +// other ExaConstit Project Developers. See the top-level LICENSE file for details. +// +// SPDX-License-Identifier: MIT + +#include "solvers/trust_region_solver.hpp" + +#include "utilities/mechanics_log.hpp" +#include "utilities/unified_logger.hpp" + +#include "mfem.hpp" +#include "mfem/general/globals.hpp" +#include "mfem/linalg/linalg.hpp" + +#include +#include +#include +#include + +/** + * @brief Compute the Powell dogleg step inside the trust region. + * + * @details Step-by-step algorithm: + * + * 1. **Full Newton step inside trust region**: + * If ||s_N|| <= delta, take the full Newton step. The predicted residual + * is zero (the linear model F + J*s_N = 0 is exactly satisfied). + * + * 2. **Cauchy point outside trust region**: + * Compute the Cauchy point parameters: + * - alpha = ||g||^2 / ||J*g||^2 (optimal scaling along steepest descent) + * - ||s_sd_opt|| = alpha * ||g|| (norm of the optimal Cauchy step) + * If ||s_sd_opt|| >= delta, the optimal Cauchy point is outside the trust + * region. Step along the steepest descent direction to the boundary: + * delx = -delta * g / ||g|| + * The predicted residual norm is computed from the linear model evaluated + * at this truncated Cauchy step. + * + * 3. **Dogleg interpolation (second leg)**: + * Otherwise, interpolate along the line segment from the Cauchy point to + * the Newton point, finding the parameter beta in [0, 1] such that the + * interpolated step lies on the trust-region boundary. The intersection + * is found by solving a quadratic: + * delx(beta) = beta * s_N - (1 - beta) * alpha * g + * ||delx(beta)||^2 = delta^2 + * yielding qa*beta^2 - 2*qb*beta + qc = 0 where: + * qa = ||p||^2, qb = alpha * (p . g), qc = ||s_sd_opt||^2 - delta^2 + * and p = s_N + alpha * g. + * Beta is taken from the larger root and clamped to [0, 1] for safety. + */ +void ExaTrustRegionSolver::Dogleg(double delta, double res_0, double nr_norm, + double Jg_2, const mfem::Vector &grad, + const mfem::Vector &nrStep, mfem::Vector &delx, + double &pred_resid, bool &use_nr) const +{ + use_nr = false; + + // --- Case 1: Full Newton step fits inside the trust region --- + if (nr_norm <= delta) { + use_nr = true; + delx = nrStep; + pred_resid = 0.0; + + if (print_level > 0) { + mfem::out << "TR dogleg: taking full Newton step (||s_N|| = " + << nr_norm << " <= delta = " << delta << ")\n"; + } + return; + } + + // Cauchy point parameters using MPI-aware dot products + const double norm2_grad = Dot(grad, grad); + const double norm_grad = std::sqrt(norm2_grad); + + const double alpha = (Jg_2 > 0.0) ? (norm2_grad / Jg_2) : 1.0; + const double norm_grad_inv = (norm_grad > 0.0) ? (1.0 / norm_grad) : 1.0; + const double norm_s_sd_opt = alpha * norm_grad; + + // --- Case 2: Cauchy point is outside the trust region --- + // Take a step along the steepest descent direction to the trust-region boundary + if (norm_s_sd_opt >= delta) { + // delx = -delta * (grad / ||grad||) + const double factor = -delta * norm_grad_inv; + delx = grad; + delx *= factor; + + // Predicted residual from linear model at the truncated Cauchy step + const double val = -(delta * norm_grad) + + 0.5 * delta * delta * Jg_2 * + (norm_grad_inv * norm_grad_inv); + pred_resid = std::sqrt(std::max(2.0 * val + res_0 * res_0, 0.0)); + + if (print_level > 0) { + mfem::out << "TR dogleg: stepping along first leg (steepest descent)\n"; + } + } + // --- Case 3: Cauchy inside, Newton outside; interpolate along the second leg --- + else { + // Reuse delx as workspace for p = nrStep + alpha * grad + mfem::Vector &p = delx; + add(nrStep, alpha, grad, p); + + // Quadratic coefficients for the trust-region boundary intersection + double qa = Dot(p, p); + double qb = Dot(p, grad) * alpha; + double qc = norm_s_sd_opt * norm_s_sd_opt - delta * delta; + + double discriminant = qb * qb - qa * qc; + double beta = (qa > 0.0) + ? (qb + std::sqrt(std::max(discriminant, 0.0))) / qa + : 0.0; + + // Clamp beta to [0, 1] to handle any roundoff at the boundary + beta = std::max(0.0, std::min(1.0, beta)); + + // delx = beta * nrStep - (1 - beta) * alpha * grad + const double omb = 1.0 - beta; + const double omba = omb * alpha; + add(beta, nrStep, -omba, grad, delx); + + // Predicted residual from linear model at the dogleg step + const double res_cauchy = (Jg_2 > 0.0) + ? std::sqrt(std::max(res_0 * res_0 - alpha * norm2_grad, 0.0)) + : res_0; + pred_resid = omb * res_cauchy; + + if (print_level > 0) { + mfem::out << "TR dogleg: stepping along second leg (beta = " + << beta << ")\n"; + } + } +} + +/** + * @brief Trust-region dogleg Newton iteration implementation. + * + * @details Step-by-step algorithm for solving F(x) = b: + * + * **Initial setup**: + * 1. Validate that operator (oper_mech), preconditioner (prec_mech), and + * delta_ctrl are properly configured + * 2. Allocate all device-aware working vectors (nrStep, grad, delx, Jg_temp, + * x_prev) once before the iteration loop + * 3. Evaluate initial residual r = F(x) - b and compute its norm + * 4. Set the convergence threshold norm_max = max(rel_tol * res, abs_tol) + * 5. Initialize trust-region radius delta from delta_ctrl.deltaInit + * + * **Main iteration loop** (until convergence or max_iter): + * 1. If the previous step was *not* rejected, recompute Newton machinery: + * a. Get Jacobian J = oper_mech->GetGradient(x). The material state is + * consistent with x because Mult(x, r) was just evaluated. + * b. Compute steepest descent: grad = J^T * r (gradient of f = 0.5 ||F||^2) + * c. Compute Jg_2 = ||J * grad||^2 for the optimal Cauchy step length + * d. Solve the Newton system J*c = r via the Krylov solver (prec_mech), + * then negate: nrStep = -c. The negation matches SNLS convention where + * the Newton update is x += nrStep (whereas ExaNewtonSolver uses x -= c). + * e. Compute nr_norm = ||nrStep|| + * If the previous step *was* rejected, all of this data is still valid + * from the last accepted iteration and we just recompute the dogleg with + * the smaller delta. + * 2. Save x_prev = x for potential rollback on rejection + * 3. Compute the dogleg step delx via Dogleg() helper + * 4. Apply the trial step: x = x_prev + delx + * 5. Evaluate residual at the trial point: r = F(x) - b + * 6. Check convergence: if ||r|| <= norm_max, accept and exit + * 7. Update delta via delta_ctrl.UpdateDelta() based on actual vs predicted + * reduction. This may also flag the step for rejection. + * 8. If rejected: restore x = x_prev, restore residual norm, set reject_prev. + * The material state inside the model handles itself analogously to the + * ExaNewtonLSSolver line-search behavior — when Mult() is called again at + * the next trial point, the model recomputes from the beginning-step state. + * + * **Performance Profiling**: + * - "TR_dogleg_solver" scope for overall trust-region solver performance + * - "TR_newton_setup" scope for J^T*r and J*g computations + * - "TR_gradient_transpose" scope for the J^T*r call specifically + * - "TR_newton_solve" scope for the Krylov inner solve + * - "TR_trial_eval" scope for residual evaluations at trial points + * - "krylov_solver" scope for the actual Krylov solver call + * + * @note All scalar quantities (norms, dot products) use MFEM's MPI-aware + * Norm() and Dot() functions through the IterativeSolver base class + */ +void ExaTrustRegionSolver::Mult(const mfem::Vector &b, mfem::Vector &x) const +{ + CALI_CXX_MARK_SCOPE("TR_dogleg_solver"); + MFEM_ASSERT_0(oper_mech, "the Operator is not set (use SetOperator)."); + MFEM_ASSERT_0(prec_mech, "the Solver is not set (use SetSolver)."); + MFEM_ASSERT(delta_ctrl.Validate(), "TrDeltaControl parameters are invalid."); + + const bool have_b = (b.Size() == Height()); + + // --- Allocate working vectors once, reused across iterations --- + mfem::Vector nrStep(width, mfem::Device::GetMemoryType()); + mfem::Vector grad(width, mfem::Device::GetMemoryType()); + mfem::Vector delx(width, mfem::Device::GetMemoryType()); + mfem::Vector Jg_temp(width, mfem::Device::GetMemoryType()); + mfem::Vector x_prev(width, mfem::Device::GetMemoryType()); + + nrStep.UseDevice(true); + grad.UseDevice(true); + delx.UseDevice(true); + Jg_temp.UseDevice(true); + x_prev.UseDevice(true); + + // --- Initial residual evaluation: r = F(x) - b --- + oper_mech->Mult(x, r); + if (have_b) { r -= b; } + + double res = Norm(r); + double res_0 = res; + const double norm_max = std::max(rel_tol * res, abs_tol); + + if (print_level >= 0) { + mfem::out << "TR dogleg: initial ||r|| = " << res << "\n"; + } + + if (res <= norm_max) { + converged = true; + final_iter = 0; + final_norm = res; + return; + } + + // --- Initialize trust-region state --- + double delta = delta_ctrl.deltaInit; + double rho = 0.0; + bool reject_prev = false; + + // Persisted across iterations when a step is not rejected + double Jg_2 = 0.0; + double nr_norm = 0.0; + + int it = 0; + converged = false; + + // --- Main iteration loop --- + while (it < max_iter) { + it++; + + // If the previous step was not rejected, recompute Newton direction + // and steepest descent direction at the current x. The Jacobian data + // is current because oper_mech->Mult(x, r) was just called. + if (!reject_prev) { + CALI_CXX_MARK_SCOPE("TR_newton_setup"); + + mfem::Operator &J = oper_mech->GetGradient(x); + + // Steepest descent direction: grad = J^T * r + // This is the gradient of the merit function f(x) = 0.5 * ||F(x)||^2 + { + CALI_CXX_MARK_SCOPE("TR_gradient_transpose"); + J.MultTranspose(r, grad); + } + + // Compute ||J * grad||^2 for the optimal Cauchy step length + // alpha_cauchy = ||grad||^2 / ||J*grad||^2 + { + J.Mult(grad, Jg_temp); + Jg_2 = Dot(Jg_temp, Jg_temp); + } + + // Solve Newton system: J * c = r, then nrStep = -c + // CGSolver follows the same convention as ExaNewtonSolver where the + // Krylov solve produces c such that the Newton update would be x -= c. + // For the dogleg we need nrStep = -J^{-1}*r, so we negate after the solve. + { + CALI_CXX_MARK_SCOPE("TR_newton_solve"); + c = 0.0; + this->CGSolver(J, r, c); + nrStep = c; + nrStep.Neg(); + } + + nr_norm = Norm(nrStep); + } + + // Save state for potential step rejection + x_prev = x; + + // Compute the dogleg step + double pred_resid = 0.0; + bool use_nr = false; + Dogleg(delta, res_0, nr_norm, Jg_2, grad, nrStep, + delx, pred_resid, use_nr); + + // Apply the trial step: x = x_prev + delx + x = x_prev; + x += delx; + + // Evaluate residual at the trial point + reject_prev = false; + { + CALI_CXX_MARK_SCOPE("TR_trial_eval"); + oper_mech->Mult(x, r); + if (have_b) { r -= b; } + } + + res = Norm(r); + + if (print_level >= 0) { + mfem::out << "TR dogleg: iter " << it + << ", ||r|| = " << res + << ", delta = " << delta + << (use_nr ? " [NR]" : " [DL]") + << "\n"; + } + + // Check convergence + if (res <= norm_max) { + converged = true; + break; + } + + // Update delta from actual vs predicted reduction. May flag for rejection. + bool delta_ok = delta_ctrl.UpdateDelta( + delta, res, res_0, pred_resid, reject_prev, + use_nr, nr_norm, rho, print_level); + + if (!delta_ok) { + if (print_level >= 0) { + mfem::out << "TR dogleg: delta control failure at iter " << it << "\n"; + } + converged = false; + break; + } + + // If the step is rejected, revert x and residual. + // On the next iteration, reject_prev == true so we skip the Newton solve + // and recompute the dogleg with the updated (smaller) delta. The Jacobian, + // grad, nrStep, and Jg_2 are still valid from the last accepted state. + if (reject_prev) { + if (print_level > 0) { + mfem::out << "TR dogleg: rejecting step, reverting to previous state\n"; + } + x = x_prev; + res = res_0; + } + + res_0 = res; + } + + final_iter = it; + final_norm = res; + + if (!converged && print_level >= 0) { + mfem::out << "TR dogleg: failed to converge in " << it + << " iterations, final ||r|| = " << res << "\n"; + } +} \ No newline at end of file diff --git a/src/solvers/trust_region_solver.hpp b/src/solvers/trust_region_solver.hpp new file mode 100644 index 0000000..43950d3 --- /dev/null +++ b/src/solvers/trust_region_solver.hpp @@ -0,0 +1,357 @@ +// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and +// other ExaConstit Project Developers. See the top-level LICENSE file for details. +// +// SPDX-License-Identifier: MIT +#pragma once + +#include "solvers/mechanics_solver.hpp" + +#include "mfem.hpp" +#include "mfem/linalg/solvers.hpp" + +#include +#include +#include + +/** + * @brief Trust-region radius control parameters for the dogleg solver. + * + * @details Ported from SNLS's TrDeltaControl. Controls how the trust-region + * radius delta is updated based on the ratio rho = actual_reduction / predicted_reduction. + * + * The update logic: + * - If rho is in the "good" band [xiLG, xiUG] and the step reduced the residual, + * increase delta (unless the full Newton step was taken) + * - If rho is outside the "ok" band [xiLO, xiUO], decrease delta + * - If the predicted change is zero and delta is not at max, force a small increase + * - If the residual actually increased, reject the step + * + * @ingroup ExaConstit_solvers + */ +struct TrDeltaControl +{ + /// @brief Lower bound of the "good" rho interval (increase delta when rho > xiLG) + double xiLG = 0.75; + /// @brief Upper bound of the "good" rho interval + double xiUG = 1.4; + /// @brief Factor by which to increase delta + double xiIncDelta = 1.5; + /// @brief Lower bound of the "ok" rho interval (decrease delta when rho < xiLO) + double xiLO = 0.35; + /// @brief Upper bound of the "ok" rho interval (decrease delta when rho > xiUO) + double xiUO = 5.0; + /// @brief Factor by which to decrease delta + double xiDecDelta = 0.25; + /// @brief Forced increase factor when predicted change is zero + double xiForcedIncDelta = 1.2; + /// @brief Initial trust-region radius + double deltaInit = 1.0; + /// @brief Minimum allowed trust-region radius (solver fails if hit) + double deltaMin = 1e-12; + /// @brief Maximum allowed trust-region radius + double deltaMax = 1e4; + /// @brief Whether to reject steps that increase the residual + bool rejectResIncrease = true; + + /** + * @brief Validate that the control parameters are self-consistent. + * + * @return true if all parameter relationships are valid, false otherwise + * + * Verifies the following invariants: + * - deltaMin > 0 and deltaMax > deltaMin + * - The "good" rho band [xiLG, xiUG] sits inside the "ok" band [xiLO, xiUO] + * - The increase factor (xiIncDelta) is greater than 1 + * - The decrease factor (xiDecDelta) is in (0, 1) + * - The forced-increase factor is greater than 1 + */ + bool Validate() const + { + return (deltaMin > 0.0) && + (deltaMax > deltaMin) && + (xiLG > xiLO) && + (xiUG < xiUO) && + (xiIncDelta > 1.0) && + (xiDecDelta > 0.0 && xiDecDelta < 1.0) && + (xiForcedIncDelta > 1.0); + } + + /** + * @brief Decrease the trust-region radius after a rejected/poor step. + * + * @param[in,out] delta Current radius, modified on output + * @param[in] norm_full Norm of the full Newton step + * @param[in] took_full Whether the full Newton step was used at the last iteration + * @param[in] print_level Verbosity level for output + * @return true if delta is still above deltaMin, false if solver should fail + * + * @details If the full Newton step was taken, uses a geometric mean blend of + * the current delta and the Newton step norm scaled by xiDecDelta. Otherwise + * just multiplies delta by xiDecDelta. Returns false (and sets delta to deltaMin) + * if the resulting delta drops below the minimum allowed value. + */ + bool DecrDelta(double &delta, double norm_full, bool took_full, + int print_level = 0) const + { + if (took_full) { + double tempa = delta * xiDecDelta; + double tempb = norm_full * xiDecDelta; + delta = std::sqrt(tempa * tempb); + } + else { + delta *= xiDecDelta; + } + + if (delta < deltaMin) { + delta = deltaMin; + if (print_level >= 0) { + mfem::out << "TR: delta at minimum " << delta << "\n"; + } + return false; + } + + if (print_level > 0) { + mfem::out << "TR: decreased delta to " << delta << "\n"; + } + return true; + } + + /** + * @brief Increase the trust-region radius after a successful step. + * + * @param[in,out] delta Current radius, modified on output + * @param[in] print_level Verbosity level for output + * + * @details Multiplies delta by xiIncDelta and clamps at deltaMax. + */ + void IncrDelta(double &delta, int print_level = 0) const + { + delta *= xiIncDelta; + if (delta > deltaMax) { + delta = deltaMax; + if (print_level > 0) { + mfem::out << "TR: delta at maximum " << delta << "\n"; + } + } + else if (print_level > 0) { + mfem::out << "TR: increased delta to " << delta << "\n"; + } + } + + /** + * @brief Update trust-region radius based on actual vs predicted residual change. + * + * @param[in,out] delta Trust-region radius, modified on output + * @param[in] res New residual norm (after the candidate step) + * @param[in] res_0 Previous residual norm (before the candidate step) + * @param[in] pred_resid Predicted residual norm from the dogleg model + * @param[out] reject Whether the step should be rejected (residual increased) + * @param[in] took_full Whether the full Newton step was taken + * @param[in] norm_full Norm of the full Newton step + * @param[out] rho Actual / predicted reduction ratio (output for diagnostics) + * @param[in] print_level Verbosity level for output + * @return true if the delta update succeeded, false if the solver should fail + * + * @details Algorithm (ported from SNLS TrDeltaControl::updateDelta): + * 1. Compute actual_change = res - res_0 and pred_change = pred_resid - res_0 + * 2. If pred_change is exactly zero, force delta larger (or fail if at max) + * 3. Otherwise compute rho = actual_change / pred_change + * 4. If rho is in the "good" band [xiLG, xiUG] and the residual decreased, + * increase delta (unless the full Newton step was already taken) + * 5. If rho is outside the "ok" band [xiLO, xiUO], decrease delta + * 6. If the residual increased and rejectResIncrease is set, mark for rejection + */ + bool UpdateDelta(double &delta, double res, double res_0, + double pred_resid, bool &reject, bool took_full, + double norm_full, double &rho, + int print_level = 0) const + { + bool success = true; + double actual_change = res - res_0; + double pred_change = pred_resid - res_0; + + if (pred_change == 0.0) { + if (delta >= deltaMax) { + if (print_level >= 0) { + mfem::out << "TR: predicted change is zero and delta at max\n"; + } + success = false; + } + else { + if (print_level > 0) { + mfem::out << "TR: predicted change is zero, forcing delta larger\n"; + } + delta = std::min(delta * xiForcedIncDelta, deltaMax); + } + } + else { + rho = actual_change / pred_change; + if (print_level > 0) { + mfem::out << "TR: rho = " << rho << "\n"; + } + + if ((rho > xiLG) && (actual_change < 0.0) && (rho < xiUG)) { + // Step is in the "good" band and residual actually decreased + if (!took_full) { + IncrDelta(delta, print_level); + } + } + else if ((rho < xiLO) || (rho > xiUO)) { + // Step quality is outside the acceptable band; shrink delta + success = DecrDelta(delta, norm_full, took_full, print_level); + } + } + + reject = false; + // Do not make this >=, may have res and res_0 both zero and that is ok + if ((actual_change > 0.0) && rejectResIncrease) { + reject = true; + } + + return success; + } +}; + +/** + * @brief Trust-region dogleg solver for nonlinear solid mechanics problems. + * + * @details This class implements a Powell-dogleg trust-region method for solving + * nonlinear systems F(x) = b. It extends ExaNewtonSolver and reuses the same + * Krylov solver infrastructure (prec_mech) for computing the Newton direction. + * + * The trust-region method augments standard Newton with a globalization strategy + * that interpolates between the steepest descent direction and the full Newton + * step, constrained to a trust-region radius delta. Step quality is monitored + * via the ratio rho = actual_reduction / predicted_reduction, and delta is + * adjusted up or down accordingly. + * + * This is a direct port of SNLS's SNLSTrDlDenseG solver, lifted from the + * material-point dense system to the global FE system. + * + * Algorithm at each iteration: + * 1. Compute steepest descent direction g = J^T * r (gradient of merit f = 0.5 ||F||^2) + * 2. Compute ||J*g||^2 for the optimal Cauchy step length + * 3. Solve J * c = r for the full Newton direction (using prec_mech Krylov solver) + * 4. Compute the dogleg step within the trust region + * 5. Evaluate the residual at the trial point + * 6. Accept or reject based on the rho ratio; update delta accordingly + * + * Requirements: + * - The gradient operator must support MultTranspose (for J^T*r computation). + * This means the assembly mode must be EA, FA, or PA with the native PA + * transpose kernels enabled. + * + * @ingroup ExaConstit_solvers + */ +class ExaTrustRegionSolver : public ExaNewtonSolver +{ + public: + /** + * @brief Default constructor + * + * @details Creates an ExaTrustRegionSolver instance for single-processor + * execution. The operator and linear solver must be set separately using + * SetOperator() and SetSolver(), and the trust-region control parameters + * may be customized via SetTrustRegionControl(). + */ + ExaTrustRegionSolver() { } + +#ifdef MFEM_USE_MPI + /** + * @brief MPI constructor + * + * @param _comm MPI communicator for parallel execution + * + * @details Creates an ExaTrustRegionSolver instance for parallel execution + * using the specified MPI communicator. All trust-region scalar quantities + * (norms, dot products) use MPI-aware reductions through MFEM's Dot/Norm. + */ + ExaTrustRegionSolver(MPI_Comm _comm) : ExaNewtonSolver(_comm) { } +#endif + + /** @brief Use parent class SetOperator methods */ + using ExaNewtonSolver::SetOperator; + + /** @brief Use parent class SetSolver methods */ + using ExaNewtonSolver::SetSolver; + + /** @brief Use parent class CGSolver method (Krylov solve wrapper) */ + using ExaNewtonSolver::CGSolver; + + /** + * @brief Set trust-region control parameters. + * + * @param ctrl TrDeltaControl struct with all tuning parameters + * + * @details Replaces the internal control parameters with a user-supplied + * configuration. Typically called after construction (and before Mult()) + * to wire up parameters parsed from the TOML configuration file. + */ + void SetTrustRegionControl(const TrDeltaControl &ctrl) + { + delta_ctrl = ctrl; + } + + /** + * @brief Get a mutable reference to the trust-region control parameters. + * @return Reference to the internal TrDeltaControl + */ + TrDeltaControl& GetTrustRegionControl() { return delta_ctrl; } + + /** + * @brief Get a const reference to the trust-region control parameters. + * @return Const reference to the internal TrDeltaControl + */ + const TrDeltaControl& GetTrustRegionControl() const { return delta_ctrl; } + + /** + * @brief Solve the nonlinear system F(x) = b using trust-region dogleg method. + * + * @param b Right-hand side vector (if b.Size() != Height(), assumes b = 0) + * @param x Solution vector (input: initial guess, output: converged solution) + * + * @details Implements the trust-region dogleg algorithm. See class-level + * documentation for the algorithm description. The Newton direction is + * computed by the Krylov solver wired in via SetSolver(); J^T*r is + * computed by calling MultTranspose() on the gradient operator. + * + * @pre SetOperator() and SetSolver() must be called before Mult() + * @pre The gradient operator must support MultTranspose (EA/FA mode, or + * PA mode with native transpose kernels) + * + * @post final_iter contains the number of iterations performed + * @post final_norm contains the final residual norm + * @post converged flag indicates whether the solver converged + */ + virtual void Mult(const mfem::Vector &b, mfem::Vector &x) const; + + private: + /** + * @brief Compute the dogleg step given the current trust-region radius. + * + * @param[in] delta Trust-region radius + * @param[in] res_0 Current residual norm + * @param[in] nr_norm Norm of the full Newton step + * @param[in] Jg_2 ||J*g||^2 where g is the steepest descent direction + * @param[in] grad Steepest descent direction g = J^T * r + * @param[in] nrStep Full Newton step + * @param[out] delx The computed dogleg step + * @param[out] pred_resid Predicted residual norm after the step + * @param[out] use_nr Whether the full Newton step was taken + * + * @details Ported from SNLS's dogleg() kernel. The dogleg path interpolates + * between the steepest descent direction (Cauchy point) and the full Newton + * step. Three cases are handled: + * - Newton step inside delta: take full Newton step + * - Cauchy point outside delta: step along steepest descent to boundary + * - Cauchy inside, Newton outside: solve quadratic for the dogleg leg + * intersection with the trust-region boundary + */ + void Dogleg(double delta, double res_0, double nr_norm, + double Jg_2, const mfem::Vector &grad, + const mfem::Vector &nrStep, mfem::Vector &delx, + double &pred_resid, bool &use_nr) const; + + /// @brief Trust-region control parameters (mutable to allow tuning) + mutable TrDeltaControl delta_ctrl; +}; \ No newline at end of file diff --git a/src/system_driver.cpp b/src/system_driver.cpp index 15f4e2b..d7a6934 100644 --- a/src/system_driver.cpp +++ b/src/system_driver.cpp @@ -3,6 +3,7 @@ #include "boundary_conditions/BCData.hpp" #include "boundary_conditions/BCManager.hpp" +#include "solvers/trust_region_solver.hpp" #include "utilities/mechanics_kernels.hpp" #include "utilities/mechanics_log.hpp" #include "utilities/unified_logger.hpp" @@ -45,6 +46,13 @@ void DirBdrFunc(int attr_id, mfem::Vector& y) { namespace { +void GetTrueDofsParallel(const mfem::ParGridFunction& gf, mfem::Vector& true_dofs) { + // used to do something like: + // gf.GetTrueDofs(true_dofs); + // but looks like there are issues with that on the GPUs with newer versions of MFEM + gf.ParallelAverage(true_dofs); +} + /** * @brief Helper function to find mesh bounding box for velocity gradient calculations * @@ -290,29 +298,11 @@ SystemDriver::SystemDriver(std::shared_ptr sim_state) } else { if (linear_solvers.preconditioner == PreconditionerType::AMG) { auto prec_amg = std::make_shared(); - HYPRE_Solver h_amg = static_cast(*prec_amg); - HYPRE_Real st_val = 0.90; - HYPRE_Real rt_val = -10.0; - // HYPRE_Real om_val = 1.0; - // - [[maybe_unused]] int ml = HYPRE_BoomerAMGSetMaxLevels(h_amg, 30); - ml = HYPRE_BoomerAMGSetCoarsenType(h_amg, 0); - ml = HYPRE_BoomerAMGSetMeasureType(h_amg, 0); - ml = HYPRE_BoomerAMGSetStrongThreshold(h_amg, st_val); - ml = HYPRE_BoomerAMGSetNumSweeps(h_amg, 3); - ml = HYPRE_BoomerAMGSetRelaxType(h_amg, 8); - // int rwt = HYPRE_BoomerAMGSetRelaxWt(h_amg, rt_val); - // int ro = HYPRE_BoomerAMGSetOuterWt(h_amg, om_val); - // Dimensionality of our problem - ml = HYPRE_BoomerAMGSetNumFunctions(h_amg, 3); - ml = HYPRE_BoomerAMGSetSmoothType(h_amg, 6); - ml = HYPRE_BoomerAMGSetSmoothNumLevels(h_amg, 3); - ml = HYPRE_BoomerAMGSetSmoothNumSweeps(h_amg, 3); - ml = HYPRE_BoomerAMGSetVariant(h_amg, 0); - ml = HYPRE_BoomerAMGSetOverlap(h_amg, 0); - ml = HYPRE_BoomerAMGSetDomainType(h_amg, 1); - ml = HYPRE_BoomerAMGSetSchwarzRlxWeight(h_amg, rt_val); - + const int problem_dim = m_sim_state->GetMesh()->SpaceDimension(); + const bool order_bynodes = (fe_space->GetOrdering() == mfem::Ordering::byNODES); + // Use MFEM's supported systems-AMG configuration so Hypre sees + // the correct vector-valued DOF ordering on newer MFEM/Hypre builds. + prec_amg->SetSystemsOptions(problem_dim, order_bynodes); prec_amg->SetPrintLevel(linear_solvers.print_level); J_prec = prec_amg; } else if (linear_solvers.preconditioner == PreconditionerType::ILU) { @@ -358,10 +348,47 @@ SystemDriver::SystemDriver(std::shared_ptr sim_state) if (nonlinear_solver.nl_solver == NonlinearSolverType::NR) { newton_solver = std::make_unique( m_sim_state->GetMeshParFiniteElementSpace()->GetComm()); - } else if (nonlinear_solver.nl_solver == NonlinearSolverType::NRLS) { + } + else if (nonlinear_solver.nl_solver == NonlinearSolverType::NRLS) { newton_solver = std::make_unique( m_sim_state->GetMeshParFiniteElementSpace()->GetComm()); } + else if (nonlinear_solver.nl_solver == NonlinearSolverType::TRDOG) { + // Build the trust-region dogleg solver and configure delta-control + // parameters from the parsed TOML options. If the user did not supply + // a [trust_region] sub-table, the solver's internal defaults (matching + // SNLS's TrDeltaControl defaults) are used. + auto tr_solver = std::make_unique( + m_sim_state->GetMeshParFiniteElementSpace()->GetComm()); + + if (nonlinear_solver.trust_region.has_value()) { + const auto& tr_opts = nonlinear_solver.trust_region.value(); + TrDeltaControl ctrl; + ctrl.deltaInit = tr_opts.delta_init; + ctrl.deltaMin = tr_opts.delta_min; + ctrl.deltaMax = tr_opts.delta_max; + ctrl.xiLG = tr_opts.xi_lg; + ctrl.xiUG = tr_opts.xi_ug; + ctrl.xiLO = tr_opts.xi_lo; + ctrl.xiUO = tr_opts.xi_uo; + ctrl.xiIncDelta = tr_opts.xi_inc; + ctrl.xiDecDelta = tr_opts.xi_dec; + ctrl.xiForcedIncDelta = tr_opts.xi_forced_inc; + ctrl.rejectResIncrease = tr_opts.reject_increase; + tr_solver->SetTrustRegionControl(ctrl); + } + + newton_solver = std::move(tr_solver); + + // Sanity check: TRDOG requires gradient transpose support (J^T*r). For + // PA mode, this requires the native PA transpose kernels in the + // integrator. EA and FULL always support transpose. We warn rather than + // hard-fail here because PA support exists once the kernels are wired. + if (options.solvers.assembly == AssemblyType::PA) { + mfem::out << "Note: TRDOG with PA assembly requires native PA transpose " + << "kernels in the gradient operator.\n"; + } + } // Set the newton solve parameters newton_solver->iterative_mode = true; @@ -498,7 +525,7 @@ void SystemDriver::UpdateVelocity() { // pulled off the // VectorFunctionRestrictedCoefficient // populate the solution vector, v_sol, with the true dofs entries in v_cur. - velocity->GetTrueDofs(*vel_tdofs); + GetTrueDofsParallel(*velocity, *vel_tdofs); } if (ess_bdr["ess_vgrad"].Sum() > 0) { @@ -587,7 +614,7 @@ void SystemDriver::UpdateVelocity() { mfem::Vector vel_tdof_tmp(*vel_tdofs); vel_tdof_tmp.UseDevice(true); vel_tdof_tmp = 0.0; - velocity->GetTrueDofs(vel_tdof_tmp); + GetTrueDofsParallel(*velocity, vel_tdof_tmp); mfem::Array ess_tdofs(mech_operator->GetEssentialTrueDofs()); if (!mono_def_flag) { diff --git a/src/utilities/mechanics_kernels.hpp b/src/utilities/mechanics_kernels.hpp index e7d139a..bcb21cf 100644 --- a/src/utilities/mechanics_kernels.hpp +++ b/src/utilities/mechanics_kernels.hpp @@ -542,7 +542,7 @@ double ComputeVolAvgTensorFilterFromPartial(const mfem::expt::PartialQuadratureF // Get the local-to-global element mapping and data layout info auto l2g = pqs->GetLocal2Global().Read(); // Maps local element index to global element index - auto loc_offsets = pqs->getOffsets().Read(); // Offsets for local data layout + auto loc_offsets = pqs->Offsets(mfem::QSpaceOffsetStorage::COMPRESSED).Read(); // Offsets for local data layout auto global_offsets = (pqs->GetGlobalOffset().Size() > 1) ? pqs->GetGlobalOffset().Read() : loc_offsets; // Offsets for global data layout @@ -763,7 +763,7 @@ double ComputeVolAvgTensorFromPartial(const mfem::expt::PartialQuadratureFunctio // Get the local-to-global element mapping and data layout info auto l2g = pqs->GetLocal2Global().Read(); // Maps local element index to global element index - auto loc_offsets = pqs->getOffsets().Read(); // Offsets for local data layout + auto loc_offsets = pqs->Offsets(mfem::QSpaceOffsetStorage::COMPRESSED).Read(); // Offsets for local data layout auto global_offsets = (pqs->GetGlobalOffset().Size() > 1) ? pqs->GetGlobalOffset().Read() : loc_offsets; // Offsets for global data layout