diff --git a/gtsam/base/SymmetricBlockMatrix.cpp b/gtsam/base/SymmetricBlockMatrix.cpp index 6e67418757..96149ab538 100644 --- a/gtsam/base/SymmetricBlockMatrix.cpp +++ b/gtsam/base/SymmetricBlockMatrix.cpp @@ -89,20 +89,30 @@ void SymmetricBlockMatrix::choleskyPartial(DenseIndex nFrontals) { } /* ************************************************************************* */ -VerticalBlockMatrix SymmetricBlockMatrix::split(DenseIndex nFrontals) { +void SymmetricBlockMatrix::split(DenseIndex nFrontals, + VerticalBlockMatrix* RSd) { gttic(VerticalBlockMatrix_split); + assert(RSd); // Construct a VerticalBlockMatrix that contains [R Sd] - const size_t n1 = offset(nFrontals); - VerticalBlockMatrix RSd = VerticalBlockMatrix::LikeActiveViewOf(*this, n1); + const DenseIndex n1 = offset(nFrontals); + assert(RSd->rows() == n1); + assert(RSd->cols() == cols()); + assert(RSd->nBlocks() == nBlocks()); // Copy into it. - RSd.full() = matrix_.topRows(n1); - RSd.full().triangularView().setZero(); + RSd->full() = matrix_.topRows(n1); + RSd->full().triangularView().setZero(); // Take lower-right block of Ab_ to get the remaining factor blockStart() = nFrontals; +} +VerticalBlockMatrix SymmetricBlockMatrix::split(DenseIndex nFrontals) { + // Construct a VerticalBlockMatrix that contains [R Sd] + const DenseIndex n1 = offset(nFrontals); + VerticalBlockMatrix RSd = VerticalBlockMatrix::LikeActiveViewOf(*this, n1); + split(nFrontals, &RSd); return RSd; } @@ -126,6 +136,30 @@ void SymmetricBlockMatrix::updateFromMappedBlocks( } } +/* ************************************************************************* */ +void SymmetricBlockMatrix::updateFromOuterProductBlocks( + const VerticalBlockMatrix& other, + const std::vector& blockIndices) { + assert(static_cast(blockIndices.size()) == other.nBlocks()); + const DenseIndex otherBlocks = other.nBlocks(); + for (DenseIndex i = 0; i < otherBlocks; ++i) { + const DenseIndex I = blockIndices[i]; + if (I < 0) continue; + assert(I < nBlocks()); + const auto Si = other(i); + Matrix diag = Si.transpose() * Si; + updateDiagonalBlock(I, diag); + for (DenseIndex j = i + 1; j < otherBlocks; ++j) { + const DenseIndex J = blockIndices[j]; + if (J < 0) continue; + assert(J < nBlocks()); + const auto Sj = other(j); + Matrix off = Si.transpose() * Sj; + updateOffDiagonalBlock(I, J, off); + } + } +} + /* ************************************************************************* */ } //\ namespace gtsam diff --git a/gtsam/base/SymmetricBlockMatrix.h b/gtsam/base/SymmetricBlockMatrix.h index c6a767012d..97bf3bd817 100644 --- a/gtsam/base/SymmetricBlockMatrix.h +++ b/gtsam/base/SymmetricBlockMatrix.h @@ -234,6 +234,19 @@ namespace gtsam { } } + /// Add a vector to the diagonal entries of block I. + void addToDiagonalBlock(DenseIndex I, const Vector& deltaDiag) { + auto dest = block_(I, I); + assert(dest.rows() == deltaDiag.size()); + dest.diagonal().array() += deltaDiag.array(); + } + + /// Add lambda * I to the diagonal block I. + void addScaledIdentity(DenseIndex I, double lambda) { + auto dest = block_(I, I); + dest.diagonal().array() += lambda; + } + /// Update an off diagonal block. /// NOTE(emmett): This assumes noalias(). template @@ -251,6 +264,12 @@ namespace gtsam { void updateFromMappedBlocks(const SymmetricBlockMatrix& other, const std::vector& blockIndices); + /// Update this matrix with blockwise outer products from a vertical block matrix. + /// Adds S_i^T S_j into block (I,J), using a block mapping; entries with index -1 are skipped. + /// The range to use is controlled by other.firstBlock(). + void updateFromOuterProductBlocks(const VerticalBlockMatrix& other, + const std::vector& blockIndices); + /// @} /// @name Accessing the full matrix. /// @{ @@ -317,6 +336,9 @@ namespace gtsam { */ VerticalBlockMatrix split(DenseIndex nFrontals); + /// I n-place version of split. + void split(DenseIndex nFrontals, VerticalBlockMatrix* RSd); + protected: /// Number of offsets in the full matrix. diff --git a/gtsam/base/tests/testSymmetricBlockMatrix.cpp b/gtsam/base/tests/testSymmetricBlockMatrix.cpp index 4989ea1d84..b2049ada26 100644 --- a/gtsam/base/tests/testSymmetricBlockMatrix.cpp +++ b/gtsam/base/tests/testSymmetricBlockMatrix.cpp @@ -17,6 +17,7 @@ #include #include +#include using namespace std; using namespace gtsam; @@ -187,6 +188,29 @@ TEST(SymmetricBlockMatrix, UpdateFromMappedBlocks) Matrix(doubled.selfadjointView()))); } +/* ************************************************************************* */ +// Update via blockwise outer products from a VerticalBlockMatrix view. +TEST(SymmetricBlockMatrix, UpdateFromOuterProductBlocks) +{ + const std::vector vbmDims{2, 1}; + VerticalBlockMatrix vbm(vbmDims, 4, true); + vbm.matrix() = (Matrix(4, 4) << + 1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12, + 13, 14, 15, 16).finished(); + + const std::vector destDims{1}; + SymmetricBlockMatrix actual(destDims, true); + actual.setZero(); + const std::vector mapping{0, 1}; + const Matrix S = vbm.range(1, 3); + const Matrix expected = S.transpose() * S; + vbm.firstBlock() = 1; + actual.updateFromOuterProductBlocks(vbm, mapping); + EXPECT(assert_equal(expected, Matrix(actual.selfadjointView()))); +} + /* ************************************************************************* */ // In-place inversion path. TEST(SymmetricBlockMatrix, inverseInPlace) { diff --git a/gtsam/linear/MultifrontalClique.cpp b/gtsam/linear/MultifrontalClique.cpp index 29397a52b1..b0a797447c 100644 --- a/gtsam/linear/MultifrontalClique.cpp +++ b/gtsam/linear/MultifrontalClique.cpp @@ -16,6 +16,7 @@ * @date December 2025 */ +#include #include #include #include @@ -81,9 +82,8 @@ bool validateFactorKeys(const GaussianFactorGraph& graph, MultifrontalClique::MultifrontalClique( std::vector factorIndices, const std::weak_ptr& parent, const KeyVector& frontals, - const KeySet& separatorKeys, const KeyDimMap& dims, - const GaussianFactorGraph& graph, VectorValues* solution, - const std::unordered_set* fixedKeys) { + const KeySet& separatorKeys, const KeyDimMap& dims, size_t vbmRows, + VectorValues* solution, const std::unordered_set* fixedKeys) { factorIndices_ = std::move(factorIndices); this->parent = parent; fixedKeys_ = fixedKeys; @@ -111,10 +111,8 @@ MultifrontalClique::MultifrontalClique( cacheSolutionPointers(solution, frontals, separatorKeys); // Pre-allocate matrices once per structure. - std::vector blockDims = - this->blockDims(dims, frontals, separatorKeys); - size_t vbmRows = countRows(graph); - initializeMatrices(blockDims, vbmRows); + blockDims_ = this->blockDims(dims, frontals, separatorKeys); + initializeMatrices(blockDims_, vbmRows); } void MultifrontalClique::finalize(std::vector children) { @@ -136,6 +134,8 @@ void MultifrontalClique::finalize(std::vector children) { indices.push_back(static_cast(orderedKeys_.size())); child.clique->setParentIndices(indices); } + + // Allocation is deferred until load. } DenseIndex MultifrontalClique::blockIndex(Key key) const { @@ -169,24 +169,18 @@ std::vector MultifrontalClique::blockDims( return blockDims; } -size_t MultifrontalClique::countRows(const GaussianFactorGraph& graph) const { - size_t vbmRows = 0; - for (size_t index : factorIndices_) { - assert(index < graph.size()); - if (auto jacobianFactor = - std::dynamic_pointer_cast(graph[index])) { - vbmRows += jacobianFactor->rows(); - } - } - return vbmRows; -} - void MultifrontalClique::initializeMatrices( const std::vector& blockDims, size_t verticalBlockMatrixRows) { - sbm_ = SymmetricBlockMatrix(blockDims, true); Ab_ = VerticalBlockMatrix(blockDims, verticalBlockMatrixRows, true); // Ab's structure is fixed; clear it once and reuse across loads. Ab_.matrix().setZero(); + RSd_ = + VerticalBlockMatrix(blockDims, static_cast(frontalDim), true); +} + +void MultifrontalClique::allocateSbm() { + if (sbm_.nBlocks() > 0) return; + sbm_ = SymmetricBlockMatrix(blockDims_, true); } size_t MultifrontalClique::addJacobianFactor( @@ -233,8 +227,8 @@ void MultifrontalClique::addHessianFactor(const HessianFactor& hessianFactor) { void MultifrontalClique::fillAb(const GaussianFactorGraph& graph) { assert(validateFactorKeys(graph, factorIndices_, orderedKeys_, fixedKeys_)); - sbm_.setZero(); // Zero the active SBM once before accumulating factors. + hessianFactors_.clear(); size_t rowOffset = 0; for (size_t index : factorIndices_) { assert(index < graph.size()); @@ -244,53 +238,119 @@ void MultifrontalClique::fillAb(const GaussianFactorGraph& graph) { rowOffset += addJacobianFactor(*jacobianFactor, rowOffset); } else if (auto hessianFactor = std::dynamic_pointer_cast(gf)) { - addHessianFactor(*hessianFactor); + hessianFactors_.push_back(hessianFactor); } } + + // Lock in QR only for leaf cliques with no Hessian factors. + const bool isLeaf = children.empty(); + const bool useQR = + isLeaf && hessianFactors_.empty() && (frontalDim > 0) && + (frontalDim + separatorDim > kQrAspectRatio * frontalDim) && + (Ab_.matrix().rows() >= static_cast(frontalDim)); + solveMode_ = useQR ? SolveMode::QrLeaf : SolveMode::Cholesky; + RSdReady_ = false; + assert((useQR && RSd_.matrix().rows() == + static_cast(Ab_.matrix().rows()) || + RSd_.matrix().rows() == static_cast(frontalDim))); + allocateSbm(); } -void MultifrontalClique::eliminateInPlace() { - // Update SBM with the local factors, Ab^T * Ab - sbm_.selfadjointView().rankUpdate(Ab_.matrix().transpose()); +void MultifrontalClique::prepareForElimination() { + // QR leaf cliques skip SBM assembly entirely. + if (useQR()) return; + assert(sbm_.nBlocks() > 0); + sbm_.setZero(); + for (const auto& hf : hessianFactors_) { + addHessianFactor(*hf); + } + + if (Ab_.matrix().rows() > 0) { + sbm_.selfadjointView().rankUpdate(Ab_.matrix().transpose()); + } for (const auto& child : children) { if (!child) continue; child->updateParent(*this); } +} + +void MultifrontalClique::factorize() { + if (useQR()) { + // Copy Ab_ to preserve its invariant; QR writes in place. + assert(RSd_.matrix().rows() == Ab_.matrix().rows()); + assert(RSd_.matrix().cols() == Ab_.matrix().cols()); + RSd_.matrix() = Ab_.matrix(); + inplace_QR(RSd_.matrix()); + } else { + sbm_.choleskyPartial(numFrontals()); + sbm_.split(numFrontals(), &RSd_); + sbm_.blockStart() = 0; + } + RSdReady_ = true; +} + +void MultifrontalClique::addIdentityDamping(double lambda) { + const size_t nf = numFrontals(); + for (size_t j = 0; j < nf; ++j) { + sbm_.addScaledIdentity(j, lambda); + } +} - // Form normal equations and factor the frontal block (Schur complement step). - sbm_.choleskyPartial(numFrontals()); +void MultifrontalClique::addDiagonalDamping(double lambda, double minDiagonal, + double maxDiagonal) { + const size_t nf = numFrontals(); + for (size_t j = 0; j < nf; ++j) { + const Vector scaled = + lambda * sbm_.diagonal(j).cwiseMax(minDiagonal).cwiseMin(maxDiagonal); + sbm_.addToDiagonalBlock(j, scaled); + } +} + +void MultifrontalClique::eliminateInPlace() { + prepareForElimination(); + factorize(); } void MultifrontalClique::updateParent(MultifrontalClique& parent) const { - // Expose only the separator+RHS view when contributing to the parent. - assert(sbm_.blockStart() == 0); - sbm_.blockStart() = numFrontals(); - assert(sbm_.nBlocks() == parentIndices_.size()); - parent.sbm_.updateFromMappedBlocks(sbm_, parentIndices_); - sbm_.blockStart() = 0; + if (useQR()) { + // Accumulate separator (and RHS) normal equations (S^T S) into the parent. + assert(RSdReady_); + assert(RSd_.firstBlock() == 0); + assert(parent.sbm_.nBlocks() > 0); + const DenseIndex nfBlocks = static_cast(numFrontals()); + RSd_.firstBlock() = nfBlocks; + parent.sbm_.updateFromOuterProductBlocks(RSd_, parentIndices_); + RSd_.firstBlock() = 0; + } else { + // Accumulate the S^T S part from this clique's SBM into the parent. + assert(sbm_.nBlocks() > 0 && sbm_.blockStart() == 0); + sbm_.blockStart() = numFrontals(); + parent.sbm_.updateFromMappedBlocks(sbm_, parentIndices_); + sbm_.blockStart() = 0; + } } std::shared_ptr MultifrontalClique::conditional() const { - const KeyVector& keys = orderedKeys_; - assert(sbm_.blockStart() == 0); - VerticalBlockMatrix Ab = sbm_.split(numFrontals()); - sbm_.blockStart() = 0; // Split sets it to numFrontals(), reset to 0. - return std::make_shared(keys, numFrontals(), - std::move(Ab)); + assert(RSdReady_); + // RSd_ is cached at elimination time. + return std::make_shared(orderedKeys_, numFrontals(), + RSd_); } // Solve with block back-substitution on the Cholesky-stored SBM. void MultifrontalClique::updateSolution() const { + assert(RSdReady_); + // Use cached [R S d] for fast back-substitution. const size_t nf = numFrontals(); - const size_t n = sbm_.nBlocks() - 1; // # frontals + # separators + const size_t n = RSd_.nBlocks() - 1; // # frontals + # separators // The in-place factorization yields an upper-triangular system [R S d]: // R * x_f + S * x_s = d, // with x_f the frontals and x_s the separators. - const auto R = sbm_.triangularView(0, nf); - const auto S = sbm_.aboveDiagonalRange(0, nf, nf, n); - const auto d = sbm_.aboveDiagonalRange(0, nf, n, n + 1); + const auto R = RSd_.range(0, nf).triangularView(); + const auto S = RSd_.range(nf, n); + const auto d = RSd_.range(n, n + 1); // We first solve rhs = d - S * x_s rhsScratch_.noalias() = d; diff --git a/gtsam/linear/MultifrontalClique.h b/gtsam/linear/MultifrontalClique.h index bc5c5f527d..f9e6c3029d 100644 --- a/gtsam/linear/MultifrontalClique.h +++ b/gtsam/linear/MultifrontalClique.h @@ -43,12 +43,13 @@ using KeyDimMap = std::map; namespace internal { -/// Helper class to track original factor indices. +/// Helper class to track original factor indices and row counts. class IndexedSymbolicFactor : public SymbolicFactor { public: size_t index_; - IndexedSymbolicFactor(const KeyVector& keys, size_t index) - : SymbolicFactor(), index_(index) { + size_t rows_; + IndexedSymbolicFactor(const KeyVector& keys, size_t index, size_t rows) + : SymbolicFactor(), index_(index), rows_(rows) { keys_ = keys; } }; @@ -89,15 +90,14 @@ class GTSAM_EXPORT MultifrontalClique { /// @param frontals Frontal keys for this clique. /// @param separatorKeys Separator keys for this clique. /// @param dims Key->dimension map. - /// @param graph Factor graph for sizing and constraints. + /// @param vbmRows Number of rows needed for the vertical block matrix. /// @param solution Solution storage for cached pointers. /// @param fixedKeys Keys fixed to zero by constraints (may be null). explicit MultifrontalClique(std::vector factorIndices, const std::weak_ptr& parent, const KeyVector& frontals, const KeySet& separatorKeys, - const KeyDimMap& dims, - const GaussianFactorGraph& graph, + const KeyDimMap& dims, size_t vbmRows, VectorValues* solution, const std::unordered_set* fixedKeys); @@ -107,10 +107,31 @@ class GTSAM_EXPORT MultifrontalClique { /// Cache the children list and compute parent indices. void finalize(std::vector children); - /// Load factor values into the pre-allocated Ab matrix and Hessians into - /// sbm_. + /// Load factor values into the pre-allocated Ab matrix. /// @param graph The factor graph with updated values. void fillAb(const GaussianFactorGraph& graph); + + /// Zero out sbm, re-add Hessians, accumulate Jacobians and children. + void prepareForElimination(); + + /// Perform Cholesky factorization on the frontal block. + void factorize(); + + /** + * Add identity damping to the frontal block. + * @param lambda Damping factor + */ + void addIdentityDamping(double lambda); + + /** + * Add diagonal damping to the frontal block. + * @param lambda Damping factor + * @param minDiagonal Minimum diagonal value + * @param maxDiagonal Maximum diagonal value + */ + void addDiagonalDamping(double lambda, double minDiagonal, + double maxDiagonal); + /// @} /// @name Read-only methods @@ -130,18 +151,11 @@ class GTSAM_EXPORT MultifrontalClique { /// Get the vertical block matrix Ab. const VerticalBlockMatrix& Ab() const { return Ab_; } - /// Get the symmetric block matrix (mutable). - SymmetricBlockMatrix& sbm() { return sbm_; } - /// Get the symmetric block matrix (const). const SymmetricBlockMatrix& sbm() const { return sbm_; } - /** - * Count rows needed for the vertical block matrix. - * @param graph The factor graph. - * @return Total number of rows. - */ - size_t countRows(const GaussianFactorGraph& graph) const; + /// Check if this clique is using QR elimination. + bool useQR() const { return solveMode_ == SolveMode::QrLeaf; } /** * Print this clique. @@ -185,6 +199,8 @@ class GTSAM_EXPORT MultifrontalClique { const MultifrontalClique& clique); private: + enum class SolveMode { Cholesky, QrLeaf }; + /// Cache pointers to frontal and separator update vectors. void cacheSolutionPointers(VectorValues* delta, const KeyVector& frontals, const KeySet& separatorKeys); @@ -205,6 +221,9 @@ class GTSAM_EXPORT MultifrontalClique { void initializeMatrices(const std::vector& blockDims, size_t totalNumRows); + /// Allocate the symmetric block matrix if needed. + void allocateSbm(); + /** * Add a Jacobian factor's contributions into the Ab matrix. * @return Number of rows added. @@ -217,20 +236,33 @@ class GTSAM_EXPORT MultifrontalClique { void setParentIndices(const std::vector& indices) { parentIndices_ = indices; } - VerticalBlockMatrix Ab_; - mutable SymmetricBlockMatrix sbm_; - mutable Vector rhsScratch_; ///< Cached RHS workspace for back-substitution. - mutable Vector - separatorScratch_; ///< Cached separator stack for back-substitution. - + // Build-time metadata. std::vector factorIndices_; KeyVector orderedKeys_; ///< Keys ordered by block index (frontals+seps). const std::unordered_set* fixedKeys_ = nullptr; + std::vector blockDims_; std::vector parentIndices_; ///< Parent block indices for separators + RHS. std::vector frontalPtrs_; ///< Pointers into solution frontals. std::vector separatorPtrs_; ///< Pointers into solution separator. + + // Load-time state. + VerticalBlockMatrix Ab_; + std::vector hessianFactors_; ///< Hessian factors. + SolveMode solveMode_ = SolveMode::Cholesky; + + // Elimination-time state. + mutable SymmetricBlockMatrix sbm_; + mutable VerticalBlockMatrix RSd_; ///< Cached [R S d] from elimination. + mutable bool RSdReady_ = false; + + // Solve-time scratch space. + mutable Vector rhsScratch_; ///< Cached RHS workspace for back-substitution. + mutable Vector + separatorScratch_; ///< Cached separator stack for back-substitution. + + static constexpr double kQrAspectRatio = 2.0; }; std::ostream& operator<<(std::ostream& os, const MultifrontalClique& clique); diff --git a/gtsam/linear/MultifrontalSolver.cpp b/gtsam/linear/MultifrontalSolver.cpp index 6cf6b2d459..ae84dce926 100644 --- a/gtsam/linear/MultifrontalSolver.cpp +++ b/gtsam/linear/MultifrontalSolver.cpp @@ -137,6 +137,15 @@ std::map computeDims(const GaussianFactorGraph& graph) { return dims; } +size_t factorRowCount(const GaussianFactorGraph& graph, size_t index) { + if (index >= graph.size() || !graph[index]) return 0; + if (auto jacobianFactor = + std::dynamic_pointer_cast(graph[index])) { + return jacobianFactor->rows(); + } + return 0; +} + // Build SymbolicFactorGraph from GaussianFactorGraph SymbolicFactorGraph buildSymbolicGraph( const GaussianFactorGraph& graph, @@ -154,7 +163,8 @@ SymbolicFactorGraph buildSymbolicGraph( } // Skip factors that are fully constrained away. if (keys.empty()) continue; - symbolicGraph.emplace_shared(keys, i); + symbolicGraph.emplace_shared( + keys, i, factorRowCount(graph, i)); } return symbolicGraph; } @@ -238,19 +248,6 @@ void reportStructure(const SymbolicJunctionTree& junctionTree, stats.report(name, reportStream); } -std::vector factorIndicesForSymbolicCluster( - const SymbolicJunctionTree::sharedNode& cluster) { - std::vector indices; - indices.reserve(cluster->factors.size()); - for (const auto& factor : cluster->factors) { - assert(factor); - auto indexed = - std::static_pointer_cast(factor); - indices.push_back(indexed->index_); - } - return indices; -} - struct BuiltClique { MultifrontalSolver::CliquePtr clique; KeySet separatorKeys; @@ -259,7 +256,6 @@ struct BuiltClique { // Build cliques from a symbolic junction tree and wire parent/child metadata. struct CliqueBuilder { const std::map& dims; - const GaussianFactorGraph& graph; VectorValues* solution; std::vector* cliques; const std::unordered_set* fixedKeys; @@ -272,11 +268,21 @@ struct CliqueBuilder { // Gather symbolic metadata for this clique. const KeyVector& frontals = cluster->orderedFrontalKeys; const KeySet& separatorKeys = cluster->separatorKeys(&separatorCache); + std::vector factorIndices; + factorIndices.reserve(cluster->factors.size()); + size_t vbmRows = 0; + for (const auto& factor : cluster->factors) { + assert(factor); + auto indexed = + std::static_pointer_cast(factor); + factorIndices.push_back(indexed->index_); + vbmRows += indexed->rows_; + } // Create the clique node and cache static structure. auto clique = std::make_shared( - factorIndicesForSymbolicCluster(cluster), parent, frontals, - separatorKeys, dims, graph, solution, fixedKeys); + std::move(factorIndices), parent, frontals, separatorKeys, dims, + vbmRows, solution, fixedKeys); // Build children and collect separator keys. std::vector childInfos; @@ -300,21 +306,23 @@ struct CliqueBuilder { /* ************************************************************************* */ MultifrontalSolver::MultifrontalSolver(const GaussianFactorGraph& graph, + const Ordering& ordering, + size_t mergeDimCap, + std::ostream* reportStream) + : MultifrontalSolver(Precompute(graph, ordering), ordering, mergeDimCap, + reportStream) {} + +/* ************************************************************************* */ +MultifrontalSolver::MultifrontalSolver(PrecomputedData data, const Ordering& ordering, size_t mergeDimCap, std::ostream* reportStream) { - // Pre-compute variable dimensions and fixed keys. - dims_ = computeDims(graph); - fixedKeys_ = collectFixedKeys(graph); + dims_ = std::move(data.dims); + fixedKeys_ = std::move(data.fixedKeys); // Seed the cached solution with zero vectors for all variables. - Ordering reducedOrdering; - reducedOrdering.reserve(ordering.size()); for (Key key : ordering) { solution_.insert(key, Vector::Zero(dims_.at(key))); - if (!fixedKeys_.count(key)) { - reducedOrdering.push_back(key); - } } for (Key key : fixedKeys_) { if (!solution_.exists(key)) { @@ -322,29 +330,22 @@ MultifrontalSolver::MultifrontalSolver(const GaussianFactorGraph& graph, } } - // Convert to SymbolicFactorGraph to build the elimination tree - SymbolicFactorGraph symbolicGraph = buildSymbolicGraph(graph, fixedKeys_); - - // Build SymbolicEliminationTree and then SymbolicJunctionTree - SymbolicEliminationTree eliminationTree(symbolicGraph, reducedOrdering); - SymbolicJunctionTree junctionTree(eliminationTree); - // Report the symbolic structure before any merge. - reportStructure(junctionTree, dims_, "Symbolic cluster structure", + reportStructure(data.junctionTree, dims_, "Symbolic cluster structure", reportStream); // If applicable, merge small child cliques bottom-up. if (mergeDimCap > 0) { - for (const auto& rootCluster : junctionTree.roots()) { + for (const auto& rootCluster : data.junctionTree.roots()) { mergeSmallClusters(rootCluster, dims_, mergeDimCap); } - reportStructure(junctionTree, dims_, "Clique structure after merge", + reportStructure(data.junctionTree, dims_, "Clique structure after merge", reportStream); } // Build the actual MultifrontalClique structure. - CliqueBuilder builder{dims_, graph, &solution_, &cliques_, &fixedKeys_}; - for (const auto& rootCluster : junctionTree.roots()) { + CliqueBuilder builder{dims_, &solution_, &cliques_, &fixedKeys_}; + for (const auto& rootCluster : data.junctionTree.roots()) { if (rootCluster) { roots_.push_back( builder.build(rootCluster, std::weak_ptr()) @@ -352,8 +353,29 @@ MultifrontalSolver::MultifrontalSolver(const GaussianFactorGraph& graph, } } - // Load initial numerical values after the structure is built. - load(graph); + // Caller is responsible for loading numerical values via load(). +} + +/* ************************************************************************* */ +MultifrontalSolver::PrecomputedData MultifrontalSolver::Precompute( + const GaussianFactorGraph& graph, const Ordering& ordering) { + auto dims = computeDims(graph); + auto fixedKeys = collectFixedKeys(graph); + + Ordering reducedOrdering; + reducedOrdering.reserve(ordering.size()); + for (Key key : ordering) { + if (!fixedKeys.count(key)) { + reducedOrdering.push_back(key); + } + } + + SymbolicFactorGraph symbolicGraph = buildSymbolicGraph(graph, fixedKeys); + SymbolicEliminationTree eliminationTree(symbolicGraph, reducedOrdering); + SymbolicJunctionTree junctionTree(eliminationTree); + + return MultifrontalSolver::PrecomputedData{ + std::move(dims), std::move(fixedKeys), std::move(junctionTree)}; } /* ************************************************************************* */ @@ -361,10 +383,18 @@ void MultifrontalSolver::load(const GaussianFactorGraph& graph) { for (auto& clique : cliques_) { clique->fillAb(graph); } + loaded_ = true; + eliminated_ = false; } /* ************************************************************************* */ void MultifrontalSolver::eliminateInPlace() { + if (!loaded_) { + throw std::runtime_error( + "MultifrontalSolver::eliminateInPlace: load() must be called before " + "eliminating."); + } + eliminated_ = false; // Parallel elimination uses PostOrderForestParallel, which will be // multi-threaded if GTSAM was compiled with TBB. TbbOpenMPMixedScope threadLimiter; @@ -372,10 +402,26 @@ void MultifrontalSolver::eliminateInPlace() { if (node) node->eliminateInPlace(); }; treeTraversal::PostOrderForestParallel(*this, visitorPost, 10); + eliminated_ = true; +} + +/* ************************************************************************* */ +void MultifrontalSolver::eliminateInPlace(const GaussianFactorGraph& graph) { + // Combine load + eliminate in one post-order traversal to improve locality. + TbbOpenMPMixedScope threadLimiter; + auto visitorPost = [&graph](const CliquePtr& node) { + if (!node) return; + node->fillAb(graph); + node->eliminateInPlace(); + }; + treeTraversal::PostOrderForestParallel(*this, visitorPost, 10); + loaded_ = true; + eliminated_ = true; } /* ************************************************************************* */ GaussianBayesTree MultifrontalSolver::computeBayesTree() const { + assert(loaded_ && eliminated_); GaussianBayesTree bayesTree; using Clique = GaussianBayesTreeClique; using BayesCliquePtr = GaussianBayesTree::sharedClique; @@ -417,6 +463,7 @@ GaussianBayesTree MultifrontalSolver::computeBayesTree() const { /* ************************************************************************* */ const VectorValues& MultifrontalSolver::updateSolution() const { + assert(loaded_ && eliminated_); // Parallel solve uses treeTraversal::DepthFirstForestParallel (Pre-order / // Top-Down). TbbOpenMPMixedScope threadLimiter; @@ -427,11 +474,16 @@ const VectorValues& MultifrontalSolver::updateSolution() const { }; auto visitorPost = [](const CliquePtr&, int) {}; + // Threshold chosen to balance task overhead vs parallelism. Small cliques + // (like points in SFM) are better handled sequentially within larger tasks + // to minimize scheduling overhead and maximize cache locality. + constexpr int kSolutionParallelThreshold = 4096; + // Cast to non-const because treeTraversal expects a non-const reference, // even though we are only calling const methods on the nodes. treeTraversal::DepthFirstForestParallel( const_cast(*this), rootData, visitorPre, visitorPost, - 10); + kSolutionParallelThreshold); for (Key key : fixedKeys_) { solution_.at(key).setZero(); diff --git a/gtsam/linear/MultifrontalSolver.h b/gtsam/linear/MultifrontalSolver.h index 3880daea7a..57e2291304 100644 --- a/gtsam/linear/MultifrontalSolver.h +++ b/gtsam/linear/MultifrontalSolver.h @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -43,22 +44,31 @@ class MultifrontalClique; */ class GTSAM_EXPORT MultifrontalSolver { public: + struct PrecomputedData { + std::map dims; ///< Map from variable key to dimension. + std::unordered_set fixedKeys; ///< Keys fixed by constrained factors. + SymbolicJunctionTree junctionTree; ///< Precomputed symbolic junction tree. + }; + /// Shared pointer to a MultifrontalClique. using CliquePtr = std::shared_ptr; /// Node type for tree traversal utilities. using Node = MultifrontalClique; - private: + protected: std::vector roots_; ///< Roots of the elimination tree. std::vector cliques_; ///< All cliques in the solver. std::map dims_; ///< Map from variable key to dimension. mutable VectorValues solution_; ///< Cached solution vector. std::unordered_set fixedKeys_; ///< Keys fixed by constrained factors. + bool loaded_ = false; ///< Whether load() has been called. + bool eliminated_ = false; ///< Whether eliminateInPlace() ran. public: /** * Construct the solver from a factor graph and an ordering. * This builds the symbolic junction tree and pre-allocates all matrices. + * Call load() before eliminating to populate numerical values. * @param graph The factor graph to solve. * @param ordering The variable ordering to use for elimination. * @param mergeDimCap Merge a child if its frontal dimension plus the @@ -70,6 +80,24 @@ class GTSAM_EXPORT MultifrontalSolver { size_t mergeDimCap = 0, std::ostream* reportStream = nullptr); + /** + * Construct the solver from precomputed symbolic data. + * Call load() before eliminating to populate numerical values. + * @param data Precomputed symbolic structure and sizing data. + * @param ordering The variable ordering to use for seeding solution storage. + * @param mergeDimCap Merge a child if its frontal dimension plus the + * parent's total dimension is below this threshold (0 disables merging). + * @param reportStream Optional stream to report clique structure stats + * (frontals, separators, total dims, and children). + */ + MultifrontalSolver(PrecomputedData data, const Ordering& ordering, + size_t mergeDimCap = 0, + std::ostream* reportStream = nullptr); + + /// Precompute symbolic structure and sizing data from a factor graph. + static PrecomputedData Precompute(const GaussianFactorGraph& graph, + const Ordering& ordering); + /** * Load new numerical values from the factor graph. * This overwrites the values in the pre-allocated matrices. @@ -84,6 +112,12 @@ class GTSAM_EXPORT MultifrontalSolver { */ void eliminateInPlace(); + /** + * Load and eliminate the graph in a single traversal. + * This calls fillAb() and eliminateInPlace() per clique in post-order. + */ + void eliminateInPlace(const GaussianFactorGraph& graph); + /** * Compute a Bayes tree from the in-place Cholesky factorization. * Requires eliminateInPlace() to have been called beforehand. diff --git a/gtsam/linear/tests/testMultifrontalSolver.cpp b/gtsam/linear/tests/testMultifrontalSolver.cpp index afe5d8e7a8..e8708fff7e 100644 --- a/gtsam/linear/tests/testMultifrontalSolver.cpp +++ b/gtsam/linear/tests/testMultifrontalSolver.cpp @@ -50,9 +50,10 @@ const Ordering chainOrdering{x2, x1, x3, x4}; } // namespace /* ************************************************************************* */ -// Build the solver and validate initial structure and load. +// Build the solver and validate initial structure and explicit load. TEST(MultifrontalSolver, Constructor) { MultifrontalSolver solver(chain, chainOrdering); + solver.load(chain); // Verify roots EXPECT(solver.roots().size() == 1); @@ -78,6 +79,33 @@ TEST(MultifrontalSolver, Constructor) { EXPECT(assert_equal((Matrix(2, 1) << 2., 1.).finished(), Ab)); } +/* ************************************************************************* */ +// Build the solver from precomputed data and validate structure and load. +TEST(MultifrontalSolver, ConstructorPrecomputed) { + auto data = MultifrontalSolver::Precompute(chain, chainOrdering); + MultifrontalSolver solver(std::move(data), chainOrdering); + solver.load(chain); + + // Verify roots + EXPECT(solver.roots().size() == 1); + auto root = solver.roots()[0]; + EXPECT(root != nullptr); + + // Root should have 1 child {x2, x1} + EXPECT_LONGS_EQUAL(1, root->children.size()); + auto childClique = root->children[0]; + + // Verify matrices in leaf (childClique) + CHECK(childClique->useQR() == false); + EXPECT_LONGS_EQUAL(4, childClique->sbm().nBlocks()); + EXPECT_LONGS_EQUAL(2, childClique->Ab().rows()); + EXPECT_LONGS_EQUAL(4, childClique->Ab().nBlocks()); + + // Verify load for childClique + Matrix A0 = childClique->Ab()(0); + EXPECT(assert_equal((Matrix(2, 1) << 2., 1.).finished(), A0)); +} + /* ************************************************************************* */ // Reload numerical values and ensure Ab updates match whitening. TEST(MultifrontalSolver, Load) { @@ -110,6 +138,7 @@ TEST(MultifrontalSolver, Load) { // Compare solver output against multifrontal elimination baseline. TEST(MultifrontalSolver, Eliminate) { MultifrontalSolver solver(chain, chainOrdering); + solver.load(chain); solver.eliminateInPlace(); // Solve @@ -122,10 +151,25 @@ TEST(MultifrontalSolver, Eliminate) { EXPECT(assert_equal(expected, actual, 1e-9)); } +/* ************************************************************************* */ +// Load + eliminate in one traversal matches standard elimination. +TEST(MultifrontalSolver, EliminateWithLoad) { + MultifrontalSolver solver(chain, chainOrdering); + solver.eliminateInPlace(chain); + + const VectorValues& actual = solver.updateSolution(); + + GaussianBayesTree expectedBT = *chain.eliminateMultifrontal(chainOrdering); + VectorValues expected = expectedBT.optimize(); + + EXPECT(assert_equal(expected, actual, 1e-9)); +} + /* ************************************************************************* */ // Compare marginals from in-place Bayes tree against standard elimination. TEST(MultifrontalSolver, ComputeBayesTreeMarginals) { MultifrontalSolver solver(chain, chainOrdering); + solver.load(chain); solver.eliminateInPlace(); GaussianBayesTree actualBT = solver.computeBayesTree(); @@ -161,6 +205,7 @@ TEST(MultifrontalSolver, ComputeBayesTreeMarginalsConstrainedChain) { x2, I_1x1, (Vector(1) << 0.0).finished(), hardConstraint); MultifrontalSolver solver(constrainedChain, chainOrdering); + solver.load(constrainedChain); solver.eliminateInPlace(); GaussianBayesTree actualBT = solver.computeBayesTree(); @@ -189,6 +234,7 @@ TEST(MultifrontalSolver, ConstrainedNoiseFeasible) { const Ordering ordering{x1}; MultifrontalSolver solver(graph, ordering); + solver.load(graph); solver.eliminateInPlace(); const VectorValues& actual = solver.updateSolution(); @@ -225,6 +271,7 @@ TEST(MultifrontalSolver, ConstrainedNoiseUnaryFeasible) { const Ordering ordering{x1}; MultifrontalSolver solver(graph, ordering); + solver.load(graph); solver.eliminateInPlace(); const VectorValues& actual = solver.updateSolution(); @@ -261,6 +308,7 @@ TEST(MultifrontalSolver, WeightedScalarMeasurements) { const Ordering ordering{x1}; MultifrontalSolver solver(graph, ordering); + solver.load(graph); solver.eliminateInPlace(); const VectorValues& actual = solver.updateSolution(); @@ -276,6 +324,7 @@ TEST(MultifrontalSolver, HessianFactors) { const Ordering ordering{x1}; MultifrontalSolver solver(graph, ordering); + solver.load(graph); solver.eliminateInPlace(); const VectorValues& actual = solver.updateSolution(); @@ -304,6 +353,7 @@ TEST(MultifrontalSolver, BalancedSmoother) { const Ordering ordering{X(1), X(3), X(5), X(7), X(2), X(6), X(4)}; MultifrontalSolver solver(smoother, ordering); + solver.load(smoother); // Verify roots EXPECT(solver.roots().size() == 1); diff --git a/timing/timeMultifrontalSolver.cpp b/timing/timeMultifrontalSolver.cpp index 598c87873b..5428a7a709 100644 --- a/timing/timeMultifrontalSolver.cpp +++ b/timing/timeMultifrontalSolver.cpp @@ -49,64 +49,78 @@ static void runStandardSolver(const GaussianFactorGraph& smoother, /// Run new MultifrontalSolver elimination and optimization. static void runMultifrontalSolver(MultifrontalSolver& solver, - const GaussianFactorGraph& smoother, + const GaussianFactorGraph& graph, size_t iterations) { for (size_t i = 0; i < iterations; ++i) { - if (i > 0) solver.load(smoother); - solver.eliminateInPlace(); + solver.eliminateInPlace(graph); const VectorValues& solution = solver.updateSolution(); (void)solution; } } -int main() { - cout << "Merging dim cap " << kMergeDimCap << std::endl; +namespace { +const std::string bal135 = findExampleDataFile("dubrovnik-135-90642-pre.txt"); +} + +void runBAL135Benchmark() { + const size_t iterations = 1; + cout << "\nSingle MFS test: " << bal135 << " (iterations=" << iterations + << ")" << std::endl; + + const SfmData db = SfmData::FromBalFile(bal135); + const NonlinearFactorGraph graph = buildGeneralSfmGraph(db, 0.1); + const Values initial = buildGeneralSfmInitial(db); + const GaussianFactorGraph linear = *graph.linearize(initial); + const Ordering ordering = Ordering::Metis(linear); + + MultifrontalSolver solver(linear, ordering, kMergeDimCap, nullptr); + auto start = std::chrono::high_resolution_clock::now(); + runMultifrontalSolver(solver, linear, iterations); + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration t_imperative = end - start; + cout << " MultifrontalSolver: " << t_imperative.count() << " s" << std::endl; + tictoc_print(); +} - { - const size_t bal_iterations = 5; - const string bal16 = findExampleDataFile("dubrovnik-16-22106-pre"); - const string bal88 = findExampleDataFile("dubrovnik-88-64298-pre"); - for (const auto& filename : {bal16, bal88}) { - cout << "\nProcessing BAL file: " << filename << std::endl; - const SfmData db = SfmData::FromBalFile(filename); - const NonlinearFactorGraph graph = buildGeneralSfmGraph(db, 0.1); - const Values initial = buildGeneralSfmInitial(db); - const GaussianFactorGraph linear = *graph.linearize(initial); - - const std::vector> orderings = { - {"Burn", createSchurOrdering(db, false)}, - {"Metis", Ordering::Metis(linear)}, - {"Schur", createSchurOrdering(db, false)}, - {"Colamd", Ordering::Colamd(linear)}, - }; - - for (const auto& [label, ordering] : orderings) { - cout << "\nBAL Benchmark (" << label - << ", iterations=" << bal_iterations << "):" << std::endl; - - auto start = std::chrono::high_resolution_clock::now(); - MultifrontalSolver solver(linear, ordering, kMergeDimCap, nullptr); - runMultifrontalSolver(solver, linear, bal_iterations); - auto end = std::chrono::high_resolution_clock::now(); - std::chrono::duration t_imperative = end - start; - cout << " MultifrontalSolver: " << t_imperative.count() << " s" - << std::endl; - - start = std::chrono::high_resolution_clock::now(); - runStandardSolver(linear, ordering, bal_iterations); - end = std::chrono::high_resolution_clock::now(); - std::chrono::duration t_standard = end - start; - cout << " Standard GTSAM: " << t_standard.count() << " s" - << std::endl; - - cout << " Speedup: " - << t_standard.count() / t_imperative.count() << "x" << std::endl; - } +void runBALBenchmark() { + const size_t bal_iterations = 2; + const string bal16 = findExampleDataFile("dubrovnik-16-22106-pre"); + const string bal88 = findExampleDataFile("dubrovnik-88-64298-pre"); + for (const auto& filename : {bal16, bal88, bal135}) { + cout << "\nProcessing BAL file: " << filename << std::endl; + const SfmData db = SfmData::FromBalFile(filename); + const NonlinearFactorGraph graph = buildGeneralSfmGraph(db, 0.1); + const Values initial = buildGeneralSfmInitial(db); + const GaussianFactorGraph linear = *graph.linearize(initial); + + auto orderings = createOrderings(db, linear); + for (const auto& [label, ordering] : orderings) { + cout << "\nBAL Benchmark (" << label << ", iterations=" << bal_iterations + << "):" << std::endl; + + MultifrontalSolver solver(linear, ordering, kMergeDimCap, nullptr); + auto start = std::chrono::high_resolution_clock::now(); + runMultifrontalSolver(solver, linear, bal_iterations); + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration t_imperative = end - start; + cout << " MultifrontalSolver: " << t_imperative.count() << " s" + << std::endl; + + start = std::chrono::high_resolution_clock::now(); + runStandardSolver(linear, ordering, bal_iterations); + end = std::chrono::high_resolution_clock::now(); + std::chrono::duration t_standard = end - start; + cout << " Standard GTSAM: " << t_standard.count() << " s" + << std::endl; + + cout << " Speedup: " + << t_standard.count() / t_imperative.count() << "x" << std::endl; } } - +} +void runChainBenchmark() { const std::vector T_values = {10, 50, 100, 500, 1000, 5000}; - const size_t iterations = 1000; + const size_t iterations = 500; for (size_t T : T_values) { cout << "\nBenchmark (T=" << T << ", iterations=" << iterations @@ -132,5 +146,12 @@ int main() { cout << " Speedup: " << t_standard.count() / t_imperative.count() << "x" << std::endl; } +} +int main() { + cout << "Merging dim cap " << kMergeDimCap << std::endl; + + runBAL135Benchmark(); + runBALBenchmark(); + runChainBenchmark(); return 0; } diff --git a/timing/timeSFMBAL.h b/timing/timeSFMBAL.h index f83cca9f80..74cc13e6e7 100644 --- a/timing/timeSFMBAL.h +++ b/timing/timeSFMBAL.h @@ -123,6 +123,16 @@ inline Ordering createSchurOrdering(const SfmData& db, return ordering; } +inline std::vector> createOrderings( + const SfmData& db, const GaussianFactorGraph& linear) { + return { + {"Burn", createSchurOrdering(db, false)}, + {"Metis", Ordering::Metis(linear)}, + {"Schur", createSchurOrdering(db, false)}, + {"Colamd", Ordering::Colamd(linear)}, + }; +} + // Create ordering and optimize int optimize(const SfmData& db, const NonlinearFactorGraph& graph, const Values& initial, bool separateCalibration = false) { @@ -132,8 +142,8 @@ int optimize(const SfmData& db, const NonlinearFactorGraph& graph, LevenbergMarquardtParams params; LevenbergMarquardtParams::SetCeresDefaults(¶ms); // params.setLinearSolverType("SEQUENTIAL_CHOLESKY"); - params.setVerbosityLM("SUMMARY"); - params.setRelativeErrorTol(0.01); // 1% relative error tol + params.setVerbosityLM("SUMMARY"); + params.setRelativeErrorTol(0.01); // 1% relative error tol if (gUseSchur) { // Create Schur-complement ordering