switch to solves for Jbuild

brianz98 · brianz98 · commit ca71c00fe09a · 2026-03-12T17:30:07.000-04:00
diff --git a/docs/technical_notes/fock_build.tex b/docs/technical_notes/fock_build.tex
@@ -51,7 +51,7 @@ \section{Introduction}
 B^{P}_{\mu \nu} = \sum_Q (P|Q)^{-1/2} (Q|\mu \nu).
 \end{equation}
 
-The matrix inverse and inverse square root of the Coulomb metric $A\equiv (P|Q)$ can be computed by Cholesky decomposition:
+The matrix inverse square root of the Coulomb metric $A\equiv (P|Q)$ can be computed by Cholesky decomposition:
 \begin{equation}
 A = L L^T,
 \end{equation}
@@ -60,12 +60,7 @@ \section{Introduction}
 \label{eq:inv}
 A^{-1} = L^{-T} L^{-1},
 \end{equation}
-which can be obtained by two triangular solves (\textit{e.g.}, with \texttt{scipy.linalg.solve\_triangular}):
-\begin{equation}
-Lx = I, \quad L^T y = x
-\end{equation}
-where $x = L^{-1}$ and $y = A^{-1}$.
-From \cref{eq:inv}, we can also identify $L^{-1}$ as a valid choice for $A^{-1/2}$ in our context, since $L^{-1}_{PR}(R|\rho \sigma) = B^{P}_{\rho \sigma}$, and the contraction of the B tensors yields the original four-center integrals:
+From \cref{eq:inv}, we can identify $L^{-1}$ as a valid choice for $A^{-1/2}$ in our context, since $L^{-1}_{PR}(R|\rho \sigma) = B^{P}_{\rho \sigma}$, and the contraction of the B tensors yields the original four-center integrals:
 \begin{align}
 \sum_{P} B^{P}_{\mu \nu} B^{P}_{\rho \sigma} 
 & = \sum_{PQR}(\mu \nu | P) L^{-\text{T}}_{PQ} L^{-1}_{QR} (R|\rho \sigma) \\
@@ -109,6 +104,7 @@ \section{Coulomb matrix algorithm}
 
     \item Matrix vector multiplication
 \begin{equation}
+\label{eq:bp}
 b_P = \sum_{Q} (P|Q)^{-1} a_Q
 \end{equation}
     \item Second pass over all the three-center integrals
@@ -118,6 +114,23 @@ \section{Coulomb matrix algorithm}
 In this step, one can apply integral screening of $(\mu \nu | P)$ to reduce the evaluation cost.
 \end{enumerate}
 
+Note that \cref{eq:bp} requires applying the inverse of the Coulomb metric to a vector.
+There are at least four, in principle equivalent, ways to do this:
+\begin{enumerate}
+    \item Use the pre-computed $L^{-1}$ to form $A^{-1}=L^{-T} L^{-1}$ and do a matrix-vector product with the problem vector: $b_P = A^{-1} a_Q$.
+    \item Two triangular solves with $L$ on the problem vector $a_Q$: first solve $L z = a_Q$ for $z$ and then solve $L^T b_P = z$ for $b_P$.
+    \item Compute the eigen-decomposition of the Coulomb metric $A = U s U^T$ and threshold the eigenvalues to get $A_{\eta}^{-1} = U_{\eta} s_{\eta}^{-1} U_{\eta}^T$ where $s_{\eta}$ are the eigenvalues above some threshold $\eta$ and $U_{\eta}$ are the corresponding eigenvectors. Then do a matrix-vector product with the problem vector: $b_P = A_{\eta}^{-1}a_Q$.
+    \item Compute the same $U_{\eta}$ and $s_{\eta}$ as above, but apply them to each problem vector without forming $A_{\eta}^{-1}$: $b_P = U_{\eta} \left( s_{\eta}^{-1} \left( U_{\eta}^T a_Q \right) \right)$.
+\end{enumerate}
+The first two methods cannot deal with Coulomb metrics that have large condition numbers, because the Cholesky decomposition itself breaks down when $A$ is numerically singular. The first and third methods seem appealing at first, since we pre-compute a `universal' $A^{-1}$ once and only do matrix-vector products later.
+It turns out, however, that forming $A^{-1}$ explicitly, as in the first and third methods, is not numerically stable: the error of these approaches is on the order of $\text{cond}(A)^2\epsilon$, whereas the error of the second and fourth methods is on the order of $\text{cond}(A)\epsilon$.
+
+If ill-conditioning is not a problem, then \texttt{scipy} offers a very succinct way to carry out method 2:
+\begin{quote}
+    \texttt{bP = scipy.linalg.cho\_solve((L, True), aQ)}
+\end{quote}
+which does the two triangular solves with the Cholesky factor $L$ in one call.
+
 \section{Exchange matrix algorithm}
 
 The exchange matrix can be computed efficiently as
diff --git a/forte2/jkbuilder/jkbuilder.py b/forte2/jkbuilder/jkbuilder.py
@@ -11,6 +11,7 @@
     cholesky_wrapper,
     invsqrt_matrix,
     print_metric_info,
+    _eigh_metric_kernel,
 )
 
 
@@ -622,22 +623,27 @@ def _build_metric(self):
         # M = (P|Q)
         M = integrals.coulomb_2c(self.system, self.auxbasis)
         if self.metric_ortho_rtol is not None:
-            X, _, info = invsqrt_matrix(M, rtol=self.metric_ortho_rtol)
+            _precomp = _eigh_metric_kernel(M, rtol=self.metric_ortho_rtol)
+            self.Mm12, _, info = invsqrt_matrix(M, precomp=_precomp)
             print_metric_info(info, "Density fitting Coulomb metric (P|Q)")
-            self.Mm12 = X
-            self.Mm1 = np.einsum("PQ,QR->PR", X, X, optimize=True)
+            sevals = _precomp[0]
+            sevecs = _precomp[1]
+            ndiscard = _precomp[2]["n_discarded"]
+            self.sevals = sevals[ndiscard:]
+            self.sevecs = sevecs[:, ndiscard:]
         else:
             # M = L L.T
-            L = sp.linalg.cholesky(M, lower=True)
-            # M^{-1} = L^{-T} L^{-1}
-            # two triangular solves to get M^{-1}:
-            # 1. solve L Y = I for Y = L^{-1}
-            # 2. solve L.T X = Y for X = M^{-1}
+            self.L = sp.linalg.cholesky(M, lower=True)
             I = np.eye(M.shape[0])
-            Y = sp.linalg.solve_triangular(L, I, lower=True)
-            # M^{-1/2} = L^{-1}
-            self.Mm12 = Y
-            self.Mm1 = np.einsum("QP,QR->PR", Y, Y, optimize=True)
+            self.Mm12 = sp.linalg.solve_triangular(self.L, I, lower=True)
+
+    def _apply_Mm1(self, y):
+        """Compute x = (P|Q)^{-1} y without forming (P|Q)^{-1}"""
+        if self.metric_ortho_rtol is not None:
+            # x = U s^{-1} U^T y
+            return self.sevecs @ ((self.sevecs.T @ y) / self.sevals)
+        else:
+            return sp.linalg.cho_solve((self.L, True), y)
 
     def _find_aux_shell_block(self, pshell0):
         # find the block of auxiliary shells that fit in the buffer, starting from pshell0
@@ -679,7 +685,7 @@ def _J_kernel(self, D):
                 out=bP[pb0:pb1],
             )
             pshell0 = pshell1
-        bP = self.Mm1 @ bP
+        bP = self._apply_Mm1(bP)
 
         J = np.zeros_like(D)
         # 3. Batch over the (raw) P index again to build the J matrix
@@ -864,7 +870,7 @@ def _JK_kernel(self, C):
                 optimize=True,
             )
             if J_pass == 0:
-                bP = self.Mm1 @ bP
+                bP = self._apply_Mm1(bP)
             J_pass += 1
             i0 = i1
 
diff --git a/tests/jkbuilder/test_jkbuilder.py b/tests/jkbuilder/test_jkbuilder.py
@@ -257,3 +257,38 @@ def test_jkbuilder_on_the_fly_complex():
     [J_otf], [K_otf] = fb_otf.build_JK([Cocc])
     assert np.allclose(J_otf, J_ref), np.linalg.norm(J_otf - J_ref)
     assert np.allclose(K_otf, K_ref), np.linalg.norm(K_otf - K_ref)
+
+
+def test_jkbuilder_on_the_fly_large():
+    xyz = """
+    Cr 0.0 0.0 0.0
+    Cr 0.0 0.0 2.0
+    """
+
+    system = System(
+        xyz=xyz,
+        basis_set="cc-pvtz",
+        auxiliary_basis_set="cc-pvtz-autoaux",
+        df_ortho_rtol=1e-8,
+    )
+
+    nmo = system.nbf
+    rng = np.random.default_rng(12345)
+    Cocc = rng.standard_normal((nmo, 24))
+    D = [Cocc @ Cocc.T.conj()]
+
+    fb = system.fock_builder
+    J_ref = fb.build_J(D)
+    K_ref = fb.build_K([Cocc])
+
+    fb_otf = jkbuilder.FockBuilderOTF(system, memory_threshold_mb=100)
+    J_otf = fb_otf.build_J(D)[0]
+    K_otf = fb_otf.build_K([Cocc])[0]
+
+    assert np.linalg.norm(J_otf - J_ref) < 1e-8
+    assert np.linalg.norm(K_otf - K_ref) < 1e-8
+
+    # separately test the combined JK builder, since the algorithm is different for the combined builder
+    J_otf, K_otf = fb_otf.build_JK([Cocc])
+    assert np.linalg.norm(J_otf[0] - J_ref[0]) < 1e-8
+    assert np.linalg.norm(K_otf[0] - K_ref[0]) < 1e-8
diff --git a/tests/scf/test_rhf.py b/tests/scf/test_rhf.py
@@ -108,3 +108,20 @@ def test_rhf_cl2_otf():
     scf.run()
 
     assert scf.E == approx(-918.943349338796)
+
+def test_rhf_cr2_autoaux_otf():
+    xyz = """
+    Cr 0 0 0
+    Cr 0 0 1.681
+    """
+    system = System(
+        xyz=xyz,
+        basis_set="cc-pvtz",
+        auxiliary_basis_set="cc-pvtz-autoaux",
+        memory_threshold_mb=130,
+    )
+    scf = RHF(charge=0)(system)
+    scf.run()
+
+    assert scf.E == approx(-2085.926511540127)
+test_rhf_cr2_autoaux_otf()