-    Existing JAX dispatch in jax_dispatch.py
"""

-import numba
import numpy as np
import pytensor.tensor as pt

from pytensor.graph import Apply, Op
from pytensor.link.numba.dispatch import basic as numba_basic
from pytensor.link.numba.dispatch import numba_funcify
-# Import existing ops for registration

-# Module version for tracking
-__version__ = "0.1.0"
-
-
-# NOTE: LogLike Op registration for Numba is intentionally removed
-#
-# The LogLike Op cannot be compiled with Numba due to fundamental incompatibility:
-# - LogLike uses arbitrary Python function closures (logp_func)
-# - Numba requires concrete, statically-typeable operations
-# - Function closures from PyTensor compilation cannot be analyzed by Numba
-#
-# Instead, the vectorized_logp module handles Numba mode by using scan-based
-# approaches that avoid LogLike Op entirely.
-#
-# This is documented as a known limitation in CLAUDE.md
-
-
-# @numba_funcify.register(LogLike)  # DISABLED - see note above
+# @numba_funcify.register(LogLike)  # DISABLED
def _disabled_numba_funcify_LogLike(op, node, **kwargs):
    """DISABLED: LogLike Op registration for Numba.
@@ -59,7 +40,6 @@ def _disabled_numba_funcify_LogLike(op, node, **kwargs):
    )


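The removed NOTE above is easy to demonstrate. A minimal sketch (not part of this PR; `make_loglike` and the lambdas are illustrative stand-ins): Numba's nopython mode cannot type a call to an opaque Python callable captured in a closure and raises a TypingError on first call, which is exactly the situation LogLike's `logp_func` creates.

```python
import numba
import numpy as np

def make_loglike(logp_func):
    # logp_func is an opaque Python callable captured in the closure;
    # nopython mode cannot infer its type, so compilation fails.
    @numba.njit
    def loglike(x):
        return logp_func(x)

    return loglike

try:
    make_loglike(lambda x: -0.5 * np.sum(x**2))(np.ones(3))
except Exception:
    pass  # Numba raises a TypingError when compiling the call

# An njit-compiled callable, by contrast, can be called from compiled code:
print(make_loglike(numba.njit(lambda x: -0.5 * np.sum(x**2)))(np.ones(3)))  # -1.5
```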
-# Custom Op for Numba-compatible chi matrix computation
class NumbaChiMatrixOp(Op):
    """Numba-optimized Chi matrix computation.
@@ -96,7 +76,7 @@ def make_node(self, diff):
            Computation node for chi matrix
        """
        diff = pt.as_tensor_variable(diff)
-        # Output shape: (L, N, J) - use None for dynamic dimensions
+
        output = pt.tensor(
            dtype=diff.dtype,
            shape=(None, None, self.J),  # Only J is static
@@ -118,21 +98,18 @@ def perform(self, node, inputs, outputs):
        outputs : list
            Output arrays [chi_matrix]
        """
-        diff = inputs[0]  # Shape: (L, N)
+        diff = inputs[0]
        L, N = diff.shape
        J = self.J

-        # Create output matrix
        chi_matrix = np.zeros((L, N, J), dtype=diff.dtype)

-        # Compute sliding window matrix (same logic as JAX version)
+        # Compute sliding window matrix
        for idx in range(L):
-            # For each row idx, we want the last J values of diff up to position idx
            start_idx = max(0, idx - J + 1)
            end_idx = idx + 1

-            # Get the relevant slice
-            relevant_diff = diff[start_idx:end_idx]  # Shape: (actual_length, N)
+            relevant_diff = diff[start_idx:end_idx]
            actual_length = end_idx - start_idx

            # If we have fewer than J values, pad with zeros at the beginning
@@ -142,8 +119,7 @@ def perform(self, node, inputs, outputs):
            else:
                padded_diff = relevant_diff

-            # Assign to chi matrix
-            chi_matrix[idx] = padded_diff.T  # Transpose to get (N, J)
+            chi_matrix[idx] = padded_diff.T

        outputs[0][0] = chi_matrix
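To make the padding logic concrete, here is a worked trace of the same sliding window on toy values (L=4, N=1, J=3): row `idx` holds the last `J` rows of `diff` up to `idx`, left-padded with zeros.

```python
import numpy as np

diff = np.array([[1.0], [2.0], [3.0], [4.0]])  # (L, N) = (4, 1)
L, N, J = 4, 1, 3
chi = np.zeros((L, N, J))
for idx in range(L):
    window = diff[max(0, idx - J + 1): idx + 1]  # last <= J rows up to idx
    chi[idx, :, J - len(window):] = window.T     # left-pad with zeros
print(chi[:, 0, :])
# [[0. 0. 1.]
#  [0. 1. 2.]
#  [1. 2. 3.]
#  [2. 3. 4.]]
```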
@@ -198,11 +174,9 @@ def chi_matrix_numba(diff):

        # Optimized sliding window with manual loop unrolling
        for batch_idx in range(L):
-            # Efficient window extraction
            start_idx = max(0, batch_idx - J + 1)
            window_size = min(J, batch_idx + 1)

-            # Direct memory copy for efficiency
            for j in range(window_size):
                source_idx = start_idx + j
                target_idx = J - window_size + j
@@ -214,7 +188,6 @@ def chi_matrix_numba(diff):
    return chi_matrix_numba
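A hedged usage sketch for the Op (its constructor is not shown in this diff; `NumbaChiMatrixOp(J=3)` assumes it takes `J`): compiling a graph that contains the Op with PyTensor's NUMBA mode dispatches through the `numba_funcify` registration above.

```python
import numpy as np
import pytensor
import pytensor.tensor as pt

diff = pt.matrix("diff")                         # symbolic (L, N)
chi = NumbaChiMatrixOp(J=3)(diff)                # symbolic (L, N, 3)
fn = pytensor.function([diff], chi, mode="NUMBA")
print(fn(np.arange(10.0).reshape(5, 2)).shape)   # (5, 2, 3)
```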

-# Custom Op for Numba-compatible BFGS sampling
class NumbaBfgsSampleOp(Op):
    """Numba-optimized BFGS sampling with conditional logic.
@@ -262,7 +235,6 @@ def make_node(
        Apply
            Computation node with two outputs: phi and logdet
        """
-        # Convert all inputs to tensor variables (same as JAX version)
        inputs = [
            pt.as_tensor_variable(inp)
            for inp in [
@@ -278,10 +250,8 @@ def make_node(
            ]
        ]

-        # Output phi: shape (L, M, N) - same as u
        phi_out = pt.tensor(dtype=u.dtype, shape=(None, None, None))

-        # Output logdet: shape (L,) - same as first dimension of x
        logdet_out = pt.tensor(dtype=u.dtype, shape=(None,))

        return Apply(self, inputs, [phi_out, logdet_out])
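On the output types above: `pt.tensor(..., shape=(None, ...))` declares dimensions whose sizes are only known at runtime. A minimal standalone illustration of the same API:

```python
import pytensor.tensor as pt

phi_like = pt.tensor(dtype="float64", shape=(None, None, None))  # (L, M, N), all dynamic
logdet_like = pt.tensor(dtype="float64", shape=(None,))          # (L,), dynamic
print(phi_like.type.shape, logdet_like.type.shape)  # (None, None, None) (None,)
```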
@@ -299,20 +269,12 @@ def perform(self, node, inputs, outputs):

        x, g, alpha, beta, gamma, alpha_diag, inv_sqrt_alpha_diag, sqrt_alpha_diag, u = inputs

-        # Get shapes
        L, M, N = u.shape
        L, N, JJ = beta.shape

-        # Define the condition: use dense when JJ >= N, sparse otherwise
-        condition = JJ >= N
-
-        # Regularization term (from pathfinder.py REGULARISATION_TERM)
        REGULARISATION_TERM = 1e-8

-        if condition:
-            # Dense BFGS sampling branch
-
-            # Create identity matrix with regularization
+        if JJ >= N:
            IdN = np.eye(N)[None, ...]
            IdN = IdN + IdN * REGULARISATION_TERM

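For context on the simplified condition: `beta` carries `JJ = 2J` columns of update history (the `(N, 2J)` shapes are noted in `dense_bfgs_numba` below), so the dense (N, N) factorization only pays off once the history spans the whole space; otherwise the sparse path works in the cheaper JJ-dimensional subspace. A hypothetical shape check, just to make the branch choice concrete:

```python
import numpy as np

L, N, J = 4, 100, 5                     # illustrative sizes, J history pairs
beta = np.zeros((L, N, 2 * J))          # (L, N, 2J)
JJ = beta.shape[2]
print("dense" if JJ >= N else "sparse")  # sparse: JJ = 10 < N = 100
```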
@@ -325,68 +287,49 @@ def perform(self, node, inputs, outputs):
                @ inv_sqrt_alpha_diag
            )

-            # Full inverse Hessian
            H_inv = sqrt_alpha_diag @ (IdN + middle_term) @ sqrt_alpha_diag

-            # Cholesky decomposition (upper triangular)
            Lchol = np.array([cholesky(H_inv[i], lower=False) for i in range(L)])

-            # Compute log determinant from Cholesky diagonal
            logdet = 2.0 * np.sum(np.log(np.abs(np.diagonal(Lchol, axis1=-2, axis2=-1))), axis=-1)

-            # Compute mean: mu = x - H_inv @ g
            mu = x - np.sum(H_inv * g[..., None, :], axis=-1)

-            # Sample: phi = mu + Lchol @ u.T, then transpose back
            phi_transposed = mu[..., None] + Lchol @ np.transpose(u, axes=(0, 2, 1))
            phi = np.transpose(phi_transposed, axes=(0, 2, 1))

        else:
-            # Sparse BFGS sampling branch
-
-            # QR decomposition of qr_input = inv_sqrt_alpha_diag @ beta
+            # Sparse BFGS sampling
            qr_input = inv_sqrt_alpha_diag @ beta

-            # NumPy QR decomposition (applied along batch dimension)
-            Q = np.zeros((L, qr_input.shape[1], qr_input.shape[2]))  # (L, N, JJ)
-            R = np.zeros((L, qr_input.shape[2], qr_input.shape[2]))  # (L, JJ, JJ)
+            Q = np.zeros((L, qr_input.shape[1], qr_input.shape[2]))
+            R = np.zeros((L, qr_input.shape[2], qr_input.shape[2]))
            for i in range(L):
                Q[i], R[i] = qr(qr_input[i], mode="economic")

-            # Identity matrix with regularization
            IdJJ = np.eye(R.shape[1])[None, ...]
            IdJJ = IdJJ + IdJJ * REGULARISATION_TERM

-            # Cholesky input: IdJJ + R @ gamma @ R.T
            Lchol_input = IdJJ + R @ gamma @ np.transpose(R, axes=(0, 2, 1))

-            # Cholesky decomposition (upper triangular)
            Lchol = np.array([cholesky(Lchol_input[i], lower=False) for i in range(L)])

-            # Compute log determinant: includes both Cholesky and alpha terms
            logdet_chol = 2.0 * np.sum(
                np.log(np.abs(np.diagonal(Lchol, axis1=-2, axis2=-1))), axis=-1
            )
            logdet_alpha = np.sum(np.log(alpha), axis=-1)
            logdet = logdet_chol + logdet_alpha

-            # Compute inverse Hessian for sparse case: H_inv = alpha_diag + beta @ gamma @ beta.T
            H_inv = alpha_diag + (beta @ gamma @ np.transpose(beta, axes=(0, 2, 1)))

-            # Compute mean: mu = x - H_inv @ g
            mu = x - np.sum(H_inv * g[..., None, :], axis=-1)

-            # Complex sampling transformation for sparse case
-            # First part: Q @ (Lchol - IdJJ)
            Q_Lchol_diff = Q @ (Lchol - IdJJ)

-            # Second part: Q.T @ u.T
            Qt_u = np.transpose(Q, axes=(0, 2, 1)) @ np.transpose(u, axes=(0, 2, 1))

-            # Combine: (Q @ (Lchol - IdJJ)) @ (Q.T @ u.T) + u.T
            combined = Q_Lchol_diff @ Qt_u + np.transpose(u, axes=(0, 2, 1))

-            # Final transformation: mu + sqrt_alpha_diag @ combined
            phi_transposed = mu[..., None] + sqrt_alpha_diag @ combined
            phi = np.transpose(phi_transposed, axes=(0, 2, 1))

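The `logdet` lines in both branches read the log determinant off the Cholesky diagonal, using log det H = 2 Σᵢ log Uᵢᵢ for a symmetric positive definite H with triangular factor U. A quick NumPy check of that identity on an arbitrary SPD matrix:

```python
import numpy as np

rng = np.random.default_rng(0)
A = rng.normal(size=(5, 5))
H = A @ A.T + 5.0 * np.eye(5)                 # SPD by construction
U = np.linalg.cholesky(H).T                   # upper-triangular factor
lhs = 2.0 * np.sum(np.log(np.abs(np.diag(U))))
rhs = np.linalg.slogdet(H)[1]
print(np.allclose(lhs, rhs))                  # True
```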
@@ -424,10 +367,9 @@ def numba_funcify_BfgsSampleOp(op, node, **kwargs):
        Numba-compiled function that performs conditional BFGS sampling
    """

-    # Regularization term constant
    REGULARISATION_TERM = 1e-8

-    @numba_basic.numba_njit(fastmath=True, parallel=True)
+    @numba_basic.numba_njit(fastmath=True, cache=True)
    def dense_bfgs_numba(
        x, g, alpha, beta, gamma, alpha_diag, inv_sqrt_alpha_diag, sqrt_alpha_diag, u
    ):
@@ -464,47 +406,37 @@ def dense_bfgs_numba(
        """
        L, M, N = u.shape

-        # Create identity matrix with regularization
        IdN = np.eye(N) + np.eye(N) * REGULARISATION_TERM

-        # Compute inverse Hessian using batched operations
        phi = np.empty((L, M, N), dtype=u.dtype)
        logdet = np.empty(L, dtype=u.dtype)

-        for batch_idx in numba.prange(L):  # Parallel over batch dimension
-            # Middle term computation for batch element batch_idx
-            # middle_term = inv_sqrt_alpha_diag @ beta @ gamma @ beta.T @ inv_sqrt_alpha_diag
-            beta_l = beta[batch_idx]  # (N, 2J)
-            gamma_l = gamma[batch_idx]  # (2J, 2J)
-            inv_sqrt_alpha_diag_l = inv_sqrt_alpha_diag[batch_idx]  # (N, N)
-            sqrt_alpha_diag_l = sqrt_alpha_diag[batch_idx]  # (N, N)
-
-            # Compute middle term step by step for efficiency
-            temp1 = inv_sqrt_alpha_diag_l @ beta_l  # (N, 2J)
-            temp2 = temp1 @ gamma_l  # (N, 2J)
-            temp3 = temp2 @ beta_l.T  # (N, N)
-            middle_term = temp3 @ inv_sqrt_alpha_diag_l  # (N, N)
-
-            # Full inverse Hessian: H_inv = sqrt_alpha_diag @ (IdN + middle_term) @ sqrt_alpha_diag
+        for l in range(L):  # noqa: E741
+            beta_l = beta[l]
+            gamma_l = gamma[l]
+            inv_sqrt_alpha_diag_l = inv_sqrt_alpha_diag[l]
+            sqrt_alpha_diag_l = sqrt_alpha_diag[l]
+
+            temp1 = inv_sqrt_alpha_diag_l @ beta_l
+            temp2 = temp1 @ gamma_l
+            temp3 = temp2 @ beta_l.T
+            middle_term = temp3 @ inv_sqrt_alpha_diag_l
+
            temp_matrix = IdN + middle_term
            H_inv_l = sqrt_alpha_diag_l @ temp_matrix @ sqrt_alpha_diag_l

-            # Cholesky decomposition (upper triangular)
            Lchol_l = np.linalg.cholesky(H_inv_l).T

-            # Log determinant from Cholesky diagonal
-            logdet[batch_idx] = 2.0 * np.sum(np.log(np.abs(np.diag(Lchol_l))))
+            logdet[l] = 2.0 * np.sum(np.log(np.abs(np.diag(Lchol_l))))

-            # Mean computation: mu = x - H_inv @ g
-            mu_l = x[batch_idx] - H_inv_l @ g[batch_idx]
+            mu_l = x[l] - H_inv_l @ g[l]

-            # Sample generation: phi = mu + Lchol @ u.T
            for m in range(M):
-                phi[batch_idx, m] = mu_l + Lchol_l @ u[batch_idx, m]
+                phi[l, m] = mu_l + Lchol_l @ u[l, m]

        return phi, logdet

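A note on the decorator change from `parallel=True` to `cache=True`: with the loops now using plain `range` instead of `numba.prange` (which also lets the top-level `import numba` go away), nothing requires the threading layer, and `cache=True` persists compiled machine code to disk so fresh processes skip recompilation. A minimal sketch with stock Numba, using a hypothetical stand-in kernel:

```python
import numba
import numpy as np

# cache=True writes the compiled function to disk; later interpreter
# sessions reload it instead of recompiling.
@numba.njit(fastmath=True, cache=True)
def row_norms(x):
    out = np.empty(x.shape[0])
    for i in range(x.shape[0]):
        out[i] = np.sqrt(np.sum(x[i] ** 2))
    return out

row_norms(np.ones((4, 3)))  # compiled once, cached thereafter
```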
-    @numba_basic.numba_njit(fastmath=True, parallel=True)
+    @numba_basic.numba_njit(fastmath=True, cache=True)
    def sparse_bfgs_numba(
        x, g, alpha, beta, gamma, alpha_diag, inv_sqrt_alpha_diag, sqrt_alpha_diag, u
    ):
@@ -545,38 +477,30 @@ def sparse_bfgs_numba(
        phi = np.empty((L, M, N), dtype=u.dtype)
        logdet = np.empty(L, dtype=u.dtype)

-        for batch_idx in numba.prange(L):  # Parallel over batch dimension
-            # QR decomposition of qr_input = inv_sqrt_alpha_diag @ beta
-            qr_input_l = inv_sqrt_alpha_diag[batch_idx] @ beta[batch_idx]
+        for l in range(L):  # noqa: E741
+            qr_input_l = inv_sqrt_alpha_diag[l] @ beta[l]
            Q_l, R_l = np.linalg.qr(qr_input_l)

-            # Identity matrix with regularization
            IdJJ = np.eye(JJ) + np.eye(JJ) * REGULARISATION_TERM

-            # Cholesky input: IdJJ + R @ gamma @ R.T
-            Lchol_input_l = IdJJ + R_l @ gamma[batch_idx] @ R_l.T
+            Lchol_input_l = IdJJ + R_l @ gamma[l] @ R_l.T

-            # Cholesky decomposition (upper triangular)
            Lchol_l = np.linalg.cholesky(Lchol_input_l).T

-            # Compute log determinant
            logdet_chol = 2.0 * np.sum(np.log(np.abs(np.diag(Lchol_l))))
-            logdet_alpha = np.sum(np.log(alpha[batch_idx]))
-            logdet[batch_idx] = logdet_chol + logdet_alpha
+            logdet_alpha = np.sum(np.log(alpha[l]))
+            logdet[l] = logdet_chol + logdet_alpha

-            # Inverse Hessian for sparse case
-            H_inv_l = alpha_diag[batch_idx] + beta[batch_idx] @ gamma[batch_idx] @ beta[batch_idx].T
+            H_inv_l = alpha_diag[l] + beta[l] @ gamma[l] @ beta[l].T

-            # Mean computation
-            mu_l = x[batch_idx] - H_inv_l @ g[batch_idx]
+            mu_l = x[l] - H_inv_l @ g[l]

-            # Complex sampling transformation for sparse case
            Q_Lchol_diff = Q_l @ (Lchol_l - IdJJ)

            for m in range(M):
-                Qt_u_lm = Q_l.T @ u[batch_idx, m]
-                combined = Q_Lchol_diff @ Qt_u_lm + u[batch_idx, m]
-                phi[batch_idx, m] = mu_l + sqrt_alpha_diag[batch_idx] @ combined
+                Qt_u_lm = Q_l.T @ u[l, m]
+                combined = Q_Lchol_diff @ Qt_u_lm + u[l, m]
+                phi[l, m] = mu_l + sqrt_alpha_diag[l] @ combined

        return phi, logdet

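The sparse branch's two-term `logdet` is an instance of Sylvester's determinant identity: with `D = diag(alpha)`, `Q, R = qr(D**-0.5 @ beta)` and `Q.T @ Q = I`, one has `det(D + beta @ gamma @ beta.T) = det(D) * det(I + R @ gamma @ R.T)`. A numerical spot-check with illustrative sizes:

```python
import numpy as np

rng = np.random.default_rng(0)
N, JJ = 6, 2
alpha = rng.uniform(0.5, 2.0, N)
beta = rng.normal(size=(N, JJ))
gamma = 0.3 * np.eye(JJ)                      # any symmetric (JJ, JJ) works here
Q, R = np.linalg.qr(np.diag(alpha**-0.5) @ beta, mode="reduced")
lhs = np.linalg.slogdet(np.diag(alpha) + beta @ gamma @ beta.T)[1]
rhs = np.sum(np.log(alpha)) + np.linalg.slogdet(np.eye(JJ) + R @ gamma @ R.T)[1]
print(np.allclose(lhs, rhs))                  # True
```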
@@ -604,7 +528,6 @@ def bfgs_sample_numba(
        L, M, N = u.shape
        JJ = beta.shape[2]

-        # Numba-optimized conditional compilation
        if JJ >= N:
            return dense_bfgs_numba(
                x, g, alpha, beta, gamma, alpha_diag, inv_sqrt_alpha_diag, sqrt_alpha_diag, u