feat: Metal_Dense uses Accelerate dense LU baseline (sgetrf/sgetrs)

robtaylor · robtaylor · commit 25d4ffa56e73 · 2026-03-16T15:03:51.000Z
Replace BaSpaCho supernodal Metal_Dense (impractical with 25K per-lump
dispatches) with standalone Accelerate BLAS dense LU. Uses sgetrf for
factorization and sgetrs for solve, with mixed-precision iterative
refinement (float factor, double SpMV residual).

C6288 (n=25380): factor ~10.3s, solve ~0.31s, residual ~1e-11.
Demonstrates 880x benefit of sparse exploitation (Metal_Sparse: ~12ms).
Size guard skips matrices with n > 50000 (a dense n×n float matrix would need more than ~10 GB).

Also adds sgetrs/dgetrs declarations + LAPACKE wrappers to BlasDefs.h.

Co-developed-by: Claude Code v2.1.58 (claude-opus-4-6)
diff --git a/baspacho/baspacho/BlasDefs.h b/baspacho/baspacho/BlasDefs.h
@@ -59,6 +59,14 @@ void dgetrf_(const BLAS_INT* m, const BLAS_INT* n, double* A, const BLAS_INT* ld
              BLAS_INT* info);
 void sgetrf_(const BLAS_INT* m, const BLAS_INT* n, float* A, const BLAS_INT* lda, BLAS_INT* ipiv,
              BLAS_INT* info);
+
+// LU solve (getrs) — solve A*X = B given LU factorization from getrf; B is overwritten with X
+void dgetrs_(const char* trans, const BLAS_INT* n, const BLAS_INT* nrhs, const double* A,
+             const BLAS_INT* lda, const BLAS_INT* ipiv, double* B, const BLAS_INT* ldb,
+             BLAS_INT* info);  // double precision; info is the status out-parameter
+void sgetrs_(const char* trans, const BLAS_INT* n, const BLAS_INT* nrhs, const float* A,
+             const BLAS_INT* lda, const BLAS_INT* ipiv, float* B, const BLAS_INT* ldb,
+             BLAS_INT* info);  // single precision; info is the status out-parameter
 }
 
 #define CBLAS_LAYOUT int
@@ -169,4 +177,19 @@ inline BLAS_INT LAPACKE_sgetrf(int /* matrix_layout */, BLAS_INT m, BLAS_INT n,
   return info;
 }
 
+// LU solve wrapper (double): calls dgetrs_ to solve A*X = B from getrf factors; returns its info.
+inline BLAS_INT LAPACKE_dgetrs(char trans, BLAS_INT n, BLAS_INT nrhs, const double* a,
+                               BLAS_INT lda, const BLAS_INT* ipiv, double* b, BLAS_INT ldb) {
+  BLAS_INT info;  // status out-parameter of dgetrs_, handed back to the caller
+  dgetrs_(&trans, &n, &nrhs, a, &lda, ipiv, b, &ldb, &info);  // Fortran ABI: scalars by address
+  return info;
+}
+
+// LU solve wrapper (float): calls sgetrs_ to solve A*X = B from getrf factors; returns its info.
+inline BLAS_INT LAPACKE_sgetrs(char trans, BLAS_INT n, BLAS_INT nrhs, const float* a,
+                               BLAS_INT lda, const BLAS_INT* ipiv, float* b, BLAS_INT ldb) {
+  BLAS_INT info;  // status out-parameter of sgetrs_, handed back to the caller
+  sgetrs_(&trans, &n, &nrhs, a, &lda, ipiv, b, &ldb, &info);  // Fortran ABI: scalars by address
+  return info;
+}
 }  // end namespace BaSpaCho
diff --git a/baspacho/benchmarking/LUBench.cpp b/baspacho/benchmarking/LUBench.cpp
@@ -40,6 +40,10 @@
 #include "baspacho/baspacho/MetalDefs.h"
 #endif
 
+#ifdef BASPACHO_USE_BLAS
+#include "baspacho/baspacho/BlasDefs.h"
+#endif
+
 using namespace BaSpaCho;
 using namespace BaSpaCho::testing_utils;
 using namespace std;
@@ -248,15 +252,11 @@ static vector<LUTimingResult> benchmarkLUCpu(
 // ============================================================================
 
 #ifdef BASPACHO_USE_METAL
-// Metal LU benchmark: GPU sparse elimination + CPU BLAS dense + CPU SpMV refinement.
+// Metal LU benchmark (Metal_Sparse): GPU sparse elimination + CPU BLAS dense + CPU SpMV refinement.
 // Mirrors the spineax BaspachoGpuInstantiate/Execute FFI code path.
 // Uses persistent contexts, device-resident pivots, recording pass, and external encoder.
-//
-// useSparseElim=true (Metal_Sparse): GPU sparse elim for scalar lumps, CPU dense for rest.
-// useSparseElim=false (Metal_Dense): all-dense LU on GPU (no sparse elimination).
 static vector<LUTimingResult> benchmarkLUMetalFFI(
-    const vector<pair<CsrMatrix, Eigen::VectorXd>>& matrices, int maxRefine,
-    bool verbose, bool useSparseElim = true) {
+    const vector<pair<CsrMatrix, Eigen::VectorXd>>& matrices, int maxRefine, bool verbose) {
   if (matrices.empty()) return {};
 
   bool capturing = MetalContext::instance().beginCaptureIfRequested("/tmp/baspacho_ffi.gputrace");
@@ -313,7 +313,7 @@ static vector<LUTimingResult> benchmarkLUMetalFFI(
   settings.matrixType = MTYPE_GENERAL;
   settings.numThreads = 1;
   settings.staticPivotThreshold = pivotThreshold;
-  settings.findSparseEliminationRanges = useSparseElim;
+  settings.findSparseEliminationRanges = true;
 
   vector<int64_t> paramSizes(n, 1);
   vector<int64_t> blockSizes(n, 1);
@@ -534,6 +534,132 @@ static vector<LUTimingResult> benchmarkLUMetalFFI(
 }
 #endif  // BASPACHO_USE_METAL
 
+// ============================================================================
+// Dense BLAS LU baseline (Accelerate sgetrf/sgetrs, no sparsity exploitation)
+// ============================================================================
+
+#ifdef BASPACHO_USE_BLAS
+// Dense LU baseline: scatters each system into an n×n column-major float matrix, factors it with
+// sgetrf, solves with sgetrs, then refines in mixed precision (float solves, double residuals).
+//   matrices:  (A, b) systems; n and the row permutation come from matrices[0] and are reused,
+//              so all entries are assumed to share that size/pattern (NOTE(review): unchecked).
+//   maxRefine: cap on refinement steps; refinement also stops once the residual is <= 1e-10.
+//   verbose:   print the size-guard notice and per-matrix timings to stdout.
+// Returns one LUTimingResult per matrix ({} for empty input or n > 50000). A nonzero LAPACK info
+// is reported on stderr and ends that matrix's solve, keeping the last valid x (0 if none).
+static vector<LUTimingResult> benchmarkLUDenseBLAS(
+    const vector<pair<CsrMatrix, Eigen::VectorXd>>& matrices, int maxRefine, bool verbose) {
+  if (matrices.empty()) return {};
+
+  const CsrMatrix& A0 = matrices[0].first;
+  const int64_t n = A0.nRows;
+
+  // Size guard: dense n×n float = n² × 4 bytes. n=50000 → ~10GB.
+  if (n > 50000) {
+    if (verbose)
+      cout << "  [DenseBLAS] Skipping: n=" << n << " too large for dense (" << (n * n * 4.0 / 1e9)
+           << " GB)" << endl;
+    return {};
+  }
+
+  // Preprocessing: BTF max transversal (once per pattern)
+  auto preproc = computeMaxTransversal(n, A0.rowPtr.data(), A0.colInd.data());
+
+  vector<LUTimingResult> results;
+  vector<int64_t> pRowPtr, pColInd;
+  vector<double> pValues;
+
+  // Dense matrix (column-major for LAPACK), pivots and float RHS: allocated once, reused.
+  vector<float> dense(n * n, 0.0f);
+  vector<BLAS_INT> ipiv(n);
+  vector<float> rhsF(n);
+  const BLAS_INT N = static_cast<BLAS_INT>(n);  // n <= 50000 after the size guard
+  const BLAS_INT nrhs = 1;
+  const char trans = 'N';
+
+  for (size_t mi = 0; mi < matrices.size(); mi++) {
+    const CsrMatrix& A = matrices[mi].first;
+    const Eigen::VectorXd& b = matrices[mi].second;
+    LUTimingResult res;
+
+    // Equilibration (same as Metal_Sparse)
+    applyRowPermToCsr<double>(n, A.rowPtr.data(), A.colInd.data(), A.values.data(),
+                              preproc.rowPerm.data(), pRowPtr, pColInd, pValues);
+    vector<double> rowScale, colScale;
+    computeEquilibration(n, pRowPtr.data(), pColInd.data(), pValues.data(), rowScale, colScale);
+    applyRowPermAndScaleToCsr<double>(n, A.rowPtr.data(), A.colInd.data(), A.values.data(),
+                                     preproc.rowPerm.data(), rowScale.data(), colScale.data(),
+                                     pRowPtr, pColInd, pValues);
+
+    // Scatter equilibrated sparse CSR → dense column-major: dense[col * n + row] = value
+    fill(dense.begin(), dense.end(), 0.0f);
+    for (int64_t i = 0; i < n; i++) {
+      for (int64_t k = pRowPtr[i]; k < pRowPtr[i + 1]; k++) {
+        dense[pColInd[k] * n + i] = float(pValues[k]);
+      }
+    }
+
+    // Factor: sgetrf (dense is reused below as the factor input to sgetrs)
+    auto tFactor = Clock::now();
+    BLAS_INT info = LAPACKE_sgetrf(LAPACK_COL_MAJOR, N, N, dense.data(), N, ipiv.data());
+    res.factorTime = tdelta(Clock::now() - tFactor).count();
+    const char* failed = (info != 0) ? "sgetrf" : nullptr;  // routine that reported info != 0
+
+    // Solve + iterative refinement. x starts at 0 and only successful solves update it; sgetrs is
+    // skipped after a failed sgetrf (LAPACK reference: info > 0 means an exactly zero pivot in U).
+    auto tSolve = Clock::now();
+    Eigen::VectorXd x = Eigen::VectorXd::Zero(n);
+    if (!failed) {
+      // Initial solve: permute + row-scale RHS, sgetrs, then column-unscale into double
+      for (int64_t j = 0; j < n; j++) rhsF[j] = float(rowScale[j] * b(preproc.rowPerm[j]));
+      info = LAPACKE_sgetrs(trans, N, nrhs, dense.data(), N, ipiv.data(), rhsF.data(), N);
+      if (info != 0) failed = "sgetrs";
+      for (int64_t j = 0; !failed && j < n; j++) x(j) = colScale[j] * double(rhsF[j]);
+    }
+
+    double residual = computeResidualDouble(A, x, b);
+    res.refineSteps = 0;
+    // Iterative refinement: CPU SpMV (double) → sgetrs (float) → accumulate (double)
+    for (int iter = 0; !failed && iter < maxRefine && residual > 1e-10; iter++) {
+      // r = b - A*x in double precision, using the original (unpermuted, unscaled) A
+      Eigen::VectorXd r = Eigen::VectorXd::Zero(n);
+      for (int64_t i = 0; i < n; i++) {
+        for (int64_t k = A.rowPtr[i]; k < A.rowPtr[i + 1]; k++)
+          r(i) += A.values[k] * x(A.colInd[k]);
+      }
+      r = b - r;
+
+      // Permute + scale residual → float RHS
+      for (int64_t j = 0; j < n; j++) rhsF[j] = float(rowScale[j] * r(preproc.rowPerm[j]));
+      // Solve correction; on failure stop refining and keep the current x
+      info = LAPACKE_sgetrs(trans, N, nrhs, dense.data(), N, ipiv.data(), rhsF.data(), N);
+      if (info != 0) failed = "sgetrs";
+      if (failed) break;
+      // Accumulate correction in double
+      for (int64_t j = 0; j < n; j++) x(j) += colScale[j] * double(rhsF[j]);
+      residual = computeResidualDouble(A, x, b);
+      res.refineSteps++;
+    }
+
+    res.solveTime = tdelta(Clock::now() - tSolve).count();
+    res.residual = residual;
+    res.perturbCount = 0;
+    // Failures go to stderr regardless of `verbose`, leaving anything written to stdout intact.
+    if (failed)
+      cerr << "  [DenseBLAS] Matrix #" << mi << ": " << failed << " info=" << info << endl;
+    if (verbose) {
+      cout << "  [DenseBLAS] Matrix #" << mi << ": factor=" << fixed << setprecision(4)
+           << res.factorTime << "s, solve=" << res.solveTime << "s, residual=" << scientific
+           << setprecision(2) << res.residual << ", refine=" << res.refineSteps << endl;
+    }
+
+    results.push_back(res);
+  }
+
+  return results;
+}
+#endif  // BASPACHO_USE_BLAS
+
 // ============================================================================
 // CUDA (double) benchmark
 // ============================================================================
@@ -936,7 +1062,7 @@ void help() {
        << "  BaSpaCho_LU_CPU\n"
 #ifdef BASPACHO_USE_METAL
        << "  Metal_Sparse     (GPU sparse elim + CPU BLAS dense + CPU SpMV refinement)\n"
-       << "  Metal_Dense      (all-dense GPU LU, no sparse elimination)\n"
+       << "  Metal_Dense      (Accelerate dense LU baseline, no sparsity exploitation)\n"
 #endif
 #ifdef BASPACHO_USE_CUBLAS
        << "  BaSpaCho_LU_CUDA\n"
@@ -1144,15 +1270,20 @@ int main(int argc, char* argv[]) {
 #ifdef BASPACHO_USE_METAL
   if (regex_search(string("Metal_Sparse"), selectSolvers)) {
     if (!jsonOutput) cout << "\nRunning Metal_Sparse..." << endl;
-    auto timings = benchmarkLUMetalFFI(matrices, maxRefineIters, verbose, true);
+    auto timings = benchmarkLUMetalFFI(matrices, maxRefineIters, verbose);
     if (isWarmup && timings.size() > 1) timings.erase(timings.begin());
     resultToRecords(problemName, "Metal_Sparse", timings, allRecords);
     if (!jsonOutput) printResults("Metal_Sparse", timings);
   }
 
   if (regex_search(string("Metal_Dense"), selectSolvers)) {
     if (!jsonOutput) cout << "\nRunning Metal_Dense..." << endl;
-    auto timings = benchmarkLUMetalFFI(matrices, maxRefineIters, verbose, false);
+#ifdef BASPACHO_USE_BLAS
+    auto timings = benchmarkLUDenseBLAS(matrices, maxRefineIters, verbose);
+#else
+    vector<LUTimingResult> timings;
+    if (!jsonOutput) cout << "  [Metal_Dense] Skipping: BLAS not available" << endl;
+#endif
     if (isWarmup && timings.size() > 1) timings.erase(timings.begin());
     resultToRecords(problemName, "Metal_Dense", timings, allRecords);
     if (!jsonOutput) printResults("Metal_Dense", timings);