Further speedup to CSR back-substitution using dof_simplenum.

yuvaltassa · copybara-github · commit 4510c6d29031 · 2025-01-02T09:04:51.000-08:00
PiperOrigin-RevId: 711440268
Change-Id: I81cd9a6a8b8ec78d08cfbc34f60a9216ea753833
diff --git a/src/engine/engine_core_smooth.c b/src/engine/engine_core_smooth.c
@@ -1575,15 +1575,19 @@ void mj_solveLD(const mjModel* m, mjtNum* restrict x, int n,
 // in-place sparse backsubstitution:  x = inv(L'*D*L)*x
 //  like mj_solveLD, but using the CSR representation of L
 void mj_solveLDs(mjtNum* restrict x, const mjtNum* qLDs, const mjtNum* qLDiagInv, int nv,
-                 const int* rownnz, const int* rowadr, const int* diag, const int* colind) {
+                 const int* rownnz, const int* rowadr, const int* diagind, const int* diagnum,
+                 const int* colind) {
   // x <- L^-T x
   for (int i=nv-2; i >= 0; i--) {
-    int d1 = diag[i] + 1;
-    int nnz = rownnz[i] - d1;
-    if (nnz > 0) {
-      int adr = rowadr[i] + d1;
-      x[i] -= mju_dotSparse(qLDs+adr, x, nnz, colind+adr, /*flg_unc1=*/0);
+    // skip diagonal (simple) rows
+    if (diagnum[i]) {
+      continue;
     }
+
+    int d1 = diagind[i] + 1;
+    int nnz = rownnz[i] - d1;
+    int adr = rowadr[i] + d1;
+    x[i] -= mju_dotSparse(qLDs+adr, x, nnz, colind+adr, /*flg_unc1=*/0);
   }
 
   // x(i) /= D(i,i)
@@ -1593,11 +1597,14 @@ void mj_solveLDs(mjtNum* restrict x, const mjtNum* qLDs, const mjtNum* qLDiagInv
 
   // x <- L^-1 x
   for (int i=1; i < nv; i++) {
-    int d = diag[i];
-    if (d > 0) {
-      int adr = rowadr[i];
-      x[i] -= mju_dotSparse(qLDs+adr, x, d, colind+adr, /*flg_unc1=*/0);
+    // skip diagonal (simple) rows
+    if (diagnum[i]) {
+      i += diagnum[i] - 1;  // when iterating forward we can skip ahead
+      continue;
     }
+
+    int adr = rowadr[i];
+    x[i] -= mju_dotSparse(qLDs+adr, x, diagind[i], colind+adr, /*flg_unc1=*/0);
   }
 }
 
diff --git a/src/engine/engine_core_smooth.h b/src/engine/engine_core_smooth.h
@@ -61,7 +61,8 @@ MJAPI void mj_solveLD(const mjModel* m, mjtNum* x, int n,
 // in-place sparse backsubstitution:  x = inv(L'*D*L)*x
 //  like mj_solveLD, but using the CSR representation of L
 MJAPI void mj_solveLDs(mjtNum* x, const mjtNum* qLDs, const mjtNum* qLDiagInv, int nv,
-                       const int* rownnz, const int* rowadr, const int* diag, const int* colind);
+                       const int* rownnz, const int* rowadr, const int* diagind, const int* diagnum,
+                       const int* colind);
 
 // sparse backsubstitution:  x = inv(L'*D*L)*y, use factorization in d
 MJAPI void mj_solveM(const mjModel* m, mjData* d, mjtNum* x, const mjtNum* y, int n);
diff --git a/test/benchmark/CMakeLists.txt b/test/benchmark/CMakeLists.txt
@@ -44,7 +44,7 @@ mujoco_test(
 )
 
 mujoco_test(
-  engine_core_smooth_benchmark_test
+  solveLD_benchmark_test
   MAIN_TARGET benchmark::benchmark_main
   ADDITIONAL_LINK_LIBRARIES benchmark::benchmark absl::core_headers
 )
diff --git a/test/benchmark/solveLD_benchmark_test.cc b/test/benchmark/solveLD_benchmark_test.cc
@@ -64,7 +64,8 @@ static void BM_solveLD(benchmark::State& state, bool featherstone, bool coil) {
       } else {
         mju_copy(res, vec, m->nv);
         mj_solveLDs(res, LDs, d->qLDiagInv, m->nv,
-                    d->C_rownnz, d->C_rowadr, d->C_diag, d->C_colind);
+                    d->C_rownnz, d->C_rowadr, d->C_diag, m->dof_simplenum,
+                    d->C_colind);
       }
     }
   }
diff --git a/test/engine/engine_core_smooth_test.cc b/test/engine/engine_core_smooth_test.cc
@@ -456,7 +456,54 @@ TEST_F(CoreSmoothTest, FactorI) {
   mj_deleteModel(model);
 }
 
-TEST_F(CoreSmoothTest, SolveLD2) {
+// in-place sparse backsubstitution:  x = inv(L'*D*L)*x
+//  like mj_solveLD, but using the CSR representation of L
+//  variant that only uses the lower triangle of qLDs
+static void mj_solveLDsLower(mjtNum* x, const mjtNum* qLDs,
+                             const mjtNum* qLDiagInv, int nv, const int* rownnz,
+                             const int* rowadr, const int* diagind,
+                             const int* diagnum, const int* colind,
+                             int* scratch) {
+  int* marker = scratch;
+  for (int i=1; i < nv; i++) {
+    marker[i] = rowadr[i] + diagind[i] - 1;
+  }
+
+  // x <- L^-T x
+  for (int i=nv-2; i >= 0; i--) {
+    // skip diagonal (simple) rows
+    if (diagnum[i]) {
+      continue;
+    }
+
+    for (int j=i+1; j < nv; j++) {
+      if (colind[marker[j]] == i) {
+        x[i] -= qLDs[marker[j]--] * x[j];
+      }
+    }
+  }
+
+  // x(i) /= D(i,i)
+  for (int i=0; i < nv; i++) {
+    x[i] *= qLDiagInv[i];
+  }
+
+  // x <- L^-1 x
+  for (int i=1; i < nv; i++) {
+    // skip diagonal (simple) rows
+    if (diagnum[i]) {
+      i += diagnum[i] - 1;  // when iterating forward we can skip ahead
+      continue;
+    }
+
+    int d = diagind[i];
+    int adr = rowadr[i];
+    x[i] -= mju_dotSparse(qLDs+adr, x, d, colind+adr, /*flg_unc1=*/0);
+  }
+}
+
+
+TEST_F(CoreSmoothTest, SolveLDs) {
   const std::string xml_path = GetTestDataFilePath(kInertiaPath);
   char error[1024];
   mjModel* m = mj_loadXML(xml_path.c_str(), nullptr, error, sizeof(error));
@@ -490,9 +537,23 @@ TEST_F(CoreSmoothTest, SolveLD2) {
   for (int i=0; i < nv; i++) vec[i] = vec2[i] = 20 + 30*i;
   for (int i=0; i < nv; i+=2) vec[i] = vec2[i] = 0;
 
+  // use upper triangle
   mj_solveLD(m, vec.data(), 1, d->qLD, d->qLDiagInv);
   mj_solveLDs(vec2.data(), LDs.data(), d->qLDiagInv, nv,
-              d->C_rownnz, d->C_rowadr, d->C_diag, d->C_colind);
+              d->C_rownnz, d->C_rowadr, d->C_diag, m->dof_simplenum,
+              d->C_colind);
+
+  // expect vectors to match up to floating point precision
+  for (int i=0; i < nv; i++) {
+    EXPECT_FLOAT_EQ(vec[i], vec2[i]);
+  }
+
+  // don't use the upper triangle
+  mj_solveLD(m, vec.data(), 1, d->qLD, d->qLDiagInv);
+  vector<int> scratch(nv);
+  mj_solveLDsLower(vec2.data(), LDs.data(), d->qLDiagInv, nv, d->C_rownnz,
+                   d->C_rowadr, d->C_diag, m->dof_simplenum, d->C_colind,
+                   scratch.data());
 
   // expect vectors to match up to floating point precision
   for (int i=0; i < nv; i++) {

Original file line number	Diff line number	Diff line change
`@@ -44,7 +44,7 @@ mujoco_test(`
`44`	`44`	`)`
`45`	`45`
`46`	`46`	`mujoco_test(`
`47`		`- engine_core_smooth_benchmark_test`
	`47`	`+ solveLD_benchmark_test`
`48`	`48`	`MAIN_TARGET benchmark::benchmark_main`
`49`	`49`	`ADDITIONAL_LINK_LIBRARIES benchmark::benchmark absl::core_headers`
`50`	`50`	`)`
Original file line number	Diff line number	Diff line change
`@@ -64,7 +64,8 @@ static void BM_solveLD(benchmark::State& state, bool featherstone, bool coil) {`
`64`	`64`	`} else {`
`65`	`65`	`mju_copy(res, vec, m->nv);`
`66`	`66`	`mj_solveLDs(res, LDs, d->qLDiagInv, m->nv,`
`67`		`- d->C_rownnz, d->C_rowadr, d->C_diag, d->C_colind);`
	`67`	`+ d->C_rownnz, d->C_rowadr, d->C_diag, m->dof_simplenum,`
	`68`	`+ d->C_colind);`
`68`	`69`	`}`
`69`	`70`	`}`
`70`	`71`	`}`