Skip to content

Commit 45b0153

Browse files
yuvaltassacopybara-github
authored andcommitted
Refactor sparse Cholesky factorization into symbolic and numeric phases.
The new symbolic function is a generalization of the function it replaces. In this CL it takes two unused temp arrays. The actual change in behavior happens in the followup. New benchmark test output below ("L" is 2 humanoids and 100 free objects, "XL" is 100 humanoids). Note that `symbolic` is only ever called once per Newton iteration, while `numeric` is sometimes called multiple times (when the rank-1 update fails), hence timing them separately is valuable. ``` Benchmark Time(ns) CPU(ns) Iterations -------------------------------------------------------------- BM_old_L_mean 84382 84703 19547 11.807k items/s BM_symbolic_L_mean 16345 16381 88414 61.055k items/s BM_numeric_L_mean 10986 10994 120000 90.999k items/s BM_old_XL_mean 1241208 1244212 1200 803.924 items/s BM_symbolic_XL_mean 130917 131042 12720 7.631k items/s BM_numeric_XL_mean 77004 76767 21116 13.029k items/s ``` PiperOrigin-RevId: 846704054 Change-Id: Ib0c365724d63bf2b81606ca5353756a6496c3a26
1 parent d1fd11b commit 45b0153

File tree

7 files changed

+12312
-99
lines changed

7 files changed

+12312
-99
lines changed

src/engine/engine_solver.c

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1558,16 +1558,14 @@ static void MakeHessian(mjData* d, mjCGContext* ctx) {
15581558
HT_rownnz, HT_rowadr, HT_colind, NULL,
15591559
ctx->H_rownnz, ctx->H_rowadr, ctx->H_colind);
15601560

1561-
// count total and row non-zeros of reverse-Cholesky factor L
1562-
ctx->nL = mju_cholFactorCount(ctx->L_rownnz, HT_rownnz, HT_rowadr, HT_colind, nv, d);
1561+
// count total and row non-zeros of reverse-Cholesky factors L and LT
1562+
int* LT_rownnz_temp = mjSTACKALLOC(d, nv, int);
1563+
int* LT_rowadr_temp = mjSTACKALLOC(d, nv, int);
1564+
ctx->nL = mju_cholFactorSymbolic(NULL, ctx->L_rownnz, ctx->L_rowadr, NULL,
1565+
LT_rownnz_temp, LT_rowadr_temp, NULL,
1566+
HT_rownnz, HT_rowadr, HT_colind, nv, d);
15631567
mj_freeStack(d);
15641568

1565-
// compute L row addresses: rowadr = cumsum(rownnz)
1566-
ctx->L_rowadr[0] = 0;
1567-
for (int r=1; r < nv; r++) {
1568-
ctx->L_rowadr[r] = ctx->L_rowadr[r-1] + ctx->L_rownnz[r-1];
1569-
}
1570-
15711569
// allocate L_colind, L, Lcone
15721570
ctx->L_colind = mjSTACKALLOC(d, ctx->nL, int);
15731571
ctx->L = mjSTACKALLOC(d, ctx->nL, mjtNum);

src/engine/engine_util_solve.c

Lines changed: 152 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -187,21 +187,52 @@ int mju_cholFactorSparse(mjtNum* mat, int n, mjtNum mindiag,
187187
return rank;
188188
}
189189

190-
191-
// precount row non-zeros of reverse-Cholesky factor L, return total non-zeros
192-
// based on ldl_symbolic from 'Algorithm 8xx: a concise sparse Cholesky factorization package'
193-
// reads pattern from upper triangle
194-
int mju_cholFactorCount(int* L_rownnz, const int* rownnz, const int* rowadr, const int* colind,
195-
int n, mjData* d) {
190+
// symbolic reverse-Cholesky: compute both L (CSR) and LT (CSC) structures
191+
// if L_colind is NULL, perform counting logic (fill rownnz/rowadr arrays and return total nnz)
192+
// if L_colind is not NULL, assume rownnz/rowadr are precomputed and fill colind/map arrays
193+
// reads pattern from upper triangle
194+
// based on ldl_symbolic from 'Algorithm 8xx: a concise sparse Cholesky factorization package'
195+
int mju_cholFactorSymbolic(int* restrict L_colind, int* restrict L_rownnz, int* restrict L_rowadr,
196+
int* restrict LT_colind, int* restrict LT_rownnz,
197+
int* restrict LT_rowadr, int* restrict LT_map,
198+
const int* rownnz, const int* rowadr, const int* colind, int n,
199+
mjData* d) {
196200
mj_markStack(d);
197-
int* parent = mjSTACKALLOC(d, n, int);
198-
int* flag = mjSTACKALLOC(d, n, int);
201+
int* restrict parent = mjSTACKALLOC(d, n, int);
202+
int* restrict flag = mjSTACKALLOC(d, n, int);
203+
int* restrict cursor = NULL;
204+
int* LT_write = NULL;
205+
206+
// filling phase: initialize write positions
207+
if (L_colind) {
208+
cursor = mjSTACKALLOC(d, n, int);
209+
LT_write = mjSTACKALLOC(d, n, int);
210+
for (int r = 0; r < n; r++) {
211+
cursor[r] = L_rowadr[r] + L_rownnz[r] - 2; // end of row r (before diagonal)
212+
LT_write[r] = LT_rowadr[r]; // start of LT row r
213+
}
214+
}
199215

200216
// loop over rows in reverse order
201217
for (int r = n - 1; r >= 0; r--) {
202218
parent[r] = -1;
203219
flag[r] = r;
204-
L_rownnz[r] = 1; // start with 1 for diagonal
220+
221+
// counting phase: start with 1 for diagonal
222+
if (!L_colind) {
223+
L_rownnz[r] = 1;
224+
LT_rownnz[r] = 1;
225+
}
226+
227+
// filling phase: write diagonals
228+
else {
229+
int diag_idx = L_rowadr[r] + L_rownnz[r] - 1;
230+
L_colind[diag_idx] = r;
231+
int write_idx = LT_write[r];
232+
LT_colind[write_idx] = r;
233+
LT_map[write_idx] = diag_idx;
234+
LT_write[r]++;
235+
}
205236

206237
// loop over non-zero columns of upper triangle
207238
int start = rowadr[r];
@@ -221,8 +252,23 @@ int mju_cholFactorCount(int* L_rownnz, const int* rownnz, const int* rowadr, con
221252
parent[i] = r;
222253
}
223254

224-
// increment non-zeros, flag row i, advance to parent
225-
L_rownnz[i]++;
255+
// counting phase: increment non-zeros
256+
if (!L_colind) {
257+
L_rownnz[i]++;
258+
LT_rownnz[r]++;
259+
}
260+
261+
// filling phase: write L[i, r] and LT[r, i]
262+
else {
263+
int L_idx = cursor[i];
264+
cursor[i]--;
265+
L_colind[L_idx] = r;
266+
LT_colind[LT_write[r]] = i;
267+
LT_map[LT_write[r]] = L_idx;
268+
LT_write[r]++;
269+
}
270+
271+
// flag row i, advance to parent
226272
flag[i] = r;
227273
i = parent[i];
228274
}
@@ -231,15 +277,98 @@ int mju_cholFactorCount(int* L_rownnz, const int* rownnz, const int* rowadr, con
231277

232278
mj_freeStack(d);
233279

234-
// sum up all row non-zeros
280+
// counting phase: compute row addresses, add up total non-zeros
235281
int nnz = 0;
236-
for (int r = 0; r < n; r++) {
237-
nnz += L_rownnz[r];
282+
if (!L_colind) {
283+
nnz = L_rownnz[0];
284+
L_rowadr[0] = 0;
285+
LT_rowadr[0] = 0;
286+
for (int r = 1; r < n; r++) {
287+
L_rowadr[r] = L_rowadr[r - 1] + L_rownnz[r - 1];
288+
LT_rowadr[r] = LT_rowadr[r - 1] + LT_rownnz[r - 1];
289+
nnz += L_rownnz[r];
290+
}
238291
}
239292

240293
return nnz;
241294
}
242295

296+
// numeric reverse-Cholesky: compute L values given fixed sparsity pattern, returns rank
297+
// L_colind must already contain the correct sparsity pattern (from mju_cholFactorSymbolic)
298+
// LT_map[k] gives index in L for LT_colind[k]
299+
int mju_cholFactorNumeric(mjtNum* restrict L, int n, mjtNum mindiag,
300+
const int* L_rownnz, const int* L_rowadr, const int* L_colind,
301+
const int* LT_rownnz, const int* LT_rowadr, const int* LT_colind,
302+
const int* LT_map, const mjtNum* H,
303+
const int* H_rownnz, const int* H_rowadr, const int* H_colind,
304+
mjData* d) {
305+
int rank = n;
306+
307+
// single-row dense accumulator
308+
mj_markStack(d);
309+
mjtNum* restrict dense = mjSTACKALLOC(d, n, mjtNum);
310+
mju_zero(dense, n);
311+
312+
// backpass over rows
313+
for (int r = n - 1; r >= 0; r--) {
314+
// scatter H[r, 0:r] into dense
315+
mju_scatter(dense, H + H_rowadr[r], H_colind + H_rowadr[r], H_rownnz[r]);
316+
317+
// accumulate updates from rows c > r where L[c,r] != 0
318+
// use CSC transpose: LT column r contains rows that have column r
319+
// start from k=1 to skip the diagonal entry (LT_colind[LT_adr] = r)
320+
int LT_adr = LT_rowadr[r];
321+
int LT_nnz = LT_rownnz[r];
322+
for (int k = 1; k < LT_nnz; k++) {
323+
int c = LT_colind[LT_adr + k]; // row c has L[c,r] != 0, c > r guaranteed
324+
325+
// get L[c,r] index directly from LT_map
326+
int L_cr_idx = LT_map[LT_adr + k];
327+
mjtNum L_cr = L[L_cr_idx];
328+
329+
// get row c info
330+
int c_adr = L_rowadr[c];
331+
332+
// dense[j] -= L[c,r] * L[c,j] for all j <= r in L[c]
333+
// L_cr_idx - c_adr gives the position of r in row c
334+
int num_cols = L_cr_idx - c_adr + 1;
335+
const int* colptr = L_colind + c_adr;
336+
const mjtNum* Lptr = L + c_adr;
337+
for (int i = 0; i < num_cols; i++) {
338+
dense[colptr[i]] -= L_cr * Lptr[i];
339+
}
340+
}
341+
342+
// factor row r diagonal, handle rank-deficient case
343+
mjtNum diag = dense[r];
344+
if (diag < mindiag) {
345+
diag = mindiag;
346+
rank--;
347+
}
348+
349+
// scale off-diagonals
350+
mjtNum L_rr = mju_sqrt(diag);
351+
mjtNum L_rr_inv = 1.0 / L_rr;
352+
int L_adr = L_rowadr[r];
353+
int L_nnz = L_rownnz[r];
354+
const int* colptr = L_colind + L_adr;
355+
mjtNum* Lptr = L + L_adr;
356+
for (int i = 0; i < L_nnz - 1; i++) {
357+
Lptr[i] = dense[colptr[i]] * L_rr_inv;
358+
}
359+
360+
// store diagonal
361+
L[L_adr + L_nnz - 1] = L_rr;
362+
363+
// clear dense workspace
364+
for (int i = 0; i < L_nnz; i++) {
365+
dense[colptr[i]] = 0;
366+
}
367+
}
368+
369+
mj_freeStack(d);
370+
return rank;
371+
}
243372

244373
// sparse reverse-order Cholesky solve
245374
void mju_cholSolveSparse(mjtNum* res, const mjtNum* mat, const mjtNum* vec, int n,
@@ -528,7 +657,7 @@ void mju_band2Dense(mjtNum* res, const mjtNum* mat, int ntotal, int nband, int n
528657
mju_zero(res, ntotal*ntotal);
529658

530659
// sparse part
531-
for(int i=0; i < nsparse; i++) {
660+
for (int i=0; i < nsparse; i++) {
532661
// number of non-zeros left of (i,i)
533662
int width = mjMIN(i, nband-1);
534663

@@ -537,13 +666,13 @@ void mju_band2Dense(mjtNum* res, const mjtNum* mat, int ntotal, int nband, int n
537666
}
538667

539668
// dense part
540-
for(int i=nsparse; i < ntotal; i++) {
669+
for (int i=nsparse; i < ntotal; i++) {
541670
mju_copy(res + i*ntotal, mat + nsparse*nband + (i-nsparse)*ntotal, i+1);
542671
}
543672

544673
// make symmetric
545674
if (flg_sym) {
546-
for(int i=0; i < ntotal; i++) {
675+
for (int i=0; i < ntotal; i++) {
547676
for (int j=i+1; j < ntotal; j++) {
548677
res[i*ntotal + j] = res[j*ntotal + i];
549678
}
@@ -557,7 +686,7 @@ void mju_dense2Band(mjtNum* res, const mjtNum* mat, int ntotal, int nband, int n
557686
int nsparse = ntotal-ndense;
558687

559688
// sparse part
560-
for(int i=0; i < nsparse; i++) {
689+
for (int i=0; i < nsparse; i++) {
561690
// number of non-zeros left of (i,i)
562691
int width = mjMIN(i, nband-1);
563692

@@ -566,7 +695,7 @@ void mju_dense2Band(mjtNum* res, const mjtNum* mat, int ntotal, int nband, int n
566695
}
567696

568697
// dense part
569-
for(int i=nsparse; i < ntotal; i++) {
698+
for (int i=nsparse; i < ntotal; i++) {
570699
mju_copy(res + nsparse*nband + (i-nsparse)*ntotal, mat + i*ntotal, i+1);
571700
}
572701
}
@@ -578,13 +707,13 @@ void mju_bandMulMatVec(mjtNum* res, const mjtNum* mat, const mjtNum* vec,
578707
int nsparse = ntotal-ndense;
579708

580709
// handle multiple vectors
581-
for(int j=0; j < nvec; j++ ) {
710+
for (int j=0; j < nvec; j++) {
582711
// precompute pointer to corresponding vector in vec and res
583712
const mjtNum* vec_j = vec + ntotal*j;
584713
mjtNum* res_j = res + ntotal*j;
585714

586715
// sparse part
587-
for(int i=0; i < nsparse; i++) {
716+
for (int i=0; i < nsparse; i++) {
588717
int width = mjMIN(i+1, nband);
589718
int adr = i*nband + nband - width;
590719
int offset = mjMAX(0, i-nband+1);
@@ -596,7 +725,7 @@ void mju_bandMulMatVec(mjtNum* res, const mjtNum* mat, const mjtNum* vec,
596725
}
597726

598727
// dense part
599-
for(int i=nsparse; i < ntotal; i++) {
728+
for (int i=nsparse; i < ntotal; i++) {
600729
int adr = nsparse*nband + (i-nsparse)*ntotal;
601730
res_j[i] = mju_dot(mat+adr, vec_j, i+1);
602731
if (flg_sym) {
@@ -808,7 +937,7 @@ int mju_eig3(mjtNum eigval[3], mjtNum eigvec[9], mjtNum quat[4], const mjtNum ma
808937
mju_normalize4(quat);
809938
}
810939

811-
// sort eigenvalues in decreasing order (bubblesort: 0, 1, 0)
940+
// sort eigenvalues in decreasing order (bubble sort: 0, 1, 0)
812941
for (int j=0; j < 3; j++) {
813942
int j1 = j%2; // lead index
814943

src/engine/engine_util_solve.h

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,25 @@ MJAPI int mju_cholUpdate(mjtNum* mat, mjtNum* x, int n, int flg_plus);
3636
MJAPI int mju_cholFactorSparse(mjtNum* mat, int n, mjtNum mindiag,
3737
int* rownnz, const int* rowadr, int* colind, mjData* d);
3838

39-
// precount row non-zeros of reverse-Cholesky factor L, return total
40-
MJAPI int mju_cholFactorCount(int* L_rownnz, const int* rownnz, const int* rowadr,
41-
const int* colind, int n, mjData* d);
39+
// symbolic reverse-Cholesky: compute both L (CSR) and LT (CSC) structures
40+
// if L_colind is NULL, perform counting logic (fill rownnz/rowadr arrays and return total nnz)
41+
// if L_colind is not NULL, assume rownnz/rowadr are precomputed and fill colind/map arrays
42+
// reads pattern from upper triangle
43+
// based on ldl_symbolic from 'Algorithm 8xx: a concise sparse Cholesky factorization package'
44+
MJAPI int mju_cholFactorSymbolic(int* L_colind, int* L_rownnz, int* L_rowadr,
45+
int* LT_colind, int* LT_rownnz, int* LT_rowadr, int* LT_map,
46+
const int* rownnz, const int* rowadr, const int* colind,
47+
int n, mjData* d);
48+
49+
// numeric reverse-Cholesky: compute L values given fixed sparsity pattern, returns rank
50+
// L_colind must already contain the correct sparsity pattern (from mju_cholFactorSymbolic)
51+
// LT_map[k] gives index in L for LT_colind[k]
52+
MJAPI int mju_cholFactorNumeric(mjtNum* L, int n, mjtNum mindiag,
53+
const int* L_rownnz, const int* L_rowadr, const int* L_colind,
54+
const int* LT_rownnz, const int* LT_rowadr, const int* LT_colind,
55+
const int* LT_map, const mjtNum* H,
56+
const int* H_rownnz, const int* H_rowadr, const int* H_colind,
57+
mjData* d);
4258

4359
// sparse reverse-order Cholesky solve
4460
void mju_cholSolveSparse(mjtNum* res, const mjtNum* mat, const mjtNum* vec, int n,

0 commit comments

Comments
 (0)