Skip to content

Commit c940502

Browse files
committed
[SYSTEMDS-3773] Fix two matmult kernels w/ parallelization over rhs
The matmult kernel library parallelizes by default over rows in the left-hand-side matrix, but for specific size regimes, switches to a parallelization over the rows or columns of the right-hand-side. The recently added full-coverage tests found two bugs, which this patch fixes: a) dense-dense matrix-vector multiplication w/ large vectors --> extended the implementation to support the other parallelization; b) sparse-dense vector-vector dot product --> disabled parallelization for this specific case to use the existing kernel without binary searches.
1 parent 8af6bda commit c940502

File tree

2 files changed

+26
-16
lines changed

2 files changed

+26
-16
lines changed

src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -279,7 +279,7 @@ private static void parallelMatrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlo
279279
boolean pm2r = !ultraSparse && !sparse && checkParMatrixMultRightInputRows(m1, m2, k);
280280
boolean pm2c = !ultraSparse && checkParMatrixMultRightInputCols(m1, m2, k, pm2r);
281281
int num = pm2r ? m2.rlen : pm2c ? m2.clen : m1.rlen;
282-
282+
283283
// core multi-threaded matrix mult computation
284284
// (currently: always parallelization over number of rows)
285285
final ExecutorService pool = CommonThreadPool.get(k);
@@ -1052,7 +1052,7 @@ else if( n==1 && cd<=2*1024 ) { //MATRIX-VECTOR (short rhs)
10521052
matrixMultDenseDenseMVShortRHS(a, b, c, cd, rl, ru);
10531053
}
10541054
else if( n==1 ) { //MATRIX-VECTOR (tall rhs)
1055-
matrixMultDenseDenseMVTallRHS(a, b, c, cd, rl, ru);
1055+
matrixMultDenseDenseMVTallRHS(a, b, c, pm2, cd, rl, ru);
10561056
}
10571057
else if( pm2 && m==1 ) { //VECTOR-MATRIX
10581058
matrixMultDenseDenseVM(a, b, c, n, cd, rl, ru);
@@ -1075,15 +1075,20 @@ private static void matrixMultDenseDenseMVShortRHS(DenseBlock a, DenseBlock b, D
10751075
cvals[i] = dotProduct(a.values(i), bvals, a.pos(i), 0, cd);
10761076
}
10771077

1078-
private static void matrixMultDenseDenseMVTallRHS(DenseBlock a, DenseBlock b, DenseBlock c, int cd, int rl, int ru) {
1078+
private static void matrixMultDenseDenseMVTallRHS(DenseBlock a, DenseBlock b, DenseBlock c, boolean pm2, int cd, int rl, int ru) {
10791079
final int blocksizeI = 32;
10801080
final int blocksizeK = 2*1024; //16KB vector blocks (L1)
10811081
double[] bvals = b.valuesAt(0);
10821082
double[] cvals = c.valuesAt(0);
1083-
for( int bi=rl; bi<ru; bi+=blocksizeI ) {
1084-
int bimin = Math.min(bi+blocksizeI, ru);
1085-
for( int bk=0; bk<cd; bk+=blocksizeK ) {
1086-
int bkmin = Math.min(bk+blocksizeK, cd);
1083+
// setup bounds according to parallelization strategy
1084+
// (default: rows in lhs, pm2: rows in rhs)
1085+
int cl = pm2 ? rl : 0, cu = pm2 ? ru : cd;
1086+
int rl2 = pm2 ? 0 : rl, ru2 = pm2 ? a.numRows() : ru;
1087+
// matrix-vector multiplication with cache blocking of vector
1088+
for( int bi=rl2; bi<ru2; bi+=blocksizeI ) {
1089+
int bimin = Math.min(bi+blocksizeI, ru2);
1090+
for( int bk=cl; bk<cu; bk+=blocksizeK ) {
1091+
int bkmin = Math.min(bk+blocksizeK, cu);
10871092
for( int i=bi; i<bimin; i++)
10881093
cvals[i] += dotProduct(a.values(i), bvals, a.pos(i,bk), bk, bkmin-bk);
10891094
}
@@ -4349,7 +4354,7 @@ public static boolean isSkinnyRightHandSide(long m1rlen, long m1clen, long m2rle
43494354
private static boolean checkParMatrixMultRightInputRows( MatrixBlock m1, MatrixBlock m2, int k ) {
43504355
//parallelize over rows in rhs matrix if number of rows in lhs/output is very small
43514356
double jvmMem = InfrastructureAnalyzer.getLocalMaxMemory();
4352-
return (m1.rlen==1 && !(m1.isUltraSparse()||m2.isUltraSparse()))
4357+
return (m1.rlen==1 && !(m1.sparse && m2.clen==1) && !(m1.isUltraSparse()||m2.isUltraSparse()))
43534358
|| (m1.rlen<=16 && m2.rlen > m1.rlen && (!m1.sparse | m2.clen > 1)
43544359
&& ( !m1.isUltraSparse() && !(m1.sparse & m2.sparse) ) //dense-dense / sparse-dense / dense-sparse
43554360
&& (long)k * 8 * m1.rlen * m2.clen < Math.max(MEM_OVERHEAD_THRESHOLD,0.01*jvmMem) );

src/test/java/org/apache/sysds/test/component/matrix/MatrixMultiplyKernelTest.java

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,15 @@ public void testDenseDenseMatrixSmallVector() {
5151
testMatrixMultiply(MIN_PAR, 16, 1, 1, 1);
5252
}
5353

54-
// @Test //FIXME
55-
// public void testDenseDenseMatrixLargeVector() {
56-
// testMatrixMultiply(16, MIN_PAR, 1, 1, 1);
57-
// }
54+
@Test //parallelization over rows in lhs
55+
public void testDenseDenseMatrixLargeVector() {
56+
testMatrixMultiply(4000, 3000, 1, 1, 1);
57+
}
58+
59+
@Test //parallelization over rows in rhs
60+
public void testDenseDenseMatrixLargeVectorPm2() {
61+
testMatrixMultiply(16, MIN_PAR, 1, 1, 1);
62+
}
5863

5964
@Test
6065
public void testDenseDenseVectorMatrix() {
@@ -90,10 +95,10 @@ public void testDenseSparseMatrixMatrix() {
9095

9196
// sparse-dense kernels
9297

93-
// @Test FIXME
94-
// public void testSparseDenseDotProduct() {
95-
// testMatrixMultiply(1, MIN_PAR, 1, 0.1, 1);
96-
// }
98+
@Test
99+
public void testSparseDenseDotProduct() {
100+
testMatrixMultiply(1, MIN_PAR, 1, 0.1, 1);
101+
}
97102

98103
@Test
99104
public void testSparseDenseMatrixSmallVector() {

0 commit comments

Comments
 (0)