Skip to content

Commit 0b5fae9

Browse files
jessicapriebemboehm7
authored andcommitted
[SYSTEMDS-3547] Additional transpose in-place kernel (Brenner)
DIA WiSe 24/25 project Closes #2199.
1 parent c66b39f commit 0b5fae9

File tree

2 files changed

+360
-13
lines changed

2 files changed

+360
-13
lines changed

src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java

Lines changed: 250 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,9 @@ public class LibMatrixReorg {
9292

9393
//use csr instead of mcsr sparse block for rexpand columns / diag v2m
9494
public static final boolean SPARSE_OUTPUTS_IN_CSR = true;
95+
96+
//use legacy in-place transpose dense instead of brenner in-place transpose dense
97+
public static final boolean TRANSPOSE_IN_PLACE_DENSE_LEGACY = true;
9598

9699
private enum ReorgType {
97100
TRANSPOSE,
@@ -1789,19 +1792,23 @@ else if(cols == rows){
17891792
transposeInPlaceTrivial(in.getDenseBlockValues(), cols, k);
17901793
}
17911794
else {
1792-
if(cols<rows){
1793-
// important to set dims after
1794-
c2r(in, k);
1795-
values.setDims(new int[]{rows,cols});
1796-
in.setNumColumns(cols);
1797-
in.setNumRows(rows);
1798-
}
1799-
else{
1800-
// important to set dims before
1801-
values.setDims(new int[]{rows,cols});
1802-
in.setNumColumns(cols);
1803-
in.setNumRows(rows);
1804-
r2c(in, k);
1795+
if(TRANSPOSE_IN_PLACE_DENSE_LEGACY) {
1796+
if(cols < rows) {
1797+
// important to set dims after
1798+
c2r(in, k);
1799+
values.setDims(new int[] {rows, cols});
1800+
in.setNumColumns(cols);
1801+
in.setNumRows(rows);
1802+
}
1803+
else {
1804+
// important to set dims before
1805+
values.setDims(new int[] {rows, cols});
1806+
in.setNumColumns(cols);
1807+
in.setNumRows(rows);
1808+
r2c(in, k);
1809+
}
1810+
} else {
1811+
transposeInPlaceDenseBrenner(in, k);
18051812
}
18061813
}
18071814

@@ -4227,4 +4234,234 @@ public Object call() {
42274234
return null;
42284235
}
42294236
}
4237+
4238+
/**
4239+
* Transposes a dense matrix in-place using following cycles based on Brenner's method. This
4240+
* method shifts cycles with a focus on less storage by using cycle leaders based on prime factorization. The used
4241+
* storage is in O(n+m). Quadratic matrices should be handled outside this method (using the trivial method) for a
4242+
* speedup. This method is based on: Algorithm 467, Brenner, https://dl.acm.org/doi/pdf/10.1145/355611.362542.
4243+
*
4244+
* @param in The input matrix to be transposed.
4245+
* @param k The number of threads.
4246+
*/
4247+
public static void transposeInPlaceDenseBrenner(MatrixBlock in, int k) {
4248+
4249+
DenseBlock denseBlock = in.getDenseBlock();
4250+
double[] matrix = in.getDenseBlockValues();
4251+
4252+
final int rows = in.getNumRows();
4253+
final int cols = in.getNumColumns();
4254+
4255+
// Brenner: rows + cols / 2 is sufficient for most cases.
4256+
int workSize = rows + cols;
4257+
int maxIndex = rows * cols - 1;
4258+
4259+
// prime factorization of the maximum index to identify cycle structures
4260+
// Brenner: length 8 is sufficient up to maxIndex 2*3*5*...*19 = 9,767,520.
4261+
int[] primes = new int[8];
4262+
int[] exponents = new int[8];
4263+
int[] powers = new int[8];
4264+
int numPrimes = primeFactorization(maxIndex, primes, exponents, powers);
4265+
4266+
int[] iExponents = new int[numPrimes];
4267+
int div = 1;
4268+
4269+
div:
4270+
while(div < maxIndex / 2) {
4271+
4272+
// number of indices divisible by div and no other divisor of maxIndex
4273+
int count = eulerTotient(primes, exponents, iExponents, numPrimes, maxIndex / div);
4274+
// all false
4275+
boolean[] moved = new boolean[workSize];
4276+
// starting point cycle
4277+
int start = div;
4278+
4279+
count:
4280+
do {
4281+
// companion of start
4282+
int comp = maxIndex - start;
4283+
4284+
if(start == div) {
4285+
// shift cycles
4286+
count = simultaneousCycleShift(matrix, moved, rows, maxIndex, count, workSize, start, comp);
4287+
start += div;
4288+
}
4289+
else if(start < workSize && moved[start]) {
4290+
// already moved
4291+
start += div;
4292+
}
4293+
else {
4294+
// handle other cycle starts
4295+
int cycleLeader = start / div;
4296+
for(int ip = 0; ip < numPrimes; ip++) {
4297+
if(iExponents[ip] != exponents[ip] && cycleLeader % primes[ip] == 0) {
4298+
start += div;
4299+
continue count;
4300+
}
4301+
}
4302+
4303+
if(start < workSize) {
4304+
count = simultaneousCycleShift(matrix, moved, rows, maxIndex, count, workSize, start, comp);
4305+
start += div;
4306+
continue;
4307+
}
4308+
4309+
int test = start;
4310+
do {
4311+
test = prevIndexCycle(test, rows, cols);
4312+
if(test < start || test > comp) {
4313+
start += div;
4314+
continue count;
4315+
}
4316+
}
4317+
while(test > start && test < comp);
4318+
4319+
count = simultaneousCycleShift(matrix, moved, rows, maxIndex, count, workSize, start, comp);
4320+
start += div;
4321+
}
4322+
}
4323+
while(count > 0);
4324+
4325+
// update cycle divisor for the next set of cycles based on prime factors
4326+
for(int ip = 0; ip < numPrimes; ip++) {
4327+
if(iExponents[ip] != exponents[ip]) {
4328+
iExponents[ip]++;
4329+
div *= primes[ip];
4330+
continue div;
4331+
}
4332+
iExponents[ip] = 0;
4333+
div /= powers[ip];
4334+
}
4335+
}
4336+
4337+
denseBlock.setDims(new int[] {cols, rows});
4338+
in.setNumColumns(rows);
4339+
in.setNumRows(cols);
4340+
}
4341+
4342+
/**
4343+
* Performs a simultaneous cycle shift for a cycle and its companion cycle. This method ensures that distinct cycles
4344+
* or self-dual cycles are handled correctly. This method is based on: Algorithm 2, Karlsson,
4345+
* https://webapps.cs.umu.se/uminf/reports/2009/011/part1.pdf and Algorithm 467, Brenner,
4346+
* https://dl.acm.org/doi/pdf/10.1145/355611.362542.
4347+
*
4348+
* @param matrix The matrix whose elements are being shifted.
4349+
* @param moved Boolean array tracking whether an element has already been moved.
4350+
* @param rows The number of rows in the matrix.
4351+
* @param maxIndex The maximum valid index in the matrix.
4352+
* @param count The number of elements left to process.
4353+
* @param workSize The length of moved.
4354+
* @param start The starting index for the cycle shift.
4355+
* @param comp The corresponding companion index.
4356+
* @return The updated count of elements remaining to shift.
4357+
*/
4358+
private static int simultaneousCycleShift(double[] matrix, boolean[] moved, int rows, int maxIndex, int count,
4359+
int workSize, int start, int comp) {
4360+
4361+
int orig = start;
4362+
double val = matrix[orig];
4363+
double cval = matrix[comp];
4364+
4365+
while(orig >= 0) {
4366+
// decrease the remaining shift count by orig and comp
4367+
count -= 2;
4368+
orig = simultaneousCycleShiftStep(matrix, moved, rows, maxIndex, workSize, start, orig, val, cval);
4369+
}
4370+
return count;
4371+
}
4372+
4373+
private static int simultaneousCycleShiftStep(double[] matrix, boolean[] moved, int rows, int maxIndex,
4374+
int workSize, int start, int orig, double val, double cval) {
4375+
4376+
int comp = maxIndex - orig;
4377+
int prevOrig = prevIndexCycle(orig, rows, (maxIndex + 1) / rows);
4378+
int prevComp = maxIndex - prevOrig;
4379+
4380+
if(orig < workSize)
4381+
moved[orig] = true;
4382+
if(comp < workSize)
4383+
moved[comp] = true;
4384+
4385+
if(prevOrig == start) {
4386+
// cycle and comp are distinct
4387+
matrix[orig] = val;
4388+
matrix[comp] = cval;
4389+
return -1;
4390+
}
4391+
else if(prevComp == start) {
4392+
// cycle is self dual
4393+
matrix[orig] = cval;
4394+
matrix[comp] = val;
4395+
return -1;
4396+
}
4397+
4398+
// shift the values to their next positions
4399+
matrix[orig] = matrix[prevOrig];
4400+
matrix[comp] = matrix[prevComp];
4401+
// update
4402+
return prevOrig;
4403+
}
4404+
4405+
private static int prevIndexCycle(int index, int rows, int cols) {
4406+
int lastIndex = rows * cols - 1;
4407+
if(index == lastIndex)
4408+
return lastIndex;
4409+
long temp = (long) index * cols;
4410+
return (int) (temp % lastIndex);
4411+
}
4412+
4413+
/**
4414+
* Performs prime factorization of a given number n. The method calculates the prime factors of n, their exponents,
4415+
* powers and stores the results in the provided arrays.
4416+
*
4417+
* @param n The number to be factorized.
4418+
* @param primes Array to store the unique prime factors of n.
4419+
* @param exponents Array to store the exponents of the respective prime factors.
4420+
* @param powers Array to store the powers of the respective prime factors.
4421+
* @return The number of unique prime factors.
4422+
*/
4423+
private static int primeFactorization(int n, int[] primes, int[] exponents, int[] powers) {
4424+
int pIdx = -1;
4425+
int currDiv = 0;
4426+
int rest = n;
4427+
int div = 2;
4428+
4429+
while(rest > 1) {
4430+
int quotient = rest / div;
4431+
if(rest - div * quotient == 0) {
4432+
if(div == currDiv) {
4433+
// current divisor is the same as the last one
4434+
powers[pIdx] *= div;
4435+
exponents[pIdx]++;
4436+
}
4437+
else {
4438+
// new prime factor found
4439+
pIdx++;
4440+
if(pIdx >= primes.length)
4441+
throw new RuntimeException("Not enough space, need to expand input arrays.");
4442+
4443+
primes[pIdx] = div;
4444+
powers[pIdx] = div;
4445+
currDiv = div;
4446+
exponents[pIdx] = 1;
4447+
}
4448+
rest = quotient;
4449+
}
4450+
else {
4451+
// only odd divs
4452+
div = (div == 2) ? 3 : div + 2;
4453+
}
4454+
}
4455+
return pIdx + 1;
4456+
}
4457+
4458+
private static int eulerTotient(int[] primes, int[] exponents, int[] iExponents, int numPrimes, int count) {
4459+
for(int ip = 0; ip < numPrimes; ip++) {
4460+
if(iExponents[ip] == exponents[ip]) {
4461+
continue;
4462+
}
4463+
count = (count / primes[ip]) * (primes[ip] - 1);
4464+
}
4465+
return count;
4466+
}
42304467
}
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.sysds.test.component.matrix.libMatrixReorg;
21+
22+
import org.apache.sysds.runtime.matrix.data.LibMatrixReorg;
23+
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
24+
import org.apache.sysds.test.TestUtils;
25+
import org.junit.Test;
26+
27+
import static org.junit.Assert.assertThrows;
28+
import static org.junit.Assert.assertTrue;
29+
30+
public class TransposeInPlaceBrennerTest {
31+
32+
@Test
33+
public void transposeInPlaceDenseBrennerOnePrime() {
34+
// 3*4-1 = 11
35+
testTransposeInPlaceDense(3, 4, 1);
36+
}
37+
38+
@Test
39+
public void transposeInPlaceDenseBrennerTwoPrimes() {
40+
// 4*9-1 = 5*7
41+
testTransposeInPlaceDense(4, 9, 0.96);
42+
}
43+
44+
@Test
45+
public void transposeInPlaceDenseBrennerThreePrimes() {
46+
// 2*53-1 = 3*5*7
47+
testTransposeInPlaceDense(2, 53, 0.52);
48+
}
49+
50+
@Test
51+
public void transposeInPlaceDenseBrennerThreePrimesOneExpo() {
52+
// 1151*2999-1 = (2**3)*3*143827
53+
testTransposeInPlaceDense(1151, 2999, 0.82);
54+
}
55+
56+
@Test
57+
public void transposeInPlaceDenseBrennerThreePrimesAllExpos() {
58+
// 9*10889-1 = (2**4)*(5**3)*(7**2)
59+
testTransposeInPlaceDense(9, 10889, 0.74);
60+
}
61+
62+
@Test
63+
public void transposeInPlaceDenseBrennerFourPrimesOneExpo() {
64+
// 53*4421-1 = (2**3)*3*13*751
65+
testTransposeInPlaceDense(53, 4421, 0.75);
66+
}
67+
68+
@Test
69+
public void transposeInPlaceDenseBrennerFivePrimes() {
70+
// 3*3337-1 = 2*5*7*11*13
71+
testTransposeInPlaceDense(3, 3337, 0.68);
72+
}
73+
74+
@Test
75+
public void transposeInPlaceDenseBrennerSixPrimesOneExpo() {
76+
// 53*7177-1 = (2**2)*5*7*11*13*19
77+
testTransposeInPlaceDense(53, 7177, 0.78);
78+
}
79+
80+
@Test
81+
public void transposeInPlaceDenseBrennerSevenPrimesThreeExpos() {
82+
// 2087*17123-1 = (2**2)*3*(5**2)*(7**2)*11*13*17
83+
testTransposeInPlaceDense(2087, 17123, 0.79);
84+
}
85+
86+
@Test
87+
public void transposeInPlaceDenseBrennerEightPrimes() {
88+
// 347*27953-1 = 2*3*5*7*11*13*17*19
89+
testTransposeInPlaceDense(347, 27953, 0.86);
90+
}
91+
92+
@Test
93+
public void transposeInPlaceDenseBrennerNinePrimes() {
94+
// 317*703763-1 = 2*3*5*7*11*13*17*19*23
95+
MatrixBlock X = MatrixBlock.randOperations(317, 703763, 0.52);
96+
97+
Exception exception = assertThrows(RuntimeException.class,
98+
() -> LibMatrixReorg.transposeInPlaceDenseBrenner(X, 1));
99+
assertTrue(exception.getMessage().contains("Not enough space, need to expand input arrays."));
100+
}
101+
102+
private void testTransposeInPlaceDense(int rows, int cols, double sparsity) {
103+
MatrixBlock X = MatrixBlock.randOperations(rows, cols, sparsity);
104+
MatrixBlock tX = LibMatrixReorg.transpose(X);
105+
106+
LibMatrixReorg.transposeInPlaceDenseBrenner(X, 1);
107+
108+
TestUtils.compareMatrices(X, tX, 0);
109+
}
110+
}

0 commit comments

Comments
 (0)