@@ -92,6 +92,9 @@ public class LibMatrixReorg {
9292
9393 //use csr instead of mcsr sparse block for rexpand columns / diag v2m
9494 public static final boolean SPARSE_OUTPUTS_IN_CSR = true ;
95+
96+ //use legacy in-place transpose dense instead of brenner in-place transpose dense
97+ public static final boolean TRANSPOSE_IN_PLACE_DENSE_LEGACY = true ;
9598
9699 private enum ReorgType {
97100 TRANSPOSE ,
@@ -1789,19 +1792,23 @@ else if(cols == rows){
17891792 transposeInPlaceTrivial (in .getDenseBlockValues (), cols , k );
17901793 }
17911794 else {
1792- if (cols <rows ){
1793- // important to set dims after
1794- c2r (in , k );
1795- values .setDims (new int []{rows ,cols });
1796- in .setNumColumns (cols );
1797- in .setNumRows (rows );
1798- }
1799- else {
1800- // important to set dims before
1801- values .setDims (new int []{rows ,cols });
1802- in .setNumColumns (cols );
1803- in .setNumRows (rows );
1804- r2c (in , k );
1795+ if (TRANSPOSE_IN_PLACE_DENSE_LEGACY ) {
1796+ if (cols < rows ) {
1797+ // important to set dims after
1798+ c2r (in , k );
1799+ values .setDims (new int [] {rows , cols });
1800+ in .setNumColumns (cols );
1801+ in .setNumRows (rows );
1802+ }
1803+ else {
1804+ // important to set dims before
1805+ values .setDims (new int [] {rows , cols });
1806+ in .setNumColumns (cols );
1807+ in .setNumRows (rows );
1808+ r2c (in , k );
1809+ }
1810+ } else {
1811+ transposeInPlaceDenseBrenner (in , k );
18051812 }
18061813 }
18071814
@@ -4227,4 +4234,234 @@ public Object call() {
42274234 return null ;
42284235 }
42294236 }
4237+
4238+ /**
4239+ * Transposes a dense matrix in-place using following cycles based on Brenner's method. This
4240+ * method shifts cycles with a focus on less storage by using cycle leaders based on prime factorization. The used
4241+ * storage is in O(n+m). Quadratic matrices should be handled outside this method (using the trivial method) for a
4242+ * speedup. This method is based on: Algorithm 467, Brenner, https://dl.acm.org/doi/pdf/10.1145/355611.362542.
4243+ *
4244+ * @param in The input matrix to be transposed.
4245+ * @param k The number of threads.
4246+ */
4247+ public static void transposeInPlaceDenseBrenner (MatrixBlock in , int k ) {
4248+
4249+ DenseBlock denseBlock = in .getDenseBlock ();
4250+ double [] matrix = in .getDenseBlockValues ();
4251+
4252+ final int rows = in .getNumRows ();
4253+ final int cols = in .getNumColumns ();
4254+
4255+ // Brenner: rows + cols / 2 is sufficient for most cases.
4256+ int workSize = rows + cols ;
4257+ int maxIndex = rows * cols - 1 ;
4258+
4259+ // prime factorization of the maximum index to identify cycle structures
4260+ // Brenner: length 8 is sufficient up to maxIndex 2*3*5*...*19 = 9,767,520.
4261+ int [] primes = new int [8 ];
4262+ int [] exponents = new int [8 ];
4263+ int [] powers = new int [8 ];
4264+ int numPrimes = primeFactorization (maxIndex , primes , exponents , powers );
4265+
4266+ int [] iExponents = new int [numPrimes ];
4267+ int div = 1 ;
4268+
4269+ div :
4270+ while (div < maxIndex / 2 ) {
4271+
4272+ // number of indices divisible by div and no other divisor of maxIndex
4273+ int count = eulerTotient (primes , exponents , iExponents , numPrimes , maxIndex / div );
4274+ // all false
4275+ boolean [] moved = new boolean [workSize ];
4276+ // starting point cycle
4277+ int start = div ;
4278+
4279+ count :
4280+ do {
4281+ // companion of start
4282+ int comp = maxIndex - start ;
4283+
4284+ if (start == div ) {
4285+ // shift cycles
4286+ count = simultaneousCycleShift (matrix , moved , rows , maxIndex , count , workSize , start , comp );
4287+ start += div ;
4288+ }
4289+ else if (start < workSize && moved [start ]) {
4290+ // already moved
4291+ start += div ;
4292+ }
4293+ else {
4294+ // handle other cycle starts
4295+ int cycleLeader = start / div ;
4296+ for (int ip = 0 ; ip < numPrimes ; ip ++) {
4297+ if (iExponents [ip ] != exponents [ip ] && cycleLeader % primes [ip ] == 0 ) {
4298+ start += div ;
4299+ continue count ;
4300+ }
4301+ }
4302+
4303+ if (start < workSize ) {
4304+ count = simultaneousCycleShift (matrix , moved , rows , maxIndex , count , workSize , start , comp );
4305+ start += div ;
4306+ continue ;
4307+ }
4308+
4309+ int test = start ;
4310+ do {
4311+ test = prevIndexCycle (test , rows , cols );
4312+ if (test < start || test > comp ) {
4313+ start += div ;
4314+ continue count ;
4315+ }
4316+ }
4317+ while (test > start && test < comp );
4318+
4319+ count = simultaneousCycleShift (matrix , moved , rows , maxIndex , count , workSize , start , comp );
4320+ start += div ;
4321+ }
4322+ }
4323+ while (count > 0 );
4324+
4325+ // update cycle divisor for the next set of cycles based on prime factors
4326+ for (int ip = 0 ; ip < numPrimes ; ip ++) {
4327+ if (iExponents [ip ] != exponents [ip ]) {
4328+ iExponents [ip ]++;
4329+ div *= primes [ip ];
4330+ continue div ;
4331+ }
4332+ iExponents [ip ] = 0 ;
4333+ div /= powers [ip ];
4334+ }
4335+ }
4336+
4337+ denseBlock .setDims (new int [] {cols , rows });
4338+ in .setNumColumns (rows );
4339+ in .setNumRows (cols );
4340+ }
4341+
4342+ /**
4343+ * Performs a simultaneous cycle shift for a cycle and its companion cycle. This method ensures that distinct cycles
4344+ * or self-dual cycles are handled correctly. This method is based on: Algorithm 2, Karlsson,
4345+ * https://webapps.cs.umu.se/uminf/reports/2009/011/part1.pdf and Algorithm 467, Brenner,
4346+ * https://dl.acm.org/doi/pdf/10.1145/355611.362542.
4347+ *
4348+ * @param matrix The matrix whose elements are being shifted.
4349+ * @param moved Boolean array tracking whether an element has already been moved.
4350+ * @param rows The number of rows in the matrix.
4351+ * @param maxIndex The maximum valid index in the matrix.
4352+ * @param count The number of elements left to process.
4353+ * @param workSize The length of moved.
4354+ * @param start The starting index for the cycle shift.
4355+ * @param comp The corresponding companion index.
4356+ * @return The updated count of elements remaining to shift.
4357+ */
4358+ private static int simultaneousCycleShift (double [] matrix , boolean [] moved , int rows , int maxIndex , int count ,
4359+ int workSize , int start , int comp ) {
4360+
4361+ int orig = start ;
4362+ double val = matrix [orig ];
4363+ double cval = matrix [comp ];
4364+
4365+ while (orig >= 0 ) {
4366+ // decrease the remaining shift count by orig and comp
4367+ count -= 2 ;
4368+ orig = simultaneousCycleShiftStep (matrix , moved , rows , maxIndex , workSize , start , orig , val , cval );
4369+ }
4370+ return count ;
4371+ }
4372+
4373+ private static int simultaneousCycleShiftStep (double [] matrix , boolean [] moved , int rows , int maxIndex ,
4374+ int workSize , int start , int orig , double val , double cval ) {
4375+
4376+ int comp = maxIndex - orig ;
4377+ int prevOrig = prevIndexCycle (orig , rows , (maxIndex + 1 ) / rows );
4378+ int prevComp = maxIndex - prevOrig ;
4379+
4380+ if (orig < workSize )
4381+ moved [orig ] = true ;
4382+ if (comp < workSize )
4383+ moved [comp ] = true ;
4384+
4385+ if (prevOrig == start ) {
4386+ // cycle and comp are distinct
4387+ matrix [orig ] = val ;
4388+ matrix [comp ] = cval ;
4389+ return -1 ;
4390+ }
4391+ else if (prevComp == start ) {
4392+ // cycle is self dual
4393+ matrix [orig ] = cval ;
4394+ matrix [comp ] = val ;
4395+ return -1 ;
4396+ }
4397+
4398+ // shift the values to their next positions
4399+ matrix [orig ] = matrix [prevOrig ];
4400+ matrix [comp ] = matrix [prevComp ];
4401+ // update
4402+ return prevOrig ;
4403+ }
4404+
4405+ private static int prevIndexCycle (int index , int rows , int cols ) {
4406+ int lastIndex = rows * cols - 1 ;
4407+ if (index == lastIndex )
4408+ return lastIndex ;
4409+ long temp = (long ) index * cols ;
4410+ return (int ) (temp % lastIndex );
4411+ }
4412+
4413+ /**
4414+ * Performs prime factorization of a given number n. The method calculates the prime factors of n, their exponents,
4415+ * powers and stores the results in the provided arrays.
4416+ *
4417+ * @param n The number to be factorized.
4418+ * @param primes Array to store the unique prime factors of n.
4419+ * @param exponents Array to store the exponents of the respective prime factors.
4420+ * @param powers Array to store the powers of the respective prime factors.
4421+ * @return The number of unique prime factors.
4422+ */
4423+ private static int primeFactorization (int n , int [] primes , int [] exponents , int [] powers ) {
4424+ int pIdx = -1 ;
4425+ int currDiv = 0 ;
4426+ int rest = n ;
4427+ int div = 2 ;
4428+
4429+ while (rest > 1 ) {
4430+ int quotient = rest / div ;
4431+ if (rest - div * quotient == 0 ) {
4432+ if (div == currDiv ) {
4433+ // current divisor is the same as the last one
4434+ powers [pIdx ] *= div ;
4435+ exponents [pIdx ]++;
4436+ }
4437+ else {
4438+ // new prime factor found
4439+ pIdx ++;
4440+ if (pIdx >= primes .length )
4441+ throw new RuntimeException ("Not enough space, need to expand input arrays." );
4442+
4443+ primes [pIdx ] = div ;
4444+ powers [pIdx ] = div ;
4445+ currDiv = div ;
4446+ exponents [pIdx ] = 1 ;
4447+ }
4448+ rest = quotient ;
4449+ }
4450+ else {
4451+ // only odd divs
4452+ div = (div == 2 ) ? 3 : div + 2 ;
4453+ }
4454+ }
4455+ return pIdx + 1 ;
4456+ }
4457+
4458+ private static int eulerTotient (int [] primes , int [] exponents , int [] iExponents , int numPrimes , int count ) {
4459+ for (int ip = 0 ; ip < numPrimes ; ip ++) {
4460+ if (iExponents [ip ] == exponents [ip ]) {
4461+ continue ;
4462+ }
4463+ count = (count / primes [ip ]) * (primes [ip ] - 1 );
4464+ }
4465+ return count ;
4466+ }
42304467}
0 commit comments