@@ -114,8 +114,8 @@ Int Analyse::getPermutation() {
114114
115115 // set logging of Metis depending on debug level
116116 options[METIS_OPTION_DBGLVL] = 0 ;
117- if (log_->debug (2 ))
118- options[METIS_OPTION_DBGLVL] = METIS_DBG_INFO | METIS_DBG_COARSEN;
117+ if (log_->debug (2 )) options[METIS_OPTION_DBGLVL] |= METIS_DBG_INFO;
118+ if (log_-> debug ( 3 )) options[METIS_OPTION_DBGLVL] |= METIS_DBG_COARSEN;
119119
120120 if (log_) log_->printDevInfo (" Running Metis\n " );
121121 Int status = METIS_NodeND (&n_, temp_ptr.data (), temp_rows.data (), NULL ,
@@ -1295,7 +1295,16 @@ void Analyse::computeStackSize() {
12951295}
12961296
12971297void Analyse::generateParallelLayer (Int threads) {
1298+ // Look for a layer that splits the tree such that there are at least "pieces"
1299+ // subtrees in the layer, with the largest being less than "ratio_thresh"
1300+ // times as expensive as the second largest.
1301+ const Int pieces = 2 ;
1302+ const double ratio_thresh = 4 ;
1303+
12981304 if (threads > 1 ) {
1305+ std::stringstream log_stream;
1306+ log_stream << " Searching parallel layer\n " ;
1307+
12991308 // linked lists of children
13001309 std::vector<Int> head, next;
13011310 childrenLinkedList (sn_parent_, head, next);
@@ -1351,6 +1360,8 @@ void Analyse::generateParallelLayer(Int threads) {
13511360 if (sn_parent_[sn] == -1 ) layer.push_back (sn);
13521361 }
13531362
1363+ double ops_above{};
1364+ double ops_small{};
13541365 Int iter = 0 ;
13551366 while (true ) {
13561367 // sort layer so that nodes with high subtree_ops appear last
@@ -1365,15 +1376,31 @@ void Analyse::generateParallelLayer(Int threads) {
13651376 ratio_first_two = subtree_ops[*layer.rbegin ()] /
13661377 subtree_ops[*std::next (layer.rbegin (), 1 )];
13671378
1368- // printf("iter %d,layer %d, above %d, small %d, ratio %f\n", iter,
1369- // layer.size(), aboveLayer_.size(), smallSubtrees_.size(),
1370- // ratio_first_two);
1379+ // log layer info
1380+ log_stream << " iter " << iter << " : "
1381+ << " L " << layer.size () << " , A " << aboveLayer_.size () << " ("
1382+ << fix (ops_above / total_ops * 100 , 0 , 1 ) << " %), S "
1383+ << smallSubtrees_.size () << " ("
1384+ << fix (ops_small / total_ops * 100 , 0 , 1 ) << " %), ratio "
1385+ << (layer.size () > 1 ? fix (ratio_first_two, 0 , 1 ) : " -" )
1386+ << " \n " ;
1387+ for (Int i : layer) {
1388+ log_stream << " " << fix (sn_ops[i] / total_ops * 100 , 0 , 1 ) << " ("
1389+ << fix (subtree_ops[i] / total_ops * 100 , 0 , 1 ) << " )" ;
1390+ }
1391+ log_stream << " \n " ;
13711392
13721393 // if there are enough subtrees and they are somewhat balanced, stop
1373- if (layer.size () >= threads && ratio_first_two < 2 ) break ;
1394+ if (layer.size () >= pieces && ratio_first_two < ratio_thresh) {
1395+ log_stream << " Accept layer\n " ;
1396+ break ;
1397+ }
13741398
13751399 // don't allow too many iterations
1376- if (iter > sn_count_ / 10 ) break ;
1400+ if (iter > sn_count_ / 10 ) {
1401+ log_stream << " Too many iterations\n " ;
1402+ break ;
1403+ }
13771404
13781405 // find most expensive node in layer which has children
13791406 Int node_to_remove = -1 ;
@@ -1385,13 +1412,17 @@ void Analyse::generateParallelLayer(Int threads) {
13851412 break ;
13861413 }
13871414 }
1388- if (node_to_remove == -1 ) break ;
1415+ if (node_to_remove == -1 ) {
1416+ log_stream << " No candidate left\n " ;
1417+ break ;
1418+ }
13891419
13901420 // remove node from layer
13911421 auto it = layer.begin ();
13921422 std::advance (it, index_to_remove);
13931423 layer.erase (it);
13941424 aboveLayer_.insert (node_to_remove);
1425+ ops_above += sn_ops[node_to_remove];
13951426
13961427 // find child with most operations
13971428 Int child_most_ops = -1 ;
@@ -1405,7 +1436,7 @@ void Analyse::generateParallelLayer(Int threads) {
14051436 child = next[child];
14061437 }
14071438
1408- const double small_subtree_thresh = 0.001 ;
1439+ const double small_subtree_thresh = 0.01 ;
14091440
14101441 // If child with most operations is large enough, ignore.
14111442 // Otherwise, force at least this child to be added to layer.
@@ -1424,6 +1455,7 @@ void Analyse::generateParallelLayer(Int threads) {
14241455 layer.push_back (child);
14251456 } else {
14261457 smallSubtrees_.insert (child);
1458+ ops_small += subtree_ops[child];
14271459 }
14281460 child = next[child];
14291461 }
@@ -1439,6 +1471,8 @@ void Analyse::generateParallelLayer(Int threads) {
14391471 layerIndex_.insert ({*it, index});
14401472 ++index;
14411473 }
1474+
1475+ log_->printDevDetailed (log_stream);
14421476 }
14431477
14441478 // Compute the size of the stack needed to process the tree in serial. This is
0 commit comments