@@ -1368,15 +1368,18 @@ void Analyse::generateParallelLayer(Int threads) {
13681368 // How the layer is found:
13691369 // assignToBins returns the largest number of operations in any thread with
13701370 // a given layer L, called f(L).
1371+ // Subtrees that are too small to be included in the layer, according to a
1372+ // threshold, belong to the set of small subtrees S.
13711373 // The parallelisability ratio of a given layer is measured as
1372- // total_ops / (ops_above + f(L))
1374+ // total_ops / (ops_above + ops_small + f(L))
13731375 // We want this number to be as large as possible. Equivalently, we want the
1374- // score = ops_above + f(L) to be as small as possible.
1376+ // score = ops_above + ops_small + f(L) to be as small as possible.
13751377 // For each node p in the layer, compute the score when the layer is
13761378 // L' = L \ {p} U {children of p}.
13771379 // The improvement to the score brought by this layer compared to the
13781380 // previous one is measured by
1379- // f(L) - f(L') - ops_of_node
1381+ // f(L) - f(L') - S' - ops_of_node
1382+ // where S' is the number of operations newly added to S by the new layer.
13801383 // If this quantity is positive, there is an improvement when choosing L'
13811384 // over L. If there are nodes with positive improvement, take the best one,
13821385 // remove that node from the layer and add its children. If no node brings
@@ -1408,26 +1411,30 @@ void Analyse::generateParallelLayer(Int threads) {
14081411 std::vector<Int>::iterator best_it;
14091412 double best_largest_bin;
14101413
1411- bool any_node_with_large_children = false ;
1414+ bool any_node_with_children = false ;
14121415
14131416 // loop over all nodes in the current layer
14141417 for (auto it = layer.begin (); it != layer.end (); ++it) {
14151418 // build layer obtained adding children
14161419 std::vector<Int> local_layer = layer;
1420+ double local_small{};
14171421 Int child = head[*it];
14181422 while (child != -1 ) {
14191423 if (subtree_ops[child] > small_thresh) {
14201424 local_layer.push_back (child);
1421- any_node_with_large_children = true ;
1425+ } else {
1426+ local_small += subtree_ops[child];
14221427 }
1428+ any_node_with_children = true ;
14231429 child = next[child];
14241430 }
14251431
14261432 // compute largest bin with this new layer
14271433 double largest_bin =
14281434 assignToBins (local_layer, subtree_ops, *it, threads);
14291435
1430- double score = current_largest_bin - largest_bin - sn_ops[*it];
1436+ double score =
1437+ current_largest_bin - largest_bin - sn_ops[*it] - local_small;
14311438
14321439 if (score > best_score) {
14331440 best_score = score;
@@ -1436,15 +1443,17 @@ void Analyse::generateParallelLayer(Int threads) {
14361443 }
14371444
14381445 log_stream << " \t "
1439- << fix (total_ops / (ops_above + sn_ops[*it] + largest_bin),
1446+ << fix (total_ops / (ops_above + sn_ops[*it] + largest_bin +
1447+ ops_small + local_small),
14401448 0 , 2 )
1441- << " (" << sci (score, 0 , 1 ) << " )\n " ;
1449+ << " (" << sci (score, 0 , 1 ) << " ) <== " << integer (*it)
1450+ << " \n " ;
14421451 }
14431452
14441453 log_stream << " Iter " << integer (iter) << " : " ;
14451454
14461455 // no node brings a benefit
1447- if (best_score < 0 || !any_node_with_large_children ) {
1456+ if (best_score < 0 || !any_node_with_children ) {
14481457 log_stream << " fail\n " ;
14491458 break ;
14501459 } else {
@@ -1482,21 +1491,27 @@ void Analyse::generateParallelLayer(Int threads) {
14821491 }
14831492
14841493 log_stream << " ratio "
1485- << fix (total_ops / (ops_above + best_largest_bin), 0 , 2 )
1494+ << fix (total_ops /
1495+ (ops_above + best_largest_bin + ops_small),
1496+ 0 , 2 )
14861497 << " , layer " << integer (layer.size ()) << ' \n ' ;
14871498 }
14881499
14891500 ++iter;
14901501 }
14911502 // layer has been decided
14921503
1504+ double ratio = total_ops / (ops_above + ops_small +
1505+ assignToBins (layer, subtree_ops, -1 , threads));
1506+
14931507 log_stream << " \n Layer " << integer (layer.size ()) << " : " ;
14941508 for (Int i : layer)
14951509 log_stream << fix (subtree_ops[i] / total_ops * 100 , 0 , 1 ) << " " ;
14961510 log_stream << " \n Above " << fix (ops_above / total_ops * 100 , 0 , 1 ) << " % ("
14971511 << integer (aboveLayer_.size ()) << " )\n " ;
14981512 log_stream << " Small " << fix (ops_small / total_ops * 100 , 0 , 1 ) << " % ("
14991513 << integer (smallSubtrees_.size ()) << " )\n " ;
1514+ log_stream << " Parallel ratio " << fix (ratio, 0 , 2 ) << " \n " ;
15001515
15011516 log_->printDevDetailed (log_stream);
15021517
0 commit comments