@@ -1294,12 +1294,34 @@ void Analyse::computeStackSize() {
12941294 }
12951295}
12961296
1297+ double assignToBins (std::vector<Int>& layer, std::vector<double >& ops,
1298+ Int node_to_ignore, Int n_bins) {
1299+ // Sort the layer in decreasing order of ops; allocate nodes in bins; return
1300+ // the ops in the largest bin.
1301+
1302+ std::sort (layer.begin (), layer.end (),
1303+ [&](Int a, Int b) { return ops[a] > ops[b]; });
1304+
1305+ std::vector<double > bins (n_bins, 0.0 );
1306+ for (auto it = layer.begin (); it != layer.end (); ++it) {
1307+ if (*it == node_to_ignore) continue ;
1308+ auto it_least_load = std::min_element (bins.begin (), bins.end ());
1309+ *it_least_load += ops[*it];
1310+ }
1311+
1312+ return *std::max_element (bins.begin (), bins.end ());
1313+ }
1314+
12971315void Analyse::generateParallelLayer (Int threads) {
1298- // Look for a layer that splits the tree such that there are at least "pieces"
1299- // subtrees in the layer, with the largest being no more than "ratio_thresh"
1300- // times more expensive than the second largest.
1301- const Int pieces = 2 ;
1302- const double ratio_thresh = 4 ;
1316+ // Find "optimal" splitting of the tree.
1317+ // This function finds a set of nodes (layer), such that each subtree starting
1318+ // from a node in the layer will be executed in parallel. Any node left above
1319+ // the layer is executed in serial. Subtrees that are too small to have their
1320+ // own parallel task are added to the set of small subtrees, which are also
1321+ // executed in serial.
1322+
1323+ // fraction of total ops below which a subtree is considered small
1324+ const double small_thresh_coeff = 0.01 ;
13031325
13041326 if (threads > 1 ) {
13051327 std::stringstream log_stream;
@@ -1343,13 +1365,22 @@ void Analyse::generateParallelLayer(Int threads) {
13431365 }
13441366 }
13451367
1346- // Generate a layer that cuts the tree in two.
1347- // Subtrees in the layer are processed in parallel.
1348- // The remaining nodes are processed in serial at the end.
1349- // Keep track of:
1350- // - subtrees in the layer
1351- // - nodes removed from layer that end up above
1352- // - subtrees not added because too small
1368+ // How the layer is found:
1369+ // assignToBins returns the largest number of operations in any thread with
1370+ // a given layer L, called f(L).
1371+ // The parallelisability ratio of a given layer is measured as
1372+ // total_ops / (ops_above + f(L))
1373+ // We want this number to be as large as possible. Equivalently, we want the
1374+ // score = ops_above + f(L) to be as small as possible.
1375+ // For each node p in the layer, compute the score when the layer is
1376+ // L' = L \ {p} U {children of p}.
1377+ // The improvement to the score brought by this layer compared to the
1378+ // previous one is measured by
1379+ // f(L) - f(L') - ops_of_node
1380+ // If this quantity is positive, there is an improvement when choosing L'
1381+ // over L. If there are nodes with positive improvement, take the best one,
1382+ // remove that node from the layer and add its children. If no node brings
1383+ // an improvement, then stop.
13531384
13541385 std::vector<Int> layer;
13551386 aboveLayer_.clear ();
@@ -1362,117 +1393,120 @@ void Analyse::generateParallelLayer(Int threads) {
13621393
13631394 double ops_above{};
13641395 double ops_small{};
1396+
1397+ const double small_thresh = small_thresh_coeff * total_ops;
1398+
13651399 Int iter = 0 ;
13661400 while (true ) {
1367- // sort layer so that nodes with high subtree_ops appear last
1368- std::sort (layer.begin (), layer.end (),
1369- [&](Int a, Int b) { return subtree_ops[a] < subtree_ops[b]; });
1370-
1371- // Ratio of most expensive subtree in the layer and second most expensive
1372- // one. If this ratio is not too large, then the layer gives good
1373- // parallelism at least on 2 threads.
1374- double ratio_first_two = 10 ;
1375- if (layer.size () > 1 )
1376- ratio_first_two = subtree_ops[*layer.rbegin ()] /
1377- subtree_ops[*std::next (layer.rbegin (), 1 )];
1378-
1379- // log layer info
1380- log_stream << " iter " << iter << " : "
1381- << " L " << layer.size () << " , A " << aboveLayer_.size () << " ("
1382- << fix (ops_above / total_ops * 100 , 0 , 1 ) << " %), S "
1383- << smallSubtrees_.size () << " ("
1384- << fix (ops_small / total_ops * 100 , 0 , 1 ) << " %), ratio "
1385- << (layer.size () > 1 ? fix (ratio_first_two, 0 , 1 ) : " -" )
1386- << " \n " ;
1387- for (Int i : layer) {
1388- log_stream << " " << fix (sn_ops[i] / total_ops * 100 , 0 , 1 ) << " ("
1389- << fix (subtree_ops[i] / total_ops * 100 , 0 , 1 ) << " )" ;
1390- }
1401+ // choose to remove node which produces the greatest benefit
13911402 log_stream << " \n " ;
13921403
1393- // if there are enough subtrees and they are somewhat balanced, stop
1394- if (layer.size () >= pieces && ratio_first_two < ratio_thresh) {
1395- log_stream << " Accept layer\n " ;
1396- break ;
1397- }
1404+ double current_largest_bin =
1405+ assignToBins (layer, subtree_ops, -1 , threads);
13981406
1399- // don't allow too many iterations
1400- if (iter > sn_count_ / 10 ) {
1401- log_stream << " Too many iterations\n " ;
1402- break ;
1403- }
1407+ double best_score = -kHighsInf ;
1408+ std::vector<Int>::iterator best_it;
1409+ double best_largest_bin;
14041410
1405- // find most expensive node in layer which have children
1406- Int node_to_remove = -1 ;
1407- Int index_to_remove = -1 ;
1408- for (Int i = layer.size () - 1 ; i >= 0 ; --i) {
1409- if (head[layer[i]] != -1 ) {
1410- index_to_remove = i;
1411- node_to_remove = layer[i];
1412- break ;
1411+ bool any_node_with_large_children = false ;
1412+
1413+ // loop over all nodes in the current layer
1414+ for (auto it = layer.begin (); it != layer.end (); ++it) {
1415+ // build layer obtained adding children
1416+ std::vector<Int> local_layer = layer;
1417+ Int child = head[*it];
1418+ while (child != -1 ) {
1419+ if (subtree_ops[child] > small_thresh) {
1420+ local_layer.push_back (child);
1421+ any_node_with_large_children = true ;
1422+ }
1423+ child = next[child];
14131424 }
1414- }
1415- if (node_to_remove == -1 ) {
1416- log_stream << " No candidate left\n " ;
1417- break ;
1418- }
14191425
1420- // remove node from layer
1421- auto it = layer.begin ();
1422- std::advance (it, index_to_remove);
1423- layer.erase (it);
1424- aboveLayer_.insert (node_to_remove);
1425- ops_above += sn_ops[node_to_remove];
1426-
1427- // find child with most operations
1428- Int child_most_ops = -1 ;
1429- double ops_child_most_ops = -1 ;
1430- Int child = head[node_to_remove];
1431- while (child != -1 ) {
1432- if (subtree_ops[child] > ops_child_most_ops) {
1433- ops_child_most_ops = subtree_ops[child];
1434- child_most_ops = child;
1426+ // compute largest bin with this new layer
1427+ double largest_bin =
1428+ assignToBins (local_layer, subtree_ops, *it, threads);
1429+
1430+ double score = current_largest_bin - largest_bin - sn_ops[*it];
1431+
1432+ if (score > best_score) {
1433+ best_score = score;
1434+ best_it = it;
1435+ best_largest_bin = largest_bin;
14351436 }
1436- child = next[child];
1437+
1438+ log_stream << " \t "
1439+ << fix (total_ops / (ops_above + sn_ops[*it] + largest_bin),
1440+ 0 , 2 )
1441+ << " (" << sci (score, 0 , 1 ) << " )\n " ;
14371442 }
14381443
1439- const double small_subtree_thresh = 0.01 ;
1444+ log_stream << " Iter " << integer (iter) << " : " ;
14401445
1441- // If child with most operations is large enough, ignore.
1442- // Otherwise, force at least this child to be added to layer.
1443- // This guarantees that the layer does not shrink.
1444- if (ops_child_most_ops > total_ops * small_subtree_thresh) {
1445- child_most_ops = -1 ;
1446- }
1446+ // no node brings a benefit
1447+ if (best_score < 0 || !any_node_with_large_children) {
1448+ log_stream << " fail\n " ;
1449+ break ;
1450+ } else {
1451+ Int node_to_erase = *best_it;
1452+ layer.erase (best_it);
1453+ aboveLayer_.insert (node_to_erase);
1454+ ops_above += sn_ops[node_to_erase];
1455+
1456+ // find child with most operations
1457+ Int child = head[node_to_erase];
1458+ double largest_ops{};
1459+ double child_largest = -1 ;
1460+ while (child != -1 ) {
1461+ if (subtree_ops[child] > largest_ops) {
1462+ largest_ops = subtree_ops[child];
1463+ child_largest = child;
1464+ }
1465+ child = next[child];
1466+ }
14471467
1448- child = head[node_to_remove];
1449- while (child != -1 ) {
1450- // Add child if it is large enough
1451- // (don't want to produce parallel tasks for tiny subtrees).
1452- // Otherwise, keep track of small subtrees to be processed in serial.
1453- if (subtree_ops[child] > total_ops * small_subtree_thresh ||
1454- child == child_most_ops) {
1455- layer.push_back (child);
1456- } else {
1457- smallSubtrees_.insert (child);
1458- ops_small += subtree_ops[child];
1468+ // If child with most operations is large enough, ignore.
1469+ // Otherwise, force at least this child to be added to layer.
1470+ // This guarantees that the layer does not shrink.
1471+ if (largest_ops > small_thresh) child_largest = -1 ;
1472+
1473+ child = head[node_to_erase];
1474+ while (child != -1 ) {
1475+ if (subtree_ops[child] > small_thresh || child == child_largest)
1476+ layer.push_back (child);
1477+ else {
1478+ smallSubtrees_.insert (child);
1479+ ops_small += subtree_ops[child];
1480+ }
1481+ child = next[child];
14591482 }
1460- child = next[child];
1483+
1484+ log_stream << " ratio "
1485+ << fix (total_ops / (ops_above + best_largest_bin), 0 , 2 )
1486+ << " , layer " << integer (layer.size ()) << ' \n ' ;
14611487 }
14621488
14631489 ++iter;
14641490 }
14651491 // layer has been decided
14661492
1493+ log_stream << " \n Layer " << integer (layer.size ()) << " : " ;
1494+ for (Int i : layer)
1495+ log_stream << fix (subtree_ops[i] / total_ops * 100 , 0 , 1 ) << " " ;
1496+ log_stream << " \n Above " << fix (ops_above / total_ops * 100 , 0 , 1 ) << " % ("
1497+ << integer (aboveLayer_.size ()) << " )\n " ;
1498+ log_stream << " Small " << fix (ops_small / total_ops * 100 , 0 , 1 ) << " % ("
1499+ << integer (smallSubtrees_.size ()) << " )\n " ;
1500+
1501+ log_->printDevDetailed (log_stream);
1502+
14671503 // layerIndex stores pairs {i,j} indicating that node i is the j-th subtree
14681504 // in the layer.
14691505 Int index = 0 ;
14701506 for (auto it = layer.begin (); it != layer.end (); ++it) {
14711507 layerIndex_.insert ({*it, index});
14721508 ++index;
14731509 }
1474-
1475- log_->printDevDetailed (log_stream);
14761510 }
14771511
14781512 // Compute the size of the stack needed to process the tree in serial. This is
@@ -1517,6 +1551,7 @@ void Analyse::generateParallelLayer(Int threads) {
15171551 layerSubtreesInfo_[index].stack = stack_subtree_parallel_[node];
15181552 }
15191553
1554+ // generate info about small subtrees
15201555 smallSubtreesInfo_.resize (smallSubtrees_.size ());
15211556 Int index = 0 ;
15221557 for (auto it = smallSubtrees_.begin (); it != smallSubtrees_.end (); ++it) {
0 commit comments