Skip to content

Commit bf2bec1

Browse files
committed
New approach to generate parallel layer
1 parent 29eca3a commit bf2bec1

File tree

1 file changed

+131
-96
lines changed

1 file changed

+131
-96
lines changed

highs/ipm/hipo/factorhighs/Analyse.cpp

Lines changed: 131 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -1294,12 +1294,34 @@ void Analyse::computeStackSize() {
12941294
}
12951295
}
12961296

1297+
double assignToBins(std::vector<Int>& layer, std::vector<double>& ops,
1298+
Int node_to_ignore, Int n_bins) {
1299+
// Sort the layer in decreasing order of ops; allocate nodes in bins; return
1300+
// the ops in the largest bin.
1301+
1302+
std::sort(layer.begin(), layer.end(),
1303+
[&](Int a, Int b) { return ops[a] > ops[b]; });
1304+
1305+
std::vector<double> bins(n_bins, 0.0);
1306+
for (auto it = layer.begin(); it != layer.end(); ++it) {
1307+
if (*it == node_to_ignore) continue;
1308+
auto it_least_load = std::min_element(bins.begin(), bins.end());
1309+
*it_least_load += ops[*it];
1310+
}
1311+
1312+
return *std::max_element(bins.begin(), bins.end());
1313+
}
1314+
12971315
void Analyse::generateParallelLayer(Int threads) {
1298-
// Look for a layer that splits the tree such that there are at least "pieces"
1299-
// subtrees in the layer, with the largest being no more than "ratio_thresh"
1300-
// times more expensive than the second largest.
1301-
const Int pieces = 2;
1302-
const double ratio_thresh = 4;
1316+
// Find "optimal" splitting of the tree.
1317+
// This function finds a set of nodes (layer), such that each subtree starting
1318+
// from a node in the layer will be executed in parallel. Any node left above
1319+
// the layer is executed in serial. Subtrees that are too small to have their
1320+
// own parallel task are added to the set of small subtrees, which are also
1321+
// executed in serial.
1322+
1323+
// percentage of total ops below which a subtree is considered small
1324+
const double small_thresh_coeff = 0.01;
13031325

13041326
if (threads > 1) {
13051327
std::stringstream log_stream;
@@ -1343,13 +1365,22 @@ void Analyse::generateParallelLayer(Int threads) {
13431365
}
13441366
}
13451367

1346-
// Generate a layer that cuts the tree in two.
1347-
// Subtrees in the layer are processed in parallel.
1348-
// The remaining nodes are processed in serial at the end.
1349-
// Keep track of:
1350-
// - subtrees in the layer
1351-
// - nodes removed from layer that end up above
1352-
// - subtrees not added because too small
1368+
// How the layer is found:
1369+
// assignToBins returns the largest number of operations in any thread with
1370+
// a given layer L, called f(L).
1371+
// The parallelisability ratio of a given layer is measured as
1372+
// total_ops / (ops_above + f(L))
1373+
// We want this number to be as large as possible. Equivalently, we want the
1374+
// score = ops_above + f(L) to be as small as possible.
1375+
// For each node p in the layer, compute the score when the layer is
1376+
// L' = L \ {p} U {children of p}.
1377+
// The improvement to the score brought by this layer compared to the
1378+
// previous one is measured by
1379+
// f(L) - f(L') - ops_of_node
1380+
// If this quantity is positive, there is an improvement when choosing L'
1381+
// over L. If there are nodes with positive improvement, take the best one,
1382+
// remove that node from the layer and add its children. If no node brings
1383+
// an improvement, then stop.
13531384

13541385
std::vector<Int> layer;
13551386
aboveLayer_.clear();
@@ -1362,117 +1393,120 @@ void Analyse::generateParallelLayer(Int threads) {
13621393

13631394
double ops_above{};
13641395
double ops_small{};
1396+
1397+
const double small_thresh = small_thresh_coeff * total_ops;
1398+
13651399
Int iter = 0;
13661400
while (true) {
1367-
// sort layer so that nodes with high subtree_ops appear last
1368-
std::sort(layer.begin(), layer.end(),
1369-
[&](Int a, Int b) { return subtree_ops[a] < subtree_ops[b]; });
1370-
1371-
// Ratio of most expensive subtree in the layer and second most expensive
1372-
// one. If this ratio is not too large, then the layer gives good
1373-
// parallelism at least on 2 threads.
1374-
double ratio_first_two = 10;
1375-
if (layer.size() > 1)
1376-
ratio_first_two = subtree_ops[*layer.rbegin()] /
1377-
subtree_ops[*std::next(layer.rbegin(), 1)];
1378-
1379-
// log layer info
1380-
log_stream << " iter " << iter << ": "
1381-
<< "L " << layer.size() << ", A " << aboveLayer_.size() << "("
1382-
<< fix(ops_above / total_ops * 100, 0, 1) << "%), S "
1383-
<< smallSubtrees_.size() << "("
1384-
<< fix(ops_small / total_ops * 100, 0, 1) << "%), ratio "
1385-
<< (layer.size() > 1 ? fix(ratio_first_two, 0, 1) : "-")
1386-
<< "\n ";
1387-
for (Int i : layer) {
1388-
log_stream << " " << fix(sn_ops[i] / total_ops * 100, 0, 1) << "("
1389-
<< fix(subtree_ops[i] / total_ops * 100, 0, 1) << ")";
1390-
}
1401+
// choose to remove node which produces the greatest benefit
13911402
log_stream << "\n";
13921403

1393-
// if there are enough subtrees and they are somewhat balanced, stop
1394-
if (layer.size() >= pieces && ratio_first_two < ratio_thresh) {
1395-
log_stream << " Accept layer\n";
1396-
break;
1397-
}
1404+
double current_largest_bin =
1405+
assignToBins(layer, subtree_ops, -1, threads);
13981406

1399-
// don't allow too many iterations
1400-
if (iter > sn_count_ / 10) {
1401-
log_stream << " Too many iterations\n";
1402-
break;
1403-
}
1407+
double best_score = -kHighsInf;
1408+
std::vector<Int>::iterator best_it;
1409+
double best_largest_bin;
14041410

1405-
// find most expensive node in layer which have children
1406-
Int node_to_remove = -1;
1407-
Int index_to_remove = -1;
1408-
for (Int i = layer.size() - 1; i >= 0; --i) {
1409-
if (head[layer[i]] != -1) {
1410-
index_to_remove = i;
1411-
node_to_remove = layer[i];
1412-
break;
1411+
bool any_node_with_large_children = false;
1412+
1413+
// loop over all nodes in the current layer
1414+
for (auto it = layer.begin(); it != layer.end(); ++it) {
1415+
// build layer obtained adding children
1416+
std::vector<Int> local_layer = layer;
1417+
Int child = head[*it];
1418+
while (child != -1) {
1419+
if (subtree_ops[child] > small_thresh) {
1420+
local_layer.push_back(child);
1421+
any_node_with_large_children = true;
1422+
}
1423+
child = next[child];
14131424
}
1414-
}
1415-
if (node_to_remove == -1) {
1416-
log_stream << " No candidate left\n";
1417-
break;
1418-
}
14191425

1420-
// remove node from layer
1421-
auto it = layer.begin();
1422-
std::advance(it, index_to_remove);
1423-
layer.erase(it);
1424-
aboveLayer_.insert(node_to_remove);
1425-
ops_above += sn_ops[node_to_remove];
1426-
1427-
// find child with most operations
1428-
Int child_most_ops = -1;
1429-
double ops_child_most_ops = -1;
1430-
Int child = head[node_to_remove];
1431-
while (child != -1) {
1432-
if (subtree_ops[child] > ops_child_most_ops) {
1433-
ops_child_most_ops = subtree_ops[child];
1434-
child_most_ops = child;
1426+
// compute largest bin with this new layer
1427+
double largest_bin =
1428+
assignToBins(local_layer, subtree_ops, *it, threads);
1429+
1430+
double score = current_largest_bin - largest_bin - sn_ops[*it];
1431+
1432+
if (score > best_score) {
1433+
best_score = score;
1434+
best_it = it;
1435+
best_largest_bin = largest_bin;
14351436
}
1436-
child = next[child];
1437+
1438+
log_stream << "\t"
1439+
<< fix(total_ops / (ops_above + sn_ops[*it] + largest_bin),
1440+
0, 2)
1441+
<< " (" << sci(score, 0, 1) << ")\n";
14371442
}
14381443

1439-
const double small_subtree_thresh = 0.01;
1444+
log_stream << "Iter " << integer(iter) << ": ";
14401445

1441-
// If child with most operations is large enough, ignore.
1442-
// Otherwise, force at least this child to be added to layer.
1443-
// This guarantees that the layer does not shrink.
1444-
if (ops_child_most_ops > total_ops * small_subtree_thresh) {
1445-
child_most_ops = -1;
1446-
}
1446+
// no node brings a benefit
1447+
if (best_score < 0 || !any_node_with_large_children) {
1448+
log_stream << "fail\n";
1449+
break;
1450+
} else {
1451+
Int node_to_erase = *best_it;
1452+
layer.erase(best_it);
1453+
aboveLayer_.insert(node_to_erase);
1454+
ops_above += sn_ops[node_to_erase];
1455+
1456+
// find child with most operations
1457+
Int child = head[node_to_erase];
1458+
double largest_ops{};
1459+
double child_largest = -1;
1460+
while (child != -1) {
1461+
if (subtree_ops[child] > largest_ops) {
1462+
largest_ops = subtree_ops[child];
1463+
child_largest = child;
1464+
}
1465+
child = next[child];
1466+
}
14471467

1448-
child = head[node_to_remove];
1449-
while (child != -1) {
1450-
// Add child if it is large enough
1451-
// (don't want to produce parallel tasks for tiny subtrees).
1452-
// Otherwise, keep track of small subtrees to be processed in serial.
1453-
if (subtree_ops[child] > total_ops * small_subtree_thresh ||
1454-
child == child_most_ops) {
1455-
layer.push_back(child);
1456-
} else {
1457-
smallSubtrees_.insert(child);
1458-
ops_small += subtree_ops[child];
1468+
// If child with most operations is large enough, ignore.
1469+
// Otherwise, force at least this child to be added to layer.
1470+
// This guarantees that the layer does not shrink.
1471+
if (largest_ops > small_thresh) child_largest = -1;
1472+
1473+
child = head[node_to_erase];
1474+
while (child != -1) {
1475+
if (subtree_ops[child] > small_thresh || child == child_largest)
1476+
layer.push_back(child);
1477+
else {
1478+
smallSubtrees_.insert(child);
1479+
ops_small += subtree_ops[child];
1480+
}
1481+
child = next[child];
14591482
}
1460-
child = next[child];
1483+
1484+
log_stream << "ratio "
1485+
<< fix(total_ops / (ops_above + best_largest_bin), 0, 2)
1486+
<< ", layer " << integer(layer.size()) << '\n';
14611487
}
14621488

14631489
++iter;
14641490
}
14651491
// layer has been decided
14661492

1493+
log_stream << "\nLayer " << integer(layer.size()) << ": ";
1494+
for (Int i : layer)
1495+
log_stream << fix(subtree_ops[i] / total_ops * 100, 0, 1) << " ";
1496+
log_stream << "\nAbove " << fix(ops_above / total_ops * 100, 0, 1) << "% ("
1497+
<< integer(aboveLayer_.size()) << ")\n";
1498+
log_stream << "Small " << fix(ops_small / total_ops * 100, 0, 1) << "% ("
1499+
<< integer(smallSubtrees_.size()) << ")\n";
1500+
1501+
log_->printDevDetailed(log_stream);
1502+
14671503
// layerIndex stores pairs {i,j} indicating that node i is the j-th subtree
14681504
// in the layer.
14691505
Int index = 0;
14701506
for (auto it = layer.begin(); it != layer.end(); ++it) {
14711507
layerIndex_.insert({*it, index});
14721508
++index;
14731509
}
1474-
1475-
log_->printDevDetailed(log_stream);
14761510
}
14771511

14781512
// Compute the size of the stack needed to process the tree in serial. This is
@@ -1517,6 +1551,7 @@ void Analyse::generateParallelLayer(Int threads) {
15171551
layerSubtreesInfo_[index].stack = stack_subtree_parallel_[node];
15181552
}
15191553

1554+
// generate info about small subtrees
15201555
smallSubtreesInfo_.resize(smallSubtrees_.size());
15211556
Int index = 0;
15221557
for (auto it = smallSubtrees_.begin(); it != smallSubtrees_.end(); ++it) {

0 commit comments

Comments
 (0)