Skip to content

Commit e84f738

Browse files
Author: rhc54
Merge pull request #2587 from rhc54/topic/oversub
Ensure that we don't bind by default when any node is oversubscribed
2 parents a019095 + 2af677b commit e84f738

File tree

8 files changed: 22 additions, 0 deletions

8 files changed: 22 additions, 0 deletions

orte/mca/rmaps/base/rmaps_base_map_job.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,16 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
327327
return;
328328
}
329329

330+
/* if any node is oversubscribed, then check to see if a binding
331+
* directive was given - if not, then we want to clear the default
332+
* binding policy so we don't attempt to bind */
333+
if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_OVERSUBSCRIBED)) {
334+
if (!OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
335+
/* clear any default binding policy we might have set */
336+
OPAL_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE);
337+
}
338+
}
339+
330340
/* compute and save local ranks */
331341
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
332342
ORTE_ERROR_LOG(rc);

orte/mca/rmaps/mindist/rmaps_mindist_module.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,7 @@ static int mindist_map(orte_job_t *jdata)
230230
goto error;
231231
} else {
232232
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED);
233+
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_OVERSUBSCRIBED);
233234
}
234235
}
235236
}
@@ -349,6 +350,7 @@ static int mindist_map(orte_job_t *jdata)
349350
if (nprocs_mapped == app->num_procs)
350351
break;
351352
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED);
353+
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_OVERSUBSCRIBED);
352354
opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
353355
"mca:rmaps:mindist: second pass assigning %d extra procs to node %s",
354356
(int)num_procs_to_assign, node->name);

orte/mca/rmaps/ppr/rmaps_ppr.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -351,6 +351,7 @@ static int ppr_mapper(orte_job_t *jdata)
351351
* properly set
352352
*/
353353
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED);
354+
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_OVERSUBSCRIBED);
354355
/* check for permission */
355356
if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
356357
/* if we weren't given a directive either way, then we will error out

orte/mca/rmaps/rank_file/rmaps_rank_file.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,7 @@ static int orte_rmaps_rf_map(orte_job_t *jdata)
299299
* properly set
300300
*/
301301
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED);
302+
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_OVERSUBSCRIBED);
302303
}
303304
/* set the vpid */
304305
proc->name.vpid = rank;

orte/mca/rmaps/resilient/rmaps_resilient.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -840,6 +840,7 @@ static int map_to_ftgrps(orte_job_t *jdata)
840840
* properly set
841841
*/
842842
ORTE_FLAG_SET(nd, ORTE_NODE_FLAG_OVERSUBSCRIBED);
843+
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_OVERSUBSCRIBED);
843844
}
844845

845846
/* track number of procs mapped */

orte/mca/rmaps/round_robin/rmaps_rr_mappers.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,7 @@ int orte_rmaps_rr_byslot(orte_job_t *jdata,
184184
* properly set
185185
*/
186186
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED);
187+
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_OVERSUBSCRIBED);
187188
/* check for permission */
188189
if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
189190
/* if we weren't given a directive either way, then we will error out
@@ -368,6 +369,7 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
368369
* properly set
369370
*/
370371
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED);
372+
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_OVERSUBSCRIBED);
371373
/* check for permission */
372374
if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
373375
/* if we weren't given a directive either way, then we will error out
@@ -420,6 +422,7 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
420422
* properly set
421423
*/
422424
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED);
425+
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_OVERSUBSCRIBED);
423426
}
424427
if (nprocs_mapped == app->num_procs) {
425428
/* we are done */
@@ -588,6 +591,7 @@ int orte_rmaps_rr_byobj(orte_job_t *jdata,
588591
* properly set
589592
*/
590593
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED);
594+
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_OVERSUBSCRIBED);
591595
/* check for permission */
592596
if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
593597
/* if we weren't given a directive either way, then we will error out
@@ -745,6 +749,7 @@ static int byobj_span(orte_job_t *jdata,
745749
* properly set
746750
*/
747751
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED);
752+
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_OVERSUBSCRIBED);
748753
}
749754
if (nprocs_mapped == app->num_procs) {
750755
/* we are done */

orte/mca/rmaps/seq/rmaps_seq.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -412,6 +412,7 @@ static int orte_rmaps_seq_map(orte_job_t *jdata)
412412
* properly set
413413
*/
414414
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED);
415+
ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_OVERSUBSCRIBED);
415416
/* check for permission */
416417
if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
417418
/* if we weren't given a directive either way, then we will error out

orte/util/attr.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ typedef uint16_t orte_job_flags_t;
8888
#define ORTE_JOB_FLAG_RECOVERABLE 0x0100 // job is recoverable
8989
#define ORTE_JOB_FLAG_RESTART 0x0200 //
9090
#define ORTE_JOB_FLAG_PROCS_MIGRATING 0x0400 // some procs in job are migrating from one node to another
91+
#define ORTE_JOB_FLAG_OVERSUBSCRIBED 0x0800 // at least one node in the job is oversubscribed
9192

9293

9394
/*** JOB ATTRIBUTE KEYS ***/

0 commit comments

Comments
 (0)