Skip to content

Commit f92acd7

Browse files
author
Ralph Castain
authored
Merge pull request #4965 from rhc54/topic/rank
Fix breakage in ranking system and silence OSC/RDMA warnings
2 parents 1c75aa8 + d644f7e commit f92acd7

File tree

12 files changed

+185
-109
lines changed

12 files changed

+185
-109
lines changed

NEWS

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ Copyright (c) 2012 Oak Ridge National Labs. All rights reserved.
1919
Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
2020
Copyright (c) 2012 University of Houston. All rights reserved.
2121
Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
22-
Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
22+
Copyright (c) 2013-2018 Intel, Inc. All rights reserved.
2323
Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights
2424
reserved.
2525
$COPYRIGHT$
@@ -71,6 +71,7 @@ Master (not on release branches yet)
7171
- Remove IB XRC support from the OpenIB BTL due to lack of support.
7272
- Remove support for big endian PowerPC.
7373
- Remove support for XL compilers older than v13.1
74+
- Fix rank-by algorithms to properly rank by object and span
7475

7576
3.0.0 -- September, 2017
7677
------------------------

ompi/mca/osc/rdma/osc_rdma.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* reserved.
1313
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
1414
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
15-
* Copyright (c) 2016 Intel, Inc. All rights reserved.
15+
* Copyright (c) 2016-2018 Intel, Inc. All rights reserved.
1616
* $COPYRIGHT$
1717
*
1818
* Additional copyrights may follow
@@ -568,7 +568,7 @@ static inline void ompi_osc_rdma_sync_rdma_complete (ompi_osc_rdma_sync_t *sync)
568568

569569
OPAL_THREAD_SCOPED_LOCK(&sync->lock,
570570
OPAL_LIST_FOREACH_SAFE(aggregation, next, &sync->aggregations, ompi_osc_rdma_aggregation_t) {
571-
fprintf (stderr, "Flushing aggregation %p, peeer %p\n", aggregation, aggregation->peer);
571+
fprintf (stderr, "Flushing aggregation %p, peer %p\n", (void*)aggregation, (void*)aggregation->peer);
572572
ompi_osc_rdma_peer_aggregate_flush (aggregation->peer);
573573
});
574574
}

ompi/mca/osc/rdma/osc_rdma_accumulate.c

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
* reserved.
55
* Copyright (c) 2016-2017 Research Organization for Information Science
66
* and Technology (RIST). All rights reserved.
7-
* Copyright (c) 2016 Intel, Inc. All rights reserved.
7+
* Copyright (c) 2016-2018 Intel, Inc. All rights reserved.
88
* $COPYRIGHT$
99
*
1010
* Additional copyrights may follow
@@ -50,6 +50,7 @@ struct ompi_osc_rdma_event_t {
5050

5151
typedef struct ompi_osc_rdma_event_t ompi_osc_rdma_event_t;
5252

53+
#if 0
5354
static void *ompi_osc_rdma_event_put (int fd, int flags, void *context)
5455
{
5556
ompi_osc_rdma_event_t *event = (ompi_osc_rdma_event_t *) context;
@@ -112,7 +113,7 @@ static int ompi_osc_rdma_event_queue (ompi_osc_rdma_module_t *module, struct mca
112113

113114
return OMPI_SUCCESS;
114115
}
115-
116+
#endif
116117

117118
static int ompi_osc_rdma_gacc_local (const void *source_buffer, int source_count, ompi_datatype_t *source_datatype,
118119
void *result_buffer, int result_count, ompi_datatype_t *result_datatype,
@@ -188,10 +189,7 @@ static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t *sync, const v
188189
ompi_datatype_t *target_datatype, ompi_op_t *op, ompi_osc_rdma_request_t *request)
189190
{
190191
ompi_osc_rdma_module_t *module = sync->module;
191-
const size_t btl_alignment_mask = ALIGNMENT_MASK(module->selected_btl->btl_get_alignment);
192192
unsigned long len = target_count * target_datatype->super.size;
193-
ompi_osc_rdma_frag_t *frag = NULL;
194-
volatile bool complete = false;
195193
char *ptr = NULL;
196194
int ret;
197195

@@ -523,7 +521,7 @@ static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const
523521
ompi_osc_rdma_module_t *module = sync->module;
524522
int32_t atomic_flags = module->selected_btl->btl_atomic_flags;
525523
int ret, btl_op, flags;
526-
int64_t origin, result;
524+
int64_t origin;
527525

528526
if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags) && 4 == extent)) ||
529527
(!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & atomic_flags)) ||
@@ -590,13 +588,13 @@ static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t *sync, const voi
590588
new_value = old_value;
591589

592590
if (&ompi_mpi_op_replace.op == op) {
593-
memcpy ((void *)((intptr_t) &new_value) + offset, origin_addr, extent);
591+
memcpy ((void *)((intptr_t) &new_value + offset), origin_addr, extent);
594592
} else if (&ompi_mpi_op_no_op.op != op) {
595-
ompi_op_reduce (op, (void *) origin_addr, (void *)((intptr_t) &new_value) + offset, 1, dt);
593+
ompi_op_reduce (op, (void *) origin_addr, (void*)((intptr_t) &new_value + offset), 1, dt);
596594
}
597595

598596
ret = ompi_osc_rdma_btl_cswap (module, peer->data_endpoint, address, target_handle,
599-
old_value, new_value, 0, &new_value);
597+
old_value, new_value, 0, (int64_t*)&new_value);
600598
if (OPAL_SUCCESS != ret || new_value == old_value) {
601599
break;
602600
}
@@ -605,7 +603,7 @@ static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t *sync, const voi
605603
} while (1);
606604

607605
if (result_addr) {
608-
memcpy (result_addr, (void *)((intptr_t) &new_value) + offset, extent);
606+
memcpy (result_addr, (void *)((intptr_t) &new_value + offset), extent);
609607
}
610608

611609
if (OPAL_SUCCESS == ret) {
@@ -696,11 +694,9 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr,
696694
mca_btl_base_registration_handle_t *target_handle, bool lock_acquired)
697695
{
698696
ompi_osc_rdma_module_t *module = sync->module;
699-
const size_t btl_alignment_mask = ALIGNMENT_MASK(module->selected_btl->btl_get_alignment);
700-
unsigned long offset, aligned_len, len = datatype->super.size;
697+
unsigned long len = datatype->super.size;
701698
mca_btl_base_registration_handle_t *local_handle = NULL;
702699
ompi_osc_rdma_frag_t *frag = NULL;
703-
ompi_osc_rdma_request_t *request;
704700
volatile bool complete = false;
705701
/* drop the const. this code will not attempt to change the value */
706702
char *ptr = (char *) source_addr;

ompi/mca/osc/rdma/osc_rdma_active_target.c

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
* Copyright (c) 2017 The University of Tennessee and The University
1717
* of Tennessee Research Foundation. All rights
1818
* reserved.
19-
* Copyright (c) 2017 Intel, Inc. All rights reserved.
19+
* Copyright (c) 2017-2018 Intel, Inc. All rights reserved.
2020
* $COPYRIGHT$
2121
*
2222
* Additional copyrights may follow
@@ -80,7 +80,7 @@ void ompi_osc_rdma_atomic_complete (mca_btl_base_module_t *btl, struct mca_btl_b
8080
{
8181
ompi_osc_rdma_pending_op_t *pending_op = (ompi_osc_rdma_pending_op_t *) context;
8282

83-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "pending atomic %p complete with status %d", pending_op, status);
83+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "pending atomic %p complete with status %d", (void*)pending_op, status);
8484

8585
if (pending_op->op_result) {
8686
memmove (pending_op->op_result, pending_op->op_buffer, pending_op->op_size);
@@ -296,7 +296,6 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win)
296296
{
297297
ompi_osc_rdma_module_t *module = GET_MODULE(win);
298298
ompi_osc_rdma_peer_t **peers;
299-
int my_rank = ompi_comm_rank (module->comm);
300299
ompi_osc_rdma_state_t *state = module->state;
301300
int ret = OMPI_SUCCESS;
302301

ompi/mca/osc/rdma/osc_rdma_comm.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
/*
33
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
44
* reserved.
5-
* Copyright (c) 2016 Intel, Inc. All rights reserved.
5+
* Copyright (c) 2016-2018 Intel, Inc. All rights reserved.
66
* Copyright (c) 2017 Research Organization for Information Science
77
* and Technology (RIST). All rights reserved.
88
* Copyright (c) 2017 IBM Corporation. All rights reserved.
@@ -492,6 +492,7 @@ static int ompi_osc_rdma_put_real (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_pee
492492
return ret;
493493
}
494494

495+
#if 0
495496
static void ompi_osc_rdma_aggregate_append (ompi_osc_rdma_aggregation_t *aggregation, ompi_osc_rdma_request_t *request,
496497
void *source_buffer, size_t size)
497498
{
@@ -550,13 +551,16 @@ static int ompi_osc_rdma_aggregate_alloc (ompi_osc_rdma_sync_t *sync, ompi_osc_r
550551

551552
return OMPI_SUCCESS;
552553
}
554+
#endif
553555

554556
int ompi_osc_rdma_put_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t target_address,
555557
mca_btl_base_registration_handle_t *target_handle, void *source_buffer, size_t size,
556558
ompi_osc_rdma_request_t *request)
557559
{
558560
ompi_osc_rdma_module_t *module = sync->module;
561+
#if 0
559562
ompi_osc_rdma_aggregation_t *aggregation = peer->aggregate;
563+
#endif
560564
mca_btl_base_registration_handle_t *local_handle = NULL;
561565
mca_btl_base_rdma_completion_fn_t cbfunc = NULL;
562566
ompi_osc_rdma_frag_t *frag = NULL;

ompi/mca/osc/rdma/osc_rdma_passive_target.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
* reserved.
1313
* Copyright (c) 2010 IBM Corporation. All rights reserved.
1414
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
15+
* Copyright (c) 2018 Intel, Inc. All rights reserved.
1516
* $COPYRIGHT$
1617
*
1718
* Additional copyrights may follow
@@ -202,7 +203,7 @@ int ompi_osc_rdma_demand_lock_peer (ompi_osc_rdma_module_t *module, ompi_osc_rdm
202203
} while (0);
203204
);
204205

205-
return OMPI_SUCCESS;
206+
return ret;
206207
}
207208

208209
int ompi_osc_rdma_lock_atomic (int lock_type, int target, int assert, ompi_win_t *win)

orte/mca/plm/base/plm_base_launch_support.c

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -190,9 +190,17 @@ void orte_plm_base_allocation_complete(int fd, short args, void *cbdata)
190190

191191
ORTE_ACQUIRE_OBJECT(caddy);
192192

193-
/* move the state machine along */
194-
caddy->jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE;
195-
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_LAUNCH_DAEMONS);
193+
/* if we don't want to launch, then we at least want
194+
* to map so we can see where the procs would have
195+
* gone - so skip to the mapping state */
196+
if (orte_do_not_launch) {
197+
caddy->jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE;
198+
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_MAP);
199+
} else {
200+
/* move the state machine along */
201+
caddy->jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE;
202+
ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_LAUNCH_DAEMONS);
203+
}
196204

197205
/* cleanup */
198206
OBJ_RELEASE(caddy);

orte/mca/ras/base/ras_base_node.c

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* All rights reserved.
1212
* Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights
1313
* reserved.
14-
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
14+
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
1515
* Copyright (c) 2015 Research Organization for Information Science
1616
* and Technology (RIST). All rights reserved.
1717
* $COPYRIGHT$
@@ -50,6 +50,8 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
5050
bool hnp_alone = true, skiphnp = false;
5151
orte_attribute_t *kv;
5252
char **alias=NULL, **nalias;
53+
orte_proc_t *daemon;
54+
orte_job_t *djob;
5355

5456
/* get the number of nodes */
5557
num_nodes = (orte_std_cntr_t)opal_list_get_size(nodes);
@@ -76,6 +78,9 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
7678
return rc;
7779
}
7880

81+
/* get the daemon job - it is only used below when we are not launching */
82+
djob = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
83+
7984
/* get the hnp node's info */
8085
hnp_node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
8186

@@ -189,6 +194,21 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
189194
ORTE_ERROR_LOG(rc);
190195
return rc;
191196
}
197+
if (orte_do_not_launch) {
198+
/* create a daemon for this node since we won't be launching
199+
* and the mapper needs to see a daemon - this is used solely
200+
* for testing the mappers */
201+
daemon = OBJ_NEW(orte_proc_t);
202+
daemon->name.jobid = ORTE_PROC_MY_NAME->jobid;
203+
daemon->name.vpid = node->index;
204+
daemon->state = ORTE_PROC_STATE_RUNNING;
205+
OBJ_RETAIN(node);
206+
daemon->node = node;
207+
opal_pointer_array_set_item(djob->procs, daemon->name.vpid, daemon);
208+
djob->num_procs++;
209+
OBJ_RETAIN(daemon);
210+
node->daemon = daemon;
211+
}
192212
/* update the total slots in the job */
193213
orte_ras_base.total_slots_alloc += node->slots;
194214
/* check if we have fqdn names in the allocation */

orte/mca/ras/simulator/ras_sim_module.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights reserved
44
* Copyright (c) 2015-2017 Research Organization for Information Science
55
* and Technology (RIST). All rights reserved.
6-
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
6+
* Copyright (c) 2015-2018 Intel, Inc. All rights reserved.
77
*
88
* $COPYRIGHT$
99
*
@@ -23,6 +23,7 @@
2323
#include "opal/mca/hwloc/hwloc-internal.h"
2424
#include "opal/util/argv.h"
2525

26+
#include "orte/mca/errmgr/errmgr.h"
2627
#include "orte/util/show_help.h"
2728
#include "orte/runtime/orte_globals.h"
2829

@@ -179,6 +180,10 @@ static int allocate(orte_job_t *jdata, opal_list_t *nodes)
179180
support = (struct hwloc_topology_support*)hwloc_topology_get_support(topo);
180181
support->cpubind->set_thisproc_cpubind = mca_ras_simulator_component.have_cpubind;
181182
support->membind->set_thisproc_membind = mca_ras_simulator_component.have_membind;
183+
/* pass it through the filter so we create the summaries required by the mappers */
184+
if (OPAL_SUCCESS != opal_hwloc_base_filter_cpus(topo)) {
185+
ORTE_ERROR_LOG(ORTE_ERROR);
186+
}
182187
/* add it to our array */
183188
t = OBJ_NEW(orte_topology_t);
184189
t->topo = topo;

orte/mca/rmaps/base/rmaps_base_binding.c

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* Copyright (c) 2011-2014 Cisco Systems, Inc. All rights reserved.
1313
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
1414
* All rights reserved.
15-
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
15+
* Copyright (c) 2013-2018 Intel, Inc. All rights reserved.
1616
* Copyright (c) 2015-2017 Research Organization for Information Science
1717
* and Technology (RIST). All rights reserved.
1818
* $COPYRIGHT$
@@ -246,7 +246,7 @@ static int bind_downwards(orte_job_t *jdata,
246246
hwloc_obj_type_t target,
247247
unsigned cache_level)
248248
{
249-
int j;
249+
int j, rc;
250250
orte_job_map_t *map;
251251
orte_proc_t *proc;
252252
hwloc_obj_t trg_obj, nxt_obj;
@@ -367,7 +367,10 @@ static int bind_downwards(orte_job_t *jdata,
367367
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
368368
ORTE_NAME_PRINT(&proc->name), node->name);
369369
} else {
370-
opal_hwloc_base_cset2mapstr(tmp2, sizeof(tmp2), node->topology->topo, totalcpuset);
370+
rc = opal_hwloc_base_cset2mapstr(tmp2, sizeof(tmp2), node->topology->topo, totalcpuset);
371+
if (OPAL_SUCCESS != rc) {
372+
ORTE_ERROR_LOG(rc);
373+
}
371374
opal_output(orte_rmaps_base_framework.framework_output,
372375
"%s BOUND PROC %s[%s] TO %s: %s",
373376
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@@ -841,7 +844,8 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
841844
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) {
842845
continue;
843846
}
844-
if (!orte_no_vm && (int)ORTE_PROC_MY_NAME->vpid != node->index) {
847+
if (!orte_no_vm && !orte_do_not_launch &&
848+
(int)ORTE_PROC_MY_NAME->vpid != node->index) {
845849
continue;
846850
}
847851
if (!orte_do_not_launch) {

0 commit comments

Comments
 (0)