Skip to content

Commit 8f32a59

Browse files
authored
Merge pull request #6830 from rhc54/topic/dpm
Provide locality for all procs on node
2 parents 20dd06c + d202e10 commit 8f32a59

File tree

25 files changed, with 482 additions (+482) and 381 deletions (-381).

ompi/dpm/dpm.c

Lines changed: 65 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@
1515
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
1616
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
1717
* reserved.
18-
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
18+
* Copyright (c) 2013-2019 Intel, Inc. All rights reserved.
1919
* Copyright (c) 2014-2017 Research Organization for Information Science
2020
* and Technology (RIST). All rights reserved.
2121
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
@@ -406,9 +406,43 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
406406
goto exit;
407407
}
408408
if (0 < opal_list_get_size(&ilist)) {
409+
uint32_t *peer_ranks = NULL;
410+
int prn, nprn;
411+
char *val, *mycpuset;
412+
uint16_t u16;
413+
opal_process_name_t wildcard_rank;
409414
/* convert the list of new procs to a proc_t array */
410415
new_proc_list = (ompi_proc_t**)calloc(opal_list_get_size(&ilist),
411416
sizeof(ompi_proc_t *));
417+
/* get the list of local peers for the new procs */
418+
cd = (ompi_dpm_proct_caddy_t*)opal_list_get_first(&ilist);
419+
proc = cd->p;
420+
wildcard_rank.jobid = proc->super.proc_name.jobid;
421+
wildcard_rank.vpid = OMPI_NAME_WILDCARD->vpid;
422+
/* retrieve the local peers */
423+
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_LOCAL_PEERS,
424+
&wildcard_rank, &val, OPAL_STRING);
425+
if (OPAL_SUCCESS == rc && NULL != val) {
426+
char **peers = opal_argv_split(val, ',');
427+
free(val);
428+
nprn = opal_argv_count(peers);
429+
peer_ranks = (uint32_t*)calloc(nprn, sizeof(uint32_t));
430+
for (prn = 0; NULL != peers[prn]; prn++) {
431+
peer_ranks[prn] = strtoul(peers[prn], NULL, 10);
432+
}
433+
opal_argv_free(peers);
434+
}
435+
436+
/* get my locality string */
437+
val = NULL;
438+
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_LOCALITY_STRING,
439+
OMPI_PROC_MY_NAME, &val, OPAL_STRING);
440+
if (OPAL_SUCCESS == rc && NULL != val) {
441+
mycpuset = val;
442+
} else {
443+
mycpuset = NULL;
444+
}
445+
412446
i = 0;
413447
OPAL_LIST_FOREACH(cd, &ilist, ompi_dpm_proct_caddy_t) {
414448
opal_value_t *kv;
@@ -418,15 +452,38 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
418452
* OPAL_PMIX_LOCALITY and OPAL_PMIX_HOSTNAME. since we can live without
419453
* them, we are just fine */
420454
ompi_proc_complete_init_single(proc);
421-
/* save the locality for later */
422-
kv = OBJ_NEW(opal_value_t);
423-
kv->key = strdup(OPAL_PMIX_LOCALITY);
424-
kv->type = OPAL_UINT16;
425-
kv->data.uint16 = proc->super.proc_flags;
426-
opal_pmix.store_local(&proc->super.proc_name, kv);
427-
OBJ_RELEASE(kv); // maintain accounting
455+
/* if this proc is local, then get its locality */
456+
if (NULL != peer_ranks) {
457+
for (prn=0; prn < nprn; prn++) {
458+
if (peer_ranks[prn] == proc->super.proc_name.vpid) {
459+
/* get their locality string */
460+
val = NULL;
461+
OPAL_MODEX_RECV_VALUE_IMMEDIATE(rc, OPAL_PMIX_LOCALITY_STRING,
462+
&proc->super.proc_name, &val, OPAL_STRING);
463+
if (OPAL_SUCCESS == rc && NULL != val) {
464+
u16 = opal_hwloc_compute_relative_locality(mycpuset, val);
465+
free(val);
466+
} else {
467+
/* all we can say is that it shares our node */
468+
u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
469+
}
470+
proc->super.proc_flags = u16;
471+
/* save the locality for later */
472+
kv = OBJ_NEW(opal_value_t);
473+
kv->key = strdup(OPAL_PMIX_LOCALITY);
474+
kv->type = OPAL_UINT16;
475+
kv->data.uint16 = proc->super.proc_flags;
476+
opal_pmix.store_local(&proc->super.proc_name, kv);
477+
OBJ_RELEASE(kv); // maintain accounting
478+
break;
479+
}
480+
}
481+
}
428482
++i;
429483
}
484+
if (NULL != mycpuset) {
485+
free(mycpuset);
486+
}
430487
/* call add_procs on the new ones */
431488
rc = MCA_PML_CALL(add_procs(new_proc_list, opal_list_get_size(&ilist)));
432489
free(new_proc_list);

opal/mca/pmix/pmix4x/pmix/VERSION

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -23,14 +23,14 @@ release=0
2323
# The only requirement is that it must be entirely printable ASCII
2424
# characters and have no white space.
2525

26-
greek=a1
26+
greek=
2727

2828
# If repo_rev is empty, then the repository version number will be
2929
# obtained during "make dist" via the "git describe --tags --always"
3030
# command, or with the date (if "git describe" fails) in the form of
3131
# "date<date>".
3232

33-
repo_rev=git03a8b5da
33+
repo_rev=git628a724c
3434

3535
# If tarball_version is not empty, it is used as the version string in
3636
# the tarball filename, regardless of all other versions listed in
@@ -44,7 +44,7 @@ tarball_version=
4444

4545
# The date when this release was created
4646

47-
date="Jul 16, 2019"
47+
date="Jul 21, 2019"
4848

4949
# The shared library version of each of PMIx's public libraries.
5050
# These versions are maintained in accordance with the "Library

opal/mca/pmix/pmix4x/pmix/contrib/pmix.spec

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
# Copyright (c) 2006-2016 Cisco Systems, Inc. All rights reserved.
1313
# Copyright (c) 2013 Mellanox Technologies, Inc.
1414
# All rights reserved.
15-
# Copyright (c) 2015-2018 Intel, Inc. All rights reserved.
15+
# Copyright (c) 2015-2019 Intel, Inc. All rights reserved.
1616
# Copyright (c) 2015 Research Organization for Information Science
1717
# and Technology (RIST). All rights reserved.
1818
# $COPYRIGHT$
@@ -192,7 +192,7 @@
192192

193193
Summary: An extended/exascale implementation of PMI
194194
Name: %{?_name:%{_name}}%{!?_name:pmix}
195-
Version: 4.0.0a1
195+
Version: 4.0.0
196196
Release: 1%{?dist}
197197
License: BSD
198198
Group: Development/Libraries

opal/mca/pmix/pmix4x/pmix/src/client/pmi1.c

Lines changed: 7 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
22
/*
3-
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
3+
* Copyright (c) 2014-2019 Intel, Inc. All rights reserved.
44
* Copyright (c) 2014-2019 Research Organization for Information Science
55
* and Technology (RIST). All rights reserved.
66
* Copyright (c) 2016 Mellanox Technologies, Inc.
@@ -83,7 +83,7 @@ PMIX_EXPORT int PMI_Init(int *spawned)
8383

8484
/* getting internal key requires special rank value */
8585
memcpy(&proc, &myproc, sizeof(myproc));
86-
proc.rank = PMIX_RANK_UNDEF;
86+
proc.rank = PMIX_RANK_WILDCARD;
8787

8888
/* set controlling parameters
8989
* PMIX_OPTIONAL - expect that these keys should be available on startup
@@ -392,8 +392,6 @@ PMIX_EXPORT int PMI_Get_appnum(int *appnum)
392392
pmix_value_t *val;
393393
pmix_info_t info[1];
394394
bool val_optinal = 1;
395-
pmix_proc_t proc = myproc;
396-
proc.rank = PMIX_RANK_WILDCARD;
397395

398396
PMI_CHECK();
399397

@@ -412,11 +410,11 @@ PMIX_EXPORT int PMI_Get_appnum(int *appnum)
412410
PMIX_INFO_CONSTRUCT(&info[0]);
413411
PMIX_INFO_LOAD(&info[0], PMIX_OPTIONAL, &val_optinal, PMIX_BOOL);
414412

415-
rc = PMIx_Get(&proc, PMIX_APPNUM, info, 1, &val);
413+
rc = PMIx_Get(&myproc, PMIX_APPNUM, info, 1, &val);
416414
if (PMIX_SUCCESS == rc) {
417415
rc = convert_int(appnum, val);
418416
PMIX_VALUE_RELEASE(val);
419-
} else if( PMIX_ERR_NOT_FOUND == rc ){
417+
} else {
420418
/* this is optional value, set to 0 */
421419
*appnum = 0;
422420
rc = PMIX_SUCCESS;
@@ -443,7 +441,7 @@ PMIX_EXPORT int PMI_Publish_name(const char service_name[], const char port[])
443441
}
444442

445443
/* pass the service/port */
446-
pmix_strncpy(info.key, service_name, PMIX_MAX_KEYLEN);
444+
pmix_strncpy(info.key, service_name, PMIX_MAX_KEYLEN);
447445
info.value.type = PMIX_STRING;
448446
info.value.data.string = (char*) port;
449447

@@ -495,7 +493,7 @@ PMIX_EXPORT int PMI_Lookup_name(const char service_name[], char port[])
495493
PMIX_PDATA_CONSTRUCT(&pdata);
496494

497495
/* pass the service */
498-
pmix_strncpy(pdata.key, service_name, PMIX_MAX_KEYLEN);
496+
pmix_strncpy(pdata.key, service_name, PMIX_MAX_KEYLEN);
499497

500498
/* PMI-1 doesn't want the nspace back */
501499
if (PMIX_SUCCESS != (rc = PMIx_Lookup(&pdata, 1, NULL, 0))) {
@@ -512,7 +510,7 @@ PMIX_EXPORT int PMI_Lookup_name(const char service_name[], char port[])
512510
* potential we could overrun it. As this feature
513511
* isn't widely supported in PMI-1, try being
514512
* conservative */
515-
pmix_strncpy(port, pdata.value.data.string, PMIX_MAX_KEYLEN);
513+
pmix_strncpy(port, pdata.value.data.string, PMIX_MAX_KEYLEN);
516514
PMIX_PDATA_DESTRUCT(&pdata);
517515

518516
return PMIX_SUCCESS;

opal/mca/pmix/pmix4x/pmix/src/client/pmix_client_fence.c

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
22
/*
3-
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
3+
* Copyright (c) 2014-2019 Intel, Inc. All rights reserved.
44
* Copyright (c) 2014-2019 Research Organization for Information Science
55
* and Technology (RIST). All rights reserved.
66
* Copyright (c) 2014 Artem Y. Polyakov <[email protected]>.
@@ -72,7 +72,7 @@ PMIX_EXPORT pmix_status_t PMIx_Fence(const pmix_proc_t procs[], size_t nprocs,
7272

7373
PMIX_ACQUIRE_THREAD(&pmix_global_lock);
7474

75-
pmix_output_verbose(2, pmix_globals.debug_output,
75+
pmix_output_verbose(2, pmix_client_globals.fence_output,
7676
"pmix: executing fence");
7777

7878
if (pmix_globals.init_cntr <= 0) {
@@ -105,7 +105,7 @@ PMIX_EXPORT pmix_status_t PMIx_Fence(const pmix_proc_t procs[], size_t nprocs,
105105
rc = cb->status;
106106
PMIX_RELEASE(cb);
107107

108-
pmix_output_verbose(2, pmix_globals.debug_output,
108+
pmix_output_verbose(2, pmix_client_globals.fence_output,
109109
"pmix: fence released");
110110

111111
return rc;
@@ -124,7 +124,7 @@ PMIX_EXPORT pmix_status_t PMIx_Fence_nb(const pmix_proc_t procs[], size_t nprocs
124124

125125
PMIX_ACQUIRE_THREAD(&pmix_global_lock);
126126

127-
pmix_output_verbose(2, pmix_globals.debug_output,
127+
pmix_output_verbose(2, pmix_client_globals.fence_output,
128128
"pmix: fence_nb called");
129129

130130
if (pmix_globals.init_cntr <= 0) {
@@ -184,7 +184,7 @@ static pmix_status_t unpack_return(pmix_buffer_t *data)
184184
pmix_status_t ret;
185185
int32_t cnt;
186186

187-
pmix_output_verbose(2, pmix_globals.debug_output,
187+
pmix_output_verbose(2, pmix_client_globals.fence_output,
188188
"client:unpack fence called");
189189

190190
/* unpack the status code */
@@ -195,7 +195,7 @@ static pmix_status_t unpack_return(pmix_buffer_t *data)
195195
PMIX_ERROR_LOG(rc);
196196
return rc;
197197
}
198-
pmix_output_verbose(2, pmix_globals.debug_output,
198+
pmix_output_verbose(2, pmix_client_globals.fence_output,
199199
"client:unpack fence received status %d", ret);
200200
return ret;
201201
}
@@ -254,7 +254,7 @@ static void wait_cbfunc(struct pmix_peer_t *pr, pmix_ptl_hdr_t *hdr,
254254
pmix_cb_t *cb = (pmix_cb_t*)cbdata;
255255
pmix_status_t rc;
256256

257-
pmix_output_verbose(2, pmix_globals.debug_output,
257+
pmix_output_verbose(2, pmix_client_globals.fence_output,
258258
"pmix: fence_nb callback recvd");
259259

260260
if (NULL == cb) {

0 commit comments

Comments
 (0)