Skip to content

Commit c704ed4

Browse files
authored
Merge pull request #7554 from rhc54/topic/proc1
ompi_proc_t size reduction: part 1
2 parents f9575ed + 33ab928 commit c704ed4

File tree

20 files changed

+148
-148
lines changed

20 files changed

+148
-148
lines changed

ompi/mca/bml/r2/bml_r2.c

Lines changed: 31 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
* Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights
1414
* reserved.
1515
* Copyright (c) 2008-2016 Cisco Systems, Inc. All rights reserved.
16-
* Copyright (c) 2013 Intel, Inc. All rights reserved
16+
* Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
1717
* Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
1818
* Copyright (c) 2014 Research Organization for Information Science
1919
* and Technology (RIST). All rights reserved.
@@ -233,12 +233,14 @@ static int mca_bml_r2_endpoint_add_btl (struct ompi_proc_t *proc, mca_bml_base_e
233233

234234
if (!bml_btl || bml_btl->btl->btl_exclusivity <= btl->btl_exclusivity) {
235235
/* this btl has higher exclusivity than an existing btl or none exists */
236-
237-
opal_output_verbose(1, opal_btl_base_framework.framework_output,
238-
"mca: bml: Using %s btl for send to %s on node %s",
239-
btl->btl_component->btl_version.mca_component_name,
240-
OMPI_NAME_PRINT(&proc->super.proc_name),
241-
proc->super.proc_hostname);
236+
if (0 < opal_output_get_verbosity(opal_btl_base_framework.framework_output)) {
237+
char *errhost = opal_get_proc_hostname(&proc->super);
238+
opal_output(0, "mca: bml: Using %s btl for send to %s on node %s",
239+
btl->btl_component->btl_version.mca_component_name,
240+
OMPI_NAME_PRINT(&proc->super.proc_name),
241+
errhost);
242+
free(errhost);
243+
}
242244

243245
/* cache the endpoint on the proc */
244246
if (NULL == bml_btl || (bml_btl->btl->btl_exclusivity <= btl->btl_exclusivity)) {
@@ -252,15 +254,16 @@ static int mca_bml_r2_endpoint_add_btl (struct ompi_proc_t *proc, mca_bml_base_e
252254
* calculate the bitwise OR of the btl flags
253255
*/
254256
bml_endpoint->btl_flags_or |= bml_btl->btl_flags;
255-
} else {
256-
opal_output_verbose(20, opal_btl_base_framework.framework_output,
257-
"mca: bml: Not using %s btl for send to %s on node %s "
258-
"because %s btl has higher exclusivity (%d > %d)",
259-
btl->btl_component->btl_version.mca_component_name,
260-
OMPI_NAME_PRINT(&proc->super.proc_name), proc->super.proc_hostname,
261-
bml_btl->btl->btl_component->btl_version.mca_component_name,
262-
bml_btl->btl->btl_exclusivity,
263-
btl->btl_exclusivity);
257+
} else if (19 < opal_output_get_verbosity(opal_btl_base_framework.framework_output)) {
258+
char *errhost = opal_get_proc_hostname(&proc->super);
259+
opal_output(0, "mca: bml: Not using %s btl for send to %s on node %s "
260+
"because %s btl has higher exclusivity (%d > %d)",
261+
btl->btl_component->btl_version.mca_component_name,
262+
OMPI_NAME_PRINT(&proc->super.proc_name), errhost,
263+
bml_btl->btl->btl_component->btl_version.mca_component_name,
264+
bml_btl->btl->btl_exclusivity,
265+
btl->btl_exclusivity);
266+
free(errhost);
264267
}
265268

266269
btl_in_use = true;
@@ -424,14 +427,16 @@ static int mca_bml_r2_add_proc (struct ompi_proc_t *proc)
424427
OBJ_RELEASE(bml_endpoint);
425428
/* no btl is available for this proc */
426429
if (mca_bml_r2.show_unreach_errors) {
430+
char *errhost = opal_get_proc_hostname(&proc->super);
431+
char *localhost = opal_get_proc_hostname(&ompi_proc_local_proc->super);
427432
opal_show_help ("help-mca-bml-r2.txt", "unreachable proc", true,
428433
OMPI_NAME_PRINT(&(ompi_proc_local_proc->super.proc_name)),
429-
(NULL != ompi_proc_local_proc->super.proc_hostname ?
430-
ompi_proc_local_proc->super.proc_hostname : "unknown!"),
434+
localhost,
431435
OMPI_NAME_PRINT(&(proc->super.proc_name)),
432-
(NULL != proc->super.proc_hostname ?
433-
proc->super.proc_hostname : "unknown!"),
436+
errhost,
434437
btl_names);
438+
free(errhost);
439+
free(localhost);
435440
}
436441

437442
return OMPI_ERR_UNREACH;
@@ -578,14 +583,16 @@ static int mca_bml_r2_add_procs( size_t nprocs,
578583
if (NULL == proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) {
579584
ret = OMPI_ERR_UNREACH;
580585
if (mca_bml_r2.show_unreach_errors) {
586+
char *errhost = opal_get_proc_hostname(&proc->super);
587+
char *localhost = opal_get_proc_hostname(&ompi_proc_local_proc->super);
581588
opal_show_help("help-mca-bml-r2.txt", "unreachable proc", true,
582589
OMPI_NAME_PRINT(&(ompi_proc_local_proc->super.proc_name)),
583-
(NULL != ompi_proc_local_proc->super.proc_hostname ?
584-
ompi_proc_local_proc->super.proc_hostname : "unknown!"),
590+
localhost,
585591
OMPI_NAME_PRINT(&(proc->super.proc_name)),
586-
(NULL != proc->super.proc_hostname ?
587-
proc->super.proc_hostname : "unknown!"),
592+
errhost,
588593
btl_names);
594+
free(errhost);
595+
free(localhost);
589596
}
590597

591598
break;

ompi/mca/mtl/ofi/mtl_ofi.c

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2013-2018 Intel, Inc. All rights reserved
2+
* Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
33
*
44
* $COPYRIGHT$
55
*
@@ -98,10 +98,11 @@ ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl,
9898
(void**)&ep_name,
9999
&size);
100100
if (OMPI_SUCCESS != ret) {
101+
char *errhost = opal_get_proc_hostname(&procs[i]->super);
101102
opal_show_help("help-mtl-ofi.txt", "modex failed",
102103
true, ompi_process_info.nodename,
103-
procs[i]->super.proc_hostname,
104-
opal_strerror(ret), ret);
104+
errhost, opal_strerror(ret), ret);
105+
free(errhost);
105106
goto bail;
106107
}
107108
memcpy(&ep_names[i*namelen], ep_name, namelen);

ompi/mca/mtl/psm2/mtl_psm2.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -324,8 +324,9 @@ ompi_mtl_psm2_add_procs(struct mca_mtl_base_module_t *mtl,
324324
errstr ? errstr : "unknown connect error");
325325
for (j = 0; j < (int) nprocs; j++) {
326326
if (errs_out[j] == thiserr) {
327-
opal_output(0, " %s", (NULL == procs[j]->super.proc_hostname) ?
328-
"unknown" : procs[j]->super.proc_hostname);
327+
char *errhost = opal_get_proc_hostname(&procs[j]->super);
328+
opal_output(0, " %s", errhost);
329+
free(errhost);
329330
}
330331
}
331332
opal_output(0, "\n");

ompi/mca/pml/base/pml_base_select.c

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* All rights reserved.
1313
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
1414
* reserved.
15-
* Copyright (c) 2013-2019 Intel, Inc. All rights reserved.
15+
* Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
1616
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
1717
* $COPYRIGHT$
1818
*
@@ -370,12 +370,15 @@ mca_pml_base_pml_check_selected(const char *my_pml,
370370
/* if that module doesn't match my own, return an error */
371371
if ((size != strlen(my_pml) + 1) ||
372372
(0 != strcmp(my_pml, remote_pml))) {
373+
char *errhost = opal_get_proc_hostname(&procs[0]->super);
373374
opal_output(0, "%s selected pml %s, but peer %s on %s selected pml %s",
374375
OMPI_NAME_PRINT(&ompi_proc_local()->super.proc_name),
375376
my_pml, OMPI_NAME_PRINT(&procs[0]->super.proc_name),
376-
(NULL == procs[0]->super.proc_hostname) ? "unknown" : procs[0]->super.proc_hostname,
377+
errhost,
377378
remote_pml);
378-
free(remote_pml); /* cleanup before returning */
379+
free(remote_pml);
380+
free(errhost);
381+
/* remote_pml and errhost have been released above; return the error */
379382
return OMPI_ERR_UNREACH;
380383
}
381384

ompi/mca/pml/ob1/pml_ob1_cuda.c

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
* Copyright (c) 2012-2015 NVIDIA Corporation. All rights reserved.
1616
* Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights
1717
* reserved.
18+
* Copyright (c) 2020 Intel, Inc. All rights reserved.
1819
* $COPYRIGHT$
1920
*
2021
* Additional copyrights may follow
@@ -202,14 +203,17 @@ void mca_pml_ob1_cuda_add_ipc_support(struct mca_btl_base_module_t* btl, int32_t
202203
/* Find the corresponding bml and adjust the flag to support CUDA get */
203204
for( i = 0; i < (int)ep->btl_send.arr_size; i++ ) {
204205
if( ep->btl_send.bml_btls[i].btl == btl ) {
206+
if (4 < opal_output_get_verbosity(btl_verbose_stream)) {
207+
char *errhost = opal_get_proc_hostname(&errproc->super);
208+
opal_output(0, "BTL %s: rank=%d enabling CUDA IPC "
209+
"to rank=%d on node=%s \n",
210+
btl->btl_component->btl_version.mca_component_name,
211+
OMPI_PROC_MY_NAME->vpid,
212+
((ompi_process_name_t*)&errproc->super.proc_name)->vpid,
213+
errhost);
214+
free(errhost);
215+
}
205216
ep->btl_send.bml_btls[i].btl_flags |= MCA_BTL_FLAGS_CUDA_GET;
206-
opal_output_verbose(5, btl_verbose_stream,
207-
"BTL %s: rank=%d enabling CUDA IPC "
208-
"to rank=%d on node=%s \n",
209-
btl->btl_component->btl_version.mca_component_name,
210-
OMPI_PROC_MY_NAME->vpid,
211-
((ompi_process_name_t*)&errproc->super.proc_name)->vpid,
212-
errproc->super.proc_hostname);
213217
}
214218
}
215219
}

ompi/proc/proc.c

Lines changed: 10 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -86,9 +86,6 @@ void ompi_proc_destruct(ompi_proc_t* proc)
8686
* destroyed here. It will be destroyed later when the ompi_datatype_finalize is called.
8787
*/
8888
OBJ_RELEASE( proc->super.proc_convertor );
89-
if (NULL != proc->super.proc_hostname) {
90-
free(proc->super.proc_hostname);
91-
}
9289
opal_mutex_lock (&ompi_proc_lock);
9390
opal_list_remove_item(&ompi_proc_list, (opal_list_item_t*)proc);
9491
opal_hash_table_remove_value_ptr (&ompi_proc_hash, &proc->super.proc_name, sizeof (proc->super.proc_name));
@@ -135,22 +132,12 @@ static int ompi_proc_allocate (ompi_jobid_t jobid, ompi_vpid_t vpid, ompi_proc_t
135132
*/
136133
int ompi_proc_complete_init_single (ompi_proc_t *proc)
137134
{
138-
int ret;
139-
140135
if ((OMPI_CAST_RTE_NAME(&proc->super.proc_name)->jobid == OMPI_PROC_MY_NAME->jobid) &&
141136
(OMPI_CAST_RTE_NAME(&proc->super.proc_name)->vpid == OMPI_PROC_MY_NAME->vpid)) {
142137
/* nothing else to do */
143138
return OMPI_SUCCESS;
144139
}
145140

146-
/* we can retrieve the hostname at no cost because it
147-
* was provided at startup - but make it optional so
148-
* we don't chase after it if some system doesn't
149-
* provide it */
150-
proc->super.proc_hostname = NULL;
151-
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, PMIX_HOSTNAME, &proc->super.proc_name,
152-
(char**)&(proc->super.proc_hostname), PMIX_STRING);
153-
154141
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
155142
/* get the remote architecture - this might force a modex except
156143
* for those environments where the RM provides it */
@@ -264,7 +251,6 @@ int ompi_proc_init(void)
264251
/* set local process data */
265252
ompi_proc_local_proc = proc;
266253
proc->super.proc_flags = OPAL_PROC_ALL_LOCAL;
267-
proc->super.proc_hostname = strdup(ompi_process_info.nodename);
268254
proc->super.proc_arch = opal_local_arch;
269255
/* Register the local proc with OPAL */
270256
opal_proc_local_set(&proc->super);
@@ -609,7 +595,6 @@ int ompi_proc_refresh(void)
609595
if (i == OMPI_PROC_MY_NAME->vpid) {
610596
ompi_proc_local_proc = proc;
611597
proc->super.proc_flags = OPAL_PROC_ALL_LOCAL;
612-
proc->super.proc_hostname = ompi_process_info.nodename;
613598
proc->super.proc_arch = opal_local_arch;
614599
opal_proc_local_set(&proc->super);
615600
} else {
@@ -676,13 +661,6 @@ ompi_proc_pack(ompi_proc_t **proclist, int proclistsize,
676661
opal_mutex_unlock (&ompi_proc_lock);
677662
return rc;
678663
}
679-
/* pass the name of the host this proc is on */
680-
rc = opal_dss.pack(buf, &(proc->super.proc_hostname), 1, OPAL_STRING);
681-
if(rc != OPAL_SUCCESS) {
682-
OMPI_ERROR_LOG(rc);
683-
opal_mutex_unlock (&ompi_proc_lock);
684-
return rc;
685-
}
686664
}
687665
opal_mutex_unlock (&ompi_proc_lock);
688666
return OMPI_SUCCESS;
@@ -747,10 +725,10 @@ ompi_proc_unpack(opal_buffer_t* buf,
747725
int32_t count=1;
748726
ompi_process_name_t new_name;
749727
uint32_t new_arch;
750-
char *new_hostname;
751728
bool isnew = false;
752729
int rc;
753730
char *nspace;
731+
uint16_t u16, *u16ptr;
754732

755733
rc = opal_dss.unpack(buf, &new_name, &count, OMPI_NAME);
756734
if (rc != OPAL_SUCCESS) {
@@ -774,13 +752,6 @@ ompi_proc_unpack(opal_buffer_t* buf,
774752
free(newprocs);
775753
return rc;
776754
}
777-
rc = opal_dss.unpack(buf, &new_hostname, &count, OPAL_STRING);
778-
if (rc != OPAL_SUCCESS) {
779-
OMPI_ERROR_LOG(rc);
780-
free(plist);
781-
free(newprocs);
782-
return rc;
783-
}
784755
/* see if this proc is already on our ompi_proc_list */
785756
plist[i] = ompi_proc_find_and_add(&new_name, &isnew);
786757
if (isnew) {
@@ -798,27 +769,25 @@ ompi_proc_unpack(opal_buffer_t* buf,
798769
OBJ_RELEASE(plist[i]->super.proc_convertor);
799770
plist[i]->super.proc_convertor = opal_convertor_create(plist[i]->super.proc_arch, 0);
800771
#else
772+
char *errhost = opal_get_proc_hostname(&plist[i]->super);
801773
opal_show_help("help-mpi-runtime.txt",
802774
"heterogeneous-support-unavailable",
803775
true, ompi_process_info.nodename,
804-
new_hostname == NULL ? "<hostname unavailable>" :
805-
new_hostname);
776+
errhost);
806777
free(plist);
807778
free(newprocs);
779+
free(errhost);
808780
return OMPI_ERR_NOT_SUPPORTED;
809781
#endif
810782
}
811783

812-
if (NULL != new_hostname) {
813-
if (0 == strcmp(ompi_proc_local_proc->super.proc_hostname, new_hostname)) {
814-
plist[i]->super.proc_flags |= (OPAL_PROC_ON_NODE | OPAL_PROC_ON_CU | OPAL_PROC_ON_CLUSTER);
815-
}
816-
817-
/* Save the hostname */
818-
plist[i]->super.proc_hostname = new_hostname;
784+
/* get the locality information - all RTEs are required
785+
* to provide this information at startup */
786+
u16ptr = &u16;
787+
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY, &plist[i]->super.proc_name, &u16ptr, PMIX_UINT16);
788+
if (OPAL_SUCCESS == rc) {
789+
plist[i]->super.proc_flags = u16;
819790
}
820-
} else if (NULL != new_hostname) {
821-
free(new_hostname);
822791
}
823792
}
824793

opal/mca/btl/base/btl_base_error.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
1414
* Copyright (c) 2012 Los Alamos National Security, LLC.
1515
* All rights reserved.
16-
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved
16+
* Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
1717
* $COPYRIGHT$
1818
*
1919
* Additional copyrights may follow
@@ -59,13 +59,15 @@ OPAL_DECLSPEC extern int mca_btl_base_out(const char*, ...) __opal_attribute_for
5959

6060
#define BTL_PEER_ERROR(proc, args) \
6161
do { \
62+
char *errhost; \
6263
mca_btl_base_err("%s[%s:%d:%s] from %s ", \
6364
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), \
6465
__FILE__, __LINE__, __func__, \
6566
opal_process_info.nodename); \
6667
if (proc) { \
67-
mca_btl_base_err("to: %s ", \
68-
opal_get_proc_hostname(proc)); \
68+
errhost = opal_get_proc_hostname(proc); \
69+
mca_btl_base_err("to: %s ", errhost); \
70+
free(errhost); \
6971
} \
7072
mca_btl_base_err args; \
7173
mca_btl_base_err("\n"); \

opal/mca/btl/tcp/btl_tcp_frag.c

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
* Copyright (c) 2015-2017 Research Organization for Information Science
1616
* and Technology (RIST). All rights reserved.
1717
* Copyright (c) 2015-2016 Cisco Systems, Inc. All rights reserved.
18+
* Copyright (c) 2020 Intel, Inc. All rights reserved.
1819
* $COPYRIGHT$
1920
*
2021
* Additional copyrights may follow
@@ -44,6 +45,7 @@
4445

4546
#include "opal/opal_socket_errno.h"
4647
#include "opal/mca/btl/base/btl_base_error.h"
48+
#include "opal/util/proc.h"
4749
#include "opal/util/show_help.h"
4850

4951
#include "btl_tcp_frag.h"
@@ -168,6 +170,7 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
168170
mca_btl_base_endpoint_t* btl_endpoint = frag->endpoint;
169171
ssize_t cnt;
170172
int32_t i, num_vecs, dont_copy_data = 0;
173+
char *errhost;
171174

172175
repeat:
173176
num_vecs = frag->iov_cnt;
@@ -231,10 +234,11 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
231234
return false;
232235

233236
case ECONNRESET:
237+
errhost = opal_get_proc_hostname(btl_endpoint->endpoint_proc->proc_opal);
234238
opal_show_help("help-mpi-btl-tcp.txt", "peer hung up",
235239
true, opal_process_info.nodename,
236-
getpid(),
237-
btl_endpoint->endpoint_proc->proc_opal->proc_hostname);
240+
getpid(), errhost);
241+
free(errhost);
238242
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
239243
mca_btl_tcp_endpoint_close(btl_endpoint);
240244
return false;

0 commit comments

Comments
 (0)