Skip to content

Commit d387725

Browse files
author
rhc54
committed
Merge pull request #1270 from rhc54/topic/cleanup
Cleanup connection termination
2 parents a04f1cd + 0a6b8d2 commit d387725

File tree

4 files changed

+118
-68
lines changed

4 files changed

+118
-68
lines changed

opal/mca/pmix/pmix120/pmix/src/server/pmix_server.c

Lines changed: 57 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1056,30 +1056,55 @@ static bool match_error_registration(pmix_regevents_info_t *reginfoptr, pmix_not
10561056
size_t ninfo = reginfoptr->ninfo;
10571057
pmix_status_t error = cd->status;
10581058

1059+
if (NULL == info || ninfo <= 0) {
1060+
/* this is a general errhandler, and so it always matches.
1061+
* however, here we are looking for an exact match, and
1062+
* so we ignore general errhandlers unless the incoming
1063+
* one is also general */
1064+
if (NULL == cd->info || 0 == cd->ninfo) {
1065+
return true;
1066+
} else {
1067+
return false;
1068+
}
1069+
}
1070+
1071+
/* since this errhandler has info keys, it is not a general errhandler.
1072+
* If the incoming errhandler *is* a general one, then we must not
1073+
* match so we can store the general case */
1074+
if (NULL == cd->info || 0 == cd->ninfo) {
1075+
return false;
1076+
}
1077+
1078+
/* try to match using error name or error group keys - this indicates
1079+
* a request for a specific error state */
10591080
pmix_get_errorgroup(error, errgroup);
1060-
/* try to match using error name or error group keys */
10611081
for (i=0; i < ninfo; i++) {
10621082
// if we get a match on any key then we abort the search and return true.
1063-
if ((0 == strcmp(info[i].key, PMIX_ERROR_NAME)) &&
1083+
if ((0 == strncmp(info[i].key, PMIX_ERROR_NAME, PMIX_MAX_KEYLEN)) &&
10641084
(error == info[i].value.data.int32)) {
10651085
return true;
1066-
} else if ((0 == strcmp(info[i].key, errgroup)) &&
1086+
} else if ((0 == strncmp(info[i].key, errgroup, PMIX_MAX_KEYLEN)) &&
10671087
(true == info[i].value.data.flag)) {
10681088
return true;
10691089
}
10701090
}
1071-
/* search by node (error location) key if it is specified in the notify info list*/
1072-
for (i=0; i<cd->ninfo ; i++) {
1073-
if (0 == strcmp (cd->info[i].key, PMIX_ERROR_NODE_NAME)) {
1074-
for (j=0; j<ninfo; j++) {
1075-
if ((0 == strcmp (info[j].key, PMIX_ERROR_NODE_NAME)) &&
1076-
(0 == strcmp (info[j].value.data.string, cd->info[i].value.data.string))) {
1091+
1092+
/* if we get here, then they haven't asked for a specific error state.
1093+
* It is possible, however, that they are asking for all errors from a
1094+
* specific node, so search by node (error location) key if it is
1095+
* specified in the notify info list */
1096+
for (i=0; i < cd->ninfo ; i++) {
1097+
if (0 == strncmp(cd->info[i].key, PMIX_ERROR_NODE_NAME, PMIX_MAX_KEYLEN)) {
1098+
for (j=0; j < ninfo; j++) {
1099+
if ((0 == strncmp(info[j].key, PMIX_ERROR_NODE_NAME, PMIX_MAX_KEYLEN)) &&
1100+
(0 == strcmp(info[j].value.data.string, cd->info[i].value.data.string))) {
10771101
return true;
10781102
}
10791103
}
10801104
}
10811105
}
1082-
/* end of search return false*/
1106+
1107+
/* end of search and nothing matched, so return false */
10831108
return false;
10841109
}
10851110

@@ -1093,9 +1118,11 @@ static void _notify_error(int sd, short args, void *cbdata)
10931118
pmix_peer_t *peer;
10941119
pmix_regevents_info_t *reginfoptr;
10951120
bool notify, notifyall;
1121+
10961122
pmix_output_verbose(0, pmix_globals.debug_output,
10971123
"pmix_server: _notify_error notifying client of error %d",
10981124
cd->status);
1125+
10991126
/* pack the command */
11001127
if (PMIX_SUCCESS != (rc = pmix_bfrop.pack(cd->buf, &cmd, 1, PMIX_CMD))) {
11011128
PMIX_ERROR_LOG(rc);
@@ -1157,6 +1184,8 @@ static void _notify_error(int sd, short args, void *cbdata)
11571184
}
11581185
}
11591186
if (!notify) {
1187+
/* if we are not notifying everyone, and this proc isn't to
1188+
* be notified, then just continue the main loop */
11601189
continue;
11611190
}
11621191
}
@@ -1173,8 +1202,9 @@ static void _notify_error(int sd, short args, void *cbdata)
11731202
pmix_output_verbose(2, pmix_globals.debug_output,
11741203
"pmix_server _notify_error - match error registration returned notify =%d ", notify);
11751204
}
1176-
if (notify)
1205+
if (notify) {
11771206
break;
1207+
}
11781208
}
11791209
if (notify) {
11801210
pmix_output_verbose(2, pmix_globals.debug_output,
@@ -1212,6 +1242,7 @@ pmix_status_t pmix_server_notify_error(pmix_status_t status,
12121242
cd->ninfo = ninfo;
12131243
cd->cbfunc = cbfunc;
12141244
cd->cbdata = cbdata;
1245+
12151246
pmix_output_verbose(2, pmix_globals.debug_output,
12161247
"pmix_server_notify_error status =%d, nprocs = %lu, ninfo =%lu",
12171248
status, nprocs, ninfo);
@@ -1227,18 +1258,19 @@ static void reg_errhandler(int sd, short args, void *cbdata)
12271258
int index = 0;
12281259
pmix_status_t rc;
12291260
pmix_shift_caddy_t *cd = (pmix_shift_caddy_t*)cbdata;
1261+
12301262
/* check if this handler is already registered if so return error */
1231-
if (PMIX_SUCCESS == pmix_lookup_errhandler (cd->err, &index)) {
1263+
if (PMIX_SUCCESS == pmix_lookup_errhandler(cd->err, &index)) {
12321264
/* complete request with error status and return its original reference */
12331265
pmix_output_verbose(2, pmix_globals.debug_output,
12341266
"pmix_server_register_errhandler error - hdlr already registered index = %d",
12351267
index);
1236-
cd->cbfunc.errregcbfn (PMIX_EXISTS, index, cd->cbdata);
1268+
cd->cbfunc.errregcbfn(PMIX_EXISTS, index, cd->cbdata);
12371269
} else {
1238-
rc = pmix_add_errhandler (cd->err, cd->info, cd->ninfo, &index);
1270+
rc = pmix_add_errhandler(cd->err, cd->info, cd->ninfo, &index);
12391271
pmix_output_verbose(2, pmix_globals.debug_output,
12401272
"pmix_server_register_errhandler - success index =%d", index);
1241-
cd->cbfunc.errregcbfn (rc, index, cd->cbdata);
1273+
cd->cbfunc.errregcbfn(rc, index, cd->cbdata);
12421274
}
12431275
cd->active = false;
12441276
PMIX_RELEASE(cd);
@@ -1250,24 +1282,30 @@ void pmix_server_register_errhandler(pmix_info_t info[], size_t ninfo,
12501282
void *cbdata)
12511283
{
12521284
pmix_shift_caddy_t *cd;
1285+
12531286
/* need to thread shift this request */
12541287
cd = PMIX_NEW(pmix_shift_caddy_t);
12551288
cd->info = info;
12561289
cd->ninfo = ninfo;
12571290
cd->err = errhandler;
12581291
cd->cbfunc.errregcbfn = cbfunc;
12591292
cd->cbdata = cbdata;
1293+
12601294
pmix_output_verbose(2, pmix_globals.debug_output,
12611295
"pmix_server_register_errhandler shifting to server thread");
1296+
12621297
PMIX_THREADSHIFT(cd, reg_errhandler);
12631298
}
12641299

12651300
static void dereg_errhandler(int sd, short args, void *cbdata)
12661301
{
12671302
pmix_status_t rc;
12681303
pmix_shift_caddy_t *cd = (pmix_shift_caddy_t*)cbdata;
1269-
rc = pmix_remove_errhandler (cd->ref);
1270-
cd->cbfunc.opcbfn(rc, cd->cbdata);
1304+
1305+
rc = pmix_remove_errhandler(cd->ref);
1306+
if (NULL != cd->cbfunc.opcbfn) {
1307+
cd->cbfunc.opcbfn(rc, cd->cbdata);
1308+
}
12711309
cd->active = false;
12721310
}
12731311

@@ -1276,12 +1314,14 @@ void pmix_server_deregister_errhandler(int errhandler_ref,
12761314
void *cbdata)
12771315
{
12781316
pmix_shift_caddy_t *cd;
1317+
12791318
/* need to thread shift this request */
12801319
cd = PMIX_NEW(pmix_shift_caddy_t);
12811320
cd->cbfunc.opcbfn = cbfunc;
12821321
cd->cbdata = cbdata;
12831322
cd->ref = errhandler_ref;
12841323
PMIX_THREADSHIFT(cd, dereg_errhandler);
1324+
12851325
PMIX_WAIT_FOR_COMPLETION(cd->active);
12861326
PMIX_RELEASE(cd);
12871327
}

opal/mca/pmix/pmix120/pmix/src/util/error.c

Lines changed: 33 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -165,8 +165,10 @@ void pmix_errhandler_invoke(pmix_status_t status,
165165
PMIX_INFO_CREATE(iptr, ninfo+1);
166166
(void)strncpy(iptr[0].key, PMIX_ERROR_HANDLER_ID, PMIX_MAX_KEYLEN);
167167
iptr[0].value.type = PMIX_INT;
168-
for (j=0; j < ninfo; j++) {
169-
PMIX_INFO_LOAD(&iptr[j+1], info[j].key, &info[j].value.data, info[j].value.type);
168+
if (NULL != info) {
169+
for (j=0; j < ninfo; j++) {
170+
PMIX_INFO_LOAD(&iptr[j+1], info[j].key, &info[j].value.data, info[j].value.type);
171+
}
170172
}
171173

172174
for (i = 0; i < pmix_globals.errregs.size; i++) {
@@ -221,9 +223,10 @@ pmix_status_t pmix_lookup_errhandler(pmix_notification_fn_t err,
221223
int i;
222224
pmix_status_t rc = PMIX_ERR_NOT_FOUND;
223225
pmix_error_reg_info_t *errreg = NULL;
226+
224227
for (i = 0; i < pmix_pointer_array_get_size(&pmix_globals.errregs) ; i++) {
225-
errreg = (pmix_error_reg_info_t*) pmix_pointer_array_get_item (&pmix_globals.errregs, i);
226-
if((NULL != errreg) && (err == errreg->errhandler)) {
228+
errreg = (pmix_error_reg_info_t*)pmix_pointer_array_get_item(&pmix_globals.errregs, i);
229+
if ((NULL != errreg) && (err == errreg->errhandler)) {
227230
*index = i;
228231
rc = PMIX_SUCCESS;
229232
break;
@@ -238,36 +241,45 @@ pmix_status_t pmix_add_errhandler(pmix_notification_fn_t err,
238241
{
239242
int i;
240243
pmix_status_t rc = PMIX_SUCCESS;
241-
pmix_error_reg_info_t *errreg = PMIX_NEW(pmix_error_reg_info_t);
244+
pmix_error_reg_info_t *errreg;
245+
246+
errreg = PMIX_NEW(pmix_error_reg_info_t);
242247
errreg->errhandler = err;
243248
errreg->ninfo = ninfo;
244-
PMIX_INFO_CREATE(errreg->info, ninfo);
245-
for (i=0; i < ninfo; i++) {
246-
memcpy(errreg->info[i].key, info[i].key, PMIX_MAX_KEYLEN);
247-
pmix_value_xfer(&errreg->info[i].value, &info[i].value);
249+
if (NULL != info && 0 < ninfo) {
250+
PMIX_INFO_CREATE(errreg->info, ninfo);
251+
for (i=0; i < ninfo; i++) {
252+
(void)strncpy(errreg->info[i].key, info[i].key, PMIX_MAX_KEYLEN);
253+
pmix_value_xfer(&errreg->info[i].value, &info[i].value);
254+
}
248255
}
249-
*index = pmix_pointer_array_add (&pmix_globals.errregs, errreg);
256+
*index = pmix_pointer_array_add(&pmix_globals.errregs, errreg);
250257
pmix_output_verbose(2, pmix_globals.debug_output,
251258
"pmix_add_errhandler index =%d", *index);
252-
if (-1 == *index)
259+
if (*index < 0) {
260+
PMIX_RELEASE(errreg);
253261
rc = PMIX_ERROR;
262+
}
254263
return rc;
255264
}
256265

257266
pmix_status_t pmix_remove_errhandler(int errhandler_ref)
258267
{
259268
int rc = PMIX_SUCCESS;
260269
pmix_error_reg_info_t *errreg;
261-
errreg = (pmix_error_reg_info_t*) pmix_pointer_array_get_item (&pmix_globals.errregs,
262-
errhandler_ref);
263-
if (NULL != errreg)
270+
271+
errreg = (pmix_error_reg_info_t*)pmix_pointer_array_get_item(&pmix_globals.errregs,
272+
errhandler_ref);
273+
if (NULL != errreg) {
264274
PMIX_RELEASE(errreg);
265-
else
275+
pmix_pointer_array_set_item(&pmix_globals.errregs, errhandler_ref, NULL);
276+
} else {
266277
rc = PMIX_ERR_NOT_FOUND;
278+
}
267279
return rc;
268280
}
269281

270-
void pmix_get_errorgroup ( pmix_status_t status, char *pmix_error_group)
282+
void pmix_get_errorgroup(pmix_status_t status, char *pmix_error_group)
271283
{
272284
switch(status) {
273285
case PMIX_ERR_UNREACH:
@@ -276,24 +288,24 @@ void pmix_get_errorgroup ( pmix_status_t status, char *pmix_error_group)
276288
case PMIX_ERR_TIMEOUT:
277289
case PMIX_ERR_PACK_FAILURE:
278290
case PMIX_ERR_UNPACK_FAILURE:
279-
strcpy(pmix_error_group, PMIX_ERROR_GROUP_COMM);
291+
(void)strncpy(pmix_error_group, PMIX_ERROR_GROUP_COMM, PMIX_MAX_KEYLEN);
280292
break;
281293
case PMIX_ERR_OUT_OF_RESOURCE:
282294
case PMIX_ERR_RESOURCE_BUSY:
283295
case PMIX_ERR_NOMEM:
284-
strcpy(pmix_error_group, PMIX_ERROR_GROUP_RESOURCE);
296+
(void)strncpy(pmix_error_group, PMIX_ERROR_GROUP_RESOURCE, PMIX_MAX_KEYLEN);
285297
break;
286298
case PMIX_ERR_PROC_MIGRATE:
287299
case PMIX_ERR_PROC_CHECKPOINT:
288300
case PMIX_ERR_PROC_RESTART:
289-
strcpy(pmix_error_group, PMIX_ERROR_GROUP_MIGRATE);
301+
(void)strncpy(pmix_error_group, PMIX_ERROR_GROUP_MIGRATE, PMIX_MAX_KEYLEN);
290302
break;
291303
case PMIX_ERR_PROC_ABORTING:
292304
case PMIX_ERR_PROC_REQUESTED_ABORT:
293305
case PMIX_ERR_PROC_ABORTED:
294-
strcpy(pmix_error_group, PMIX_ERROR_GROUP_ABORT);
306+
(void)strncpy(pmix_error_group, PMIX_ERROR_GROUP_ABORT, PMIX_MAX_KEYLEN);
295307
break;
296308
default:
297-
strcpy(pmix_error_group, PMIX_ERROR_GROUP_GENERAL);
309+
(void)strncpy(pmix_error_group, PMIX_ERROR_GROUP_GENERAL, PMIX_MAX_KEYLEN);
298310
}
299311
}

orte/mca/oob/tcp/oob_tcp_component.c

Lines changed: 28 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -970,27 +970,26 @@ void mca_oob_tcp_component_lost_connection(int fd, short args, void *cbdata)
970970
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
971971
ORTE_NAME_PRINT(&pop->peer));
972972

973-
MCA_OOB_TCP_CHECK_SHUTDOWN(pop);
974-
975973
/* Mark that we no longer support this peer */
976974
memcpy(&ui64, (char*)&pop->peer, sizeof(uint64_t));
977975
if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_oob_base.peers,
978976
ui64, (void**)&bpr) || NULL == bpr) {
979977
bpr = OBJ_NEW(orte_oob_base_peer_t);
980978
}
981979
opal_bitmap_clear_bit(&bpr->addressable, mca_oob_tcp_component.super.idx);
982-
if (OPAL_SUCCESS != (rc = opal_hash_table_set_value_uint64(&orte_oob_base.peers,
980+
if (OPAL_SUCCESS != (rc = opal_hash_table_set_value_uint64(&orte_oob_base.peers,
983981
ui64, NULL))) {
984982
ORTE_ERROR_LOG(rc);
985983
}
986984

987-
/* activate the proc state */
988-
if (ORTE_SUCCESS != orte_routed.route_lost(&pop->peer)) {
989-
ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_LIFELINE_LOST);
990-
} else {
991-
ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_COMM_FAILED);
985+
if (!orte_finalizing) {
986+
/* activate the proc state */
987+
if (ORTE_SUCCESS != orte_routed.route_lost(&pop->peer)) {
988+
ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_LIFELINE_LOST);
989+
} else {
990+
ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_COMM_FAILED);
991+
}
992992
}
993-
994993
OBJ_RELEASE(pop);
995994
}
996995

@@ -1006,8 +1005,6 @@ void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata)
10061005
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
10071006
ORTE_NAME_PRINT(&mop->hop));
10081007

1009-
MCA_OOB_TCP_CHECK_SHUTDOWN(mop);
1010-
10111008
/* mark that we cannot reach this hop */
10121009
memcpy(&ui64, (char*)&(mop->hop), sizeof(uint64_t));
10131010
if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_oob_base.peers,
@@ -1020,11 +1017,16 @@ void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata)
10201017
ORTE_ERROR_LOG(rc);
10211018
}
10221019

1023-
/* if this was a lifeline, then alert */
1024-
if (ORTE_SUCCESS != orte_routed.route_lost(&mop->hop)) {
1025-
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_LIFELINE_LOST);
1026-
} else {
1027-
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_COMM_FAILED);
1020+
/* report the error back to the OOB and let it try other components
1021+
* or declare a problem
1022+
*/
1023+
if (!orte_finalizing && !orte_abnormal_term_ordered) {
1024+
/* if this was a lifeline, then alert */
1025+
if (ORTE_SUCCESS != orte_routed.route_lost(&mop->hop)) {
1026+
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_LIFELINE_LOST);
1027+
} else {
1028+
ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_COMM_FAILED);
1029+
}
10281030
}
10291031

10301032
OBJ_RELEASE(mop);
@@ -1042,7 +1044,11 @@ void mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata)
10421044
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
10431045
ORTE_NAME_PRINT(&mop->hop));
10441046

1045-
MCA_OOB_TCP_CHECK_SHUTDOWN(mop);
1047+
if (orte_finalizing || orte_abnormal_term_ordered) {
1048+
/* just ignore the problem */
1049+
OBJ_RELEASE(mop);
1050+
return;
1051+
}
10461052

10471053
/* mark that this component cannot reach this hop */
10481054
memcpy(&ui64, (char*)&(mop->hop), sizeof(uint64_t));
@@ -1110,7 +1116,11 @@ void mca_oob_tcp_component_failed_to_connect(int fd, short args, void *cbdata)
11101116
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
11111117
ORTE_NAME_PRINT(&pop->peer));
11121118

1113-
MCA_OOB_TCP_CHECK_SHUTDOWN(pop);
1119+
/* if we are terminating, then don't attempt to reconnect */
1120+
if (orte_orteds_term_ordered || orte_finalizing || orte_abnormal_term_ordered) {
1121+
OBJ_RELEASE(pop);
1122+
return;
1123+
}
11141124

11151125
/* activate the proc state */
11161126
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,

0 commit comments

Comments
 (0)