Skip to content

Commit b7e9711

Browse files
author
Ralph Castain
committed
Resolve the direct modex race condition. The request hotel was running out of rooms, thereby returning an error upon check-in, and we had missed error-logging in a couple of those places. Hence there was no error message and things just hung.
Output a (hopefully) helpful message when we time out an operation. Thanks to Nathan for tracking it down. Signed-off-by: Ralph Castain <[email protected]>
1 parent 9a69b20 commit b7e9711

File tree

4 files changed

+41
-5
lines changed

4 files changed

+41
-5
lines changed

orte/orted/help-orted.txt

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
# University of Stuttgart. All rights reserved.
1111
# Copyright (c) 2004-2005 The Regents of the University of California.
1212
# All rights reserved.
13-
# Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
13+
# Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
1414
# $COPYRIGHT$
1515
#
1616
# Additional copyrights may follow
@@ -60,3 +60,11 @@ info key:
6060
key: %s
6161

6262
The operation will continue, but may not behave completely as expected.
63+
#
64+
[timedout]
65+
A request has timed out and will therefore fail:
66+
67+
Operation: %s
68+
69+
Your job may terminate as a result of this problem. You may want to
70+
adjust the MCA parameter pmix_server_max_wait and try again.

orte/orted/pmix/pmix_server.c

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,8 @@ static void pmix_server_dmdx_resp(int status, orte_process_name_t* sender,
8383
opal_buffer_t *buffer,
8484
orte_rml_tag_t tg, void *cbdata);
8585

86+
#define ORTE_PMIX_SERVER_MIN_ROOMS 4096
87+
8688
pmix_server_globals_t orte_pmix_server_globals = {0};
8789

8890
static opal_pmix_server_module_t pmix_server = {
@@ -122,7 +124,7 @@ void pmix_server_register_params(void)
122124
orte_pmix_server_globals.verbosity);
123125
}
124126
/* specify the size of the hotel */
125-
orte_pmix_server_globals.num_rooms = 256;
127+
orte_pmix_server_globals.num_rooms = -1;
126128
(void) mca_base_var_register ("orte", "pmix", NULL, "server_max_reqs",
127129
"Maximum number of backlogged PMIx server direct modex requests",
128130
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
@@ -158,7 +160,7 @@ static void eviction_cbfunc(struct opal_hotel_t *hotel,
158160
{
159161
pmix_server_req_t *req = (pmix_server_req_t*)occupant;
160162
bool timeout = false;
161-
int rc;
163+
int rc=OPAL_ERR_TIMEOUT;
162164

163165
/* decrement the request timeout */
164166
req->timeout -= orte_pmix_server_globals.timeout;
@@ -175,6 +177,8 @@ static void eviction_cbfunc(struct opal_hotel_t *hotel,
175177
}
176178
ORTE_ERROR_LOG(rc);
177179
/* fall thru and return an error so the caller doesn't hang */
180+
} else {
181+
orte_show_help("help-orted.txt", "timedout", true, req->operation);
178182
}
179183
/* don't let the caller hang */
180184
if (NULL != req->opcbfunc) {
@@ -205,6 +209,17 @@ int pmix_server_init(void)
205209

206210
/* setup the server's state variables */
207211
OBJ_CONSTRUCT(&orte_pmix_server_globals.reqs, opal_hotel_t);
212+
/* by the time we init the server, we should know how many nodes we
213+
* have in our environment - with the exception of mpirun. If the
214+
* user specified the size of the hotel, then use that value. Otherwise,
215+
* set the value to something large to avoid running out of rooms on
216+
* large machines */
217+
if (-1 == orte_pmix_server_globals.num_rooms) {
218+
orte_pmix_server_globals.num_rooms = orte_process_info.num_procs * 2;
219+
if (orte_pmix_server_globals.num_rooms < ORTE_PMIX_SERVER_MIN_ROOMS) {
220+
orte_pmix_server_globals.num_rooms = ORTE_PMIX_SERVER_MIN_ROOMS;
221+
}
222+
}
208223
if (OPAL_SUCCESS != (rc = opal_hotel_init(&orte_pmix_server_globals.reqs,
209224
orte_pmix_server_globals.num_rooms,
210225
orte_event_base, orte_pmix_server_globals.timeout*1000000,
@@ -533,13 +548,15 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
533548
* condition, so just log the request and we will fill
534549
* it later */
535550
req = OBJ_NEW(pmix_server_req_t);
551+
(void)asprintf(&req->operation, "DMDX: %s:%d", __FILE__, __LINE__);
536552
req->proxy = *sender;
537553
req->target = idreq;
538554
req->remote_room_num = room_num;
539555
/* adjust the timeout to reflect the size of the job as it can take some
540556
* amount of time to start the job */
541557
ORTE_ADJUST_TIMEOUT(req);
542558
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
559+
ORTE_ERROR_LOG(rc);
543560
OBJ_RELEASE(req);
544561
send_error(rc, &idreq, sender);
545562
}
@@ -558,13 +575,15 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
558575
/* track the request since the call down to the PMIx server
559576
* is asynchronous */
560577
req = OBJ_NEW(pmix_server_req_t);
578+
(void)asprintf(&req->operation, "DMDX: %s:%d", __FILE__, __LINE__);
561579
req->proxy = *sender;
562580
req->target = idreq;
563581
req->remote_room_num = room_num;
564582
/* adjust the timeout to reflect the size of the job as it can take some
565583
* amount of time to start the job */
566584
ORTE_ADJUST_TIMEOUT(req);
567585
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
586+
ORTE_ERROR_LOG(rc);
568587
OBJ_RELEASE(req);
569588
send_error(rc, &idreq, sender);
570589
return;
@@ -696,6 +715,7 @@ OBJ_CLASS_INSTANCE(orte_pmix_server_op_caddy_t,
696715

697716
static void rqcon(pmix_server_req_t *p)
698717
{
718+
p->operation = NULL;
699719
p->target = *ORTE_NAME_INVALID;
700720
p->proxy = *ORTE_NAME_INVALID;
701721
p->timeout = orte_pmix_server_globals.timeout;
@@ -710,6 +730,9 @@ static void rqcon(pmix_server_req_t *p)
710730
}
711731
static void rqdes(pmix_server_req_t *p)
712732
{
733+
if (NULL != p->operation) {
734+
free(p->operation);
735+
}
713736
if (NULL != p->jdata) {
714737
OBJ_RELEASE(p->jdata);
715738
}

orte/orted/pmix/pmix_server_internal.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@
6262
typedef struct {
6363
opal_object_t super;
6464
opal_event_t ev;
65+
char *operation;
6566
int status;
6667
int timeout;
6768
int room_num;
@@ -109,6 +110,7 @@ OBJ_CLASS_DECLARATION(orte_pmix_mdx_caddy_t);
109110
do { \
110111
pmix_server_req_t *_req; \
111112
_req = OBJ_NEW(pmix_server_req_t); \
113+
(void)asprintf(&_req->operation, "DMDX: %s:%d", __FILE__, __LINE__); \
112114
_req->target = (p); \
113115
_req->mdxcbfunc = (ocf); \
114116
_req->cbdata = (ocd); \
@@ -122,6 +124,7 @@ OBJ_CLASS_DECLARATION(orte_pmix_mdx_caddy_t);
122124
do { \
123125
pmix_server_req_t *_req; \
124126
_req = OBJ_NEW(pmix_server_req_t); \
127+
(void)asprintf(&_req->operation, "SPAWN: %s:%d", __FILE__, __LINE__); \
125128
_req->jdata = (j); \
126129
_req->spcbfunc = (ocf); \
127130
_req->cbdata = (ocd); \

orte/orted/pmix/pmix_server_pub.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
* All rights reserved.
1414
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
1515
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
16-
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
16+
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
1717
* Copyright (c) 2014 Mellanox Technologies, Inc.
1818
* All rights reserved.
1919
* Copyright (c) 2014-2016 Research Organization for Information Science
@@ -100,6 +100,7 @@ int pmix_server_publish_fn(opal_process_name_t *proc,
100100

101101
/* create the caddy */
102102
req = OBJ_NEW(pmix_server_req_t);
103+
(void)asprintf(&req->operation, "PUBLISH: %s:%d", __FILE__, __LINE__);
103104
req->opcbfunc = cbfunc;
104105
req->cbdata = cbdata;
105106

@@ -207,6 +208,7 @@ int pmix_server_lookup_fn(opal_process_name_t *proc, char **keys,
207208

208209
/* create the caddy */
209210
req = OBJ_NEW(pmix_server_req_t);
211+
(void)asprintf(&req->operation, "LOOKUP: %s:%d", __FILE__, __LINE__);
210212
req->lkcbfunc = cbfunc;
211213
req->cbdata = cbdata;
212214

@@ -302,6 +304,7 @@ int pmix_server_unpublish_fn(opal_process_name_t *proc, char **keys,
302304

303305
/* create the caddy */
304306
req = OBJ_NEW(pmix_server_req_t);
307+
(void)asprintf(&req->operation, "UNPUBLISH: %s:%d", __FILE__, __LINE__);
305308
req->opcbfunc = cbfunc;
306309
req->cbdata = cbdata;
307310

@@ -468,4 +471,3 @@ void pmix_server_keyval_client(int status, orte_process_name_t* sender,
468471
OBJ_RELEASE(req);
469472
}
470473
}
471-

0 commit comments

Comments
 (0)