Skip to content

Commit 734b90a

Browse files
author
Ralph Castain
committed
Adjust the timeout for direct modex requests to reflect the size of the job. It can take several seconds to start all the procs, and we don't want to time out due to differences in the start times of the various procs.
Signed-off-by: Ralph Castain <[email protected]>
1 parent 9cb18b8 commit 734b90a

File tree

3 files changed

+20
-1
lines changed

3 files changed

+20
-1
lines changed

orte/orted/pmix/pmix_server.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -536,6 +536,9 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
536536
req->proxy = *sender;
537537
req->target = idreq;
538538
req->remote_room_num = room_num;
539+
/* adjust the timeout to reflect the size of the job as it can take some
540+
* amount of time to start the job */
541+
ORTE_ADJUST_TIMEOUT(req);
539542
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
540543
OBJ_RELEASE(req);
541544
send_error(rc, &idreq, sender);
@@ -558,6 +561,9 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
558561
req->proxy = *sender;
559562
req->target = idreq;
560563
req->remote_room_num = room_num;
564+
/* adjust the timeout to reflect the size of the job as it can take some
565+
* amount of time to start the job */
566+
ORTE_ADJUST_TIMEOUT(req);
561567
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
562568
OBJ_RELEASE(req);
563569
send_error(rc, &idreq, sender);

orte/orted/pmix/pmix_server_fence.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
* All rights reserved.
1414
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
1515
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
16-
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
16+
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
1717
* Copyright (c) 2014 Mellanox Technologies, Inc.
1818
* All rights reserved.
1919
* Copyright (c) 2014-2017 Research Organization for Information Science
@@ -148,6 +148,10 @@ static void dmodex_req(int sd, short args, void *cbdata)
148148
return;
149149
}
150150

151+
/* adjust the timeout to reflect the size of the job as it can take some
152+
* amount of time to start the job */
153+
ORTE_ADJUST_TIMEOUT(req);
154+
151155
/* has anyone already requested data for this target? If so,
152156
* then the data is already on its way */
153157
for (rnum=0; rnum < orte_pmix_server_globals.reqs.num_rooms; rnum++) {

orte/orted/pmix/pmix_server_internal.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,15 @@
4848

4949
BEGIN_C_DECLS
5050

51+
/* Lower bound applied to the timeout of every request that goes through
 * ORTE_ADJUST_TIMEOUT. NOTE(review): the unit is presumably seconds -
 * confirm against the code that consumes the request's timeout field. */
#define ORTED_PMIX_MIN_DMX_TIMEOUT      10

/*
 * Set the timeout of request (a) according to the size of the job.
 *
 * The value is computed as (2 * orte_process_info.num_daemons) / 1000
 * using integer division, and is then raised to
 * ORTED_PMIX_MIN_DMX_TIMEOUT if it falls below that floor. The floor
 * therefore applies to every job with fewer than 5000 daemons.
 *
 * (a) must be a pointer to an object with an assignable integer
 * `timeout` member. The argument is evaluated more than once, so do not
 * pass an expression with side effects.
 */
#define ORTE_ADJUST_TIMEOUT(a)                                      \
    do {                                                            \
        (a)->timeout = (2 * orte_process_info.num_daemons) / 1000;  \
        if ((a)->timeout < ORTED_PMIX_MIN_DMX_TIMEOUT) {            \
            (a)->timeout = ORTED_PMIX_MIN_DMX_TIMEOUT;              \
        }                                                           \
    } while(0)
59+
5160
/* object for tracking requests so we can
5261
* correctly route the eventual reply */
5362
typedef struct {

0 commit comments

Comments
 (0)