Skip to content

Commit 9132bb2

Browse files
author
Ralph Castain
authored
Merge pull request #3281 from rhc54/topic/dmx
Adjust the timeout for direct modex requests to reflect the size of t…
2 parents 9cb18b8 + 734b90a commit 9132bb2

File tree

3 files changed

+20
-1
lines changed

3 files changed

+20
-1
lines changed

orte/orted/pmix/pmix_server.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -536,6 +536,9 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
536536
req->proxy = *sender;
537537
req->target = idreq;
538538
req->remote_room_num = room_num;
539+
/* adjust the timeout to reflect the size of the job as it can take some
540+
* amount of time to start the job */
541+
ORTE_ADJUST_TIMEOUT(req);
539542
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
540543
OBJ_RELEASE(req);
541544
send_error(rc, &idreq, sender);
@@ -558,6 +561,9 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
558561
req->proxy = *sender;
559562
req->target = idreq;
560563
req->remote_room_num = room_num;
564+
/* adjust the timeout to reflect the size of the job as it can take some
565+
* amount of time to start the job */
566+
ORTE_ADJUST_TIMEOUT(req);
561567
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
562568
OBJ_RELEASE(req);
563569
send_error(rc, &idreq, sender);

orte/orted/pmix/pmix_server_fence.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
* All rights reserved.
1414
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
1515
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
16-
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
16+
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
1717
* Copyright (c) 2014 Mellanox Technologies, Inc.
1818
* All rights reserved.
1919
* Copyright (c) 2014-2017 Research Organization for Information Science
@@ -148,6 +148,10 @@ static void dmodex_req(int sd, short args, void *cbdata)
148148
return;
149149
}
150150

151+
/* adjust the timeout to reflect the size of the job as it can take some
152+
* amount of time to start the job */
153+
ORTE_ADJUST_TIMEOUT(req);
154+
151155
/* has anyone already requested data for this target? If so,
152156
* then the data is already on its way */
153157
for (rnum=0; rnum < orte_pmix_server_globals.reqs.num_rooms; rnum++) {

orte/orted/pmix/pmix_server_internal.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,15 @@
4848

4949
BEGIN_C_DECLS
5050

51+
/* Lower bound that ORTE_ADJUST_TIMEOUT enforces on a request's timeout.
 * NOTE(review): the unit is not stated here - presumably seconds; confirm
 * against the code that consumes the timeout field. */
#define ORTED_PMIX_MIN_DMX_TIMEOUT 10
52+
/* Set (a)->timeout to (2 * orte_process_info.num_daemons) / 1000, then raise
 * it to ORTED_PMIX_MIN_DMX_TIMEOUT if the computed value is smaller.
 *
 * "a" must be a pointer to an object with an assignable "timeout" member.
 * The argument is expanded several times, so pass an expression without
 * side effects.
 *
 * NOTE(review): if num_daemons is an integer type, the division truncates
 * and the computed value only reaches the minimum of 10 once num_daemons
 * is 5000 or more; below that the minimum always wins. Confirm the scale
 * factor is what was intended. */
#define ORTE_ADJUST_TIMEOUT(a) \
53+
do { \
54+
(a)->timeout = (2 * orte_process_info.num_daemons) / 1000; \
55+
if ((a)->timeout < ORTED_PMIX_MIN_DMX_TIMEOUT) { \
56+
(a)->timeout = ORTED_PMIX_MIN_DMX_TIMEOUT; \
57+
} \
58+
} while(0)
59+
5160
/* object for tracking requests so we can
5261
* correctly route the eventual reply */
5362
typedef struct {

0 commit comments

Comments
 (0)