Skip to content

Commit bf668ad

Browse files
author
Ralph Castain
authored
Merge pull request #3287 from rhc54/topic/ht
Provide further (hopefully) helpful messages about the hotel size
2 parents 840d6c9 + db8943c commit bf668ad

File tree

5 files changed: 21 additions and 7 deletions

orte/orted/help-orted.txt

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,3 +68,15 @@ A request has timed out and will therefore fail:
 
 Your job may terminate as a result of this problem. You may want to
 adjust the MCA parameter pmix_server_max_wait and try again.
+#
+[noroom]
+A request for an asynchronous runtime operation cannot be fulfilled
+because of a lack of room in the tracking array:
+
+Operation: %s
+Number of rooms: %d
+
+This is usually caused by a large job that encounters significant
+delays across the cluster when starting the application processes.
+Your job may terminate as a result of this problem. You may want to
+adjust the MCA parameter pmix_server_max_reqs and try again.

orte/orted/pmix/pmix_server.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -556,7 +556,7 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
556556
* amount of time to start the job */
557557
ORTE_ADJUST_TIMEOUT(req);
558558
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
559-
ORTE_ERROR_LOG(rc);
559+
orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms);
560560
OBJ_RELEASE(req);
561561
send_error(rc, &idreq, sender);
562562
}
@@ -583,7 +583,7 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
583583
* amount of time to start the job */
584584
ORTE_ADJUST_TIMEOUT(req);
585585
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
586-
ORTE_ERROR_LOG(rc);
586+
orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms);
587587
OBJ_RELEASE(req);
588588
send_error(rc, &idreq, sender);
589589
return;

orte/orted/pmix/pmix_server_dyn.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ static void spawn(int sd, short args, void *cbdata)
105105

106106
/* add this request to our tracker hotel */
107107
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
108-
ORTE_ERROR_LOG(rc);
108+
orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms);
109109
goto callback;
110110
}
111111

orte/orted/pmix/pmix_server_fence.c

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737

3838
#include "orte/mca/errmgr/errmgr.h"
3939
#include "orte/util/name_fns.h"
40+
#include "orte/util/show_help.h"
4041
#include "orte/runtime/orte_globals.h"
4142
#include "orte/mca/grpcomm/grpcomm.h"
4243
#include "orte/mca/rml/rml.h"
@@ -164,7 +165,7 @@ static void dmodex_req(int sd, short args, void *cbdata)
164165
/* save the request in the hotel until the
165166
* data is returned */
166167
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
167-
ORTE_ERROR_LOG(rc);
168+
orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms);
168169
/* can't just return as that would cause the requestor
169170
* to hang, so instead execute the callback */
170171
goto callback;
@@ -180,7 +181,7 @@ static void dmodex_req(int sd, short args, void *cbdata)
180181
* that we don't know about yet. In this case, just
181182
* record the request and we will process it later */
182183
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
183-
ORTE_ERROR_LOG(rc);
184+
orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms);
184185
/* can't just return as that would cause the requestor
185186
* to hang, so instead execute the callback */
186187
goto callback;
@@ -209,7 +210,7 @@ static void dmodex_req(int sd, short args, void *cbdata)
209210
/* track the request so we know the function and cbdata
210211
* to callback upon completion */
211212
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
212-
ORTE_ERROR_LOG(rc);
213+
orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms);
213214
goto callback;
214215
}
215216

orte/orted/pmix/pmix_server_pub.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838

3939
#include "orte/mca/errmgr/errmgr.h"
4040
#include "orte/util/name_fns.h"
41+
#include "orte/util/show_help.h"
4142
#include "orte/runtime/orte_data_server.h"
4243
#include "orte/runtime/orte_globals.h"
4344
#include "orte/mca/rml/rml.h"
@@ -52,7 +53,7 @@ static void execute(int sd, short args, void *cbdata)
5253

5354
/* add this request to our tracker hotel */
5455
if (OPAL_SUCCESS != (rc = opal_hotel_checkin(&orte_pmix_server_globals.reqs, req, &req->room_num))) {
55-
ORTE_ERROR_LOG(rc);
56+
orte_show_help("help-orted.txt", "noroom", true, req->operation, orte_pmix_server_globals.num_rooms);
5657
goto callback;
5758
}
5859

0 commit comments

Comments
 (0)