@@ -83,6 +83,8 @@ static void pmix_server_dmdx_resp(int status, orte_process_name_t* sender,
83
83
opal_buffer_t * buffer ,
84
84
orte_rml_tag_t tg , void * cbdata );
85
85
86
+ #define ORTE_PMIX_SERVER_MIN_ROOMS 4096
87
+
86
88
pmix_server_globals_t orte_pmix_server_globals = {0 };
87
89
88
90
static opal_pmix_server_module_t pmix_server = {
@@ -122,7 +124,7 @@ void pmix_server_register_params(void)
122
124
orte_pmix_server_globals .verbosity );
123
125
}
124
126
/* specify the size of the hotel */
125
- orte_pmix_server_globals .num_rooms = 256 ;
127
+ orte_pmix_server_globals .num_rooms = -1 ;
126
128
(void ) mca_base_var_register ("orte" , "pmix" , NULL , "server_max_reqs" ,
127
129
"Maximum number of backlogged PMIx server direct modex requests" ,
128
130
MCA_BASE_VAR_TYPE_INT , NULL , 0 , 0 ,
@@ -158,7 +160,7 @@ static void eviction_cbfunc(struct opal_hotel_t *hotel,
158
160
{
159
161
pmix_server_req_t * req = (pmix_server_req_t * )occupant ;
160
162
bool timeout = false;
161
- int rc ;
163
+ int rc = OPAL_ERR_TIMEOUT ;
162
164
163
165
/* decrement the request timeout */
164
166
req -> timeout -= orte_pmix_server_globals .timeout ;
@@ -175,6 +177,8 @@ static void eviction_cbfunc(struct opal_hotel_t *hotel,
175
177
}
176
178
ORTE_ERROR_LOG (rc );
177
179
/* fall thru and return an error so the caller doesn't hang */
180
+ } else {
181
+ orte_show_help ("help-orted.txt" , "timedout" , true, req -> operation );
178
182
}
179
183
/* don't let the caller hang */
180
184
if (NULL != req -> opcbfunc ) {
@@ -205,6 +209,17 @@ int pmix_server_init(void)
205
209
206
210
/* setup the server's state variables */
207
211
OBJ_CONSTRUCT (& orte_pmix_server_globals .reqs , opal_hotel_t );
212
+ /* by the time we init the server, we should know how many nodes we
213
+ * have in our environment - with the exception of mpirun. If the
214
+ * user specified the size of the hotel, then use that value. Otherwise,
215
+ * set the value to something large to avoid running out of rooms on
216
+ * large machines */
217
+ if (-1 == orte_pmix_server_globals .num_rooms ) {
218
+ orte_pmix_server_globals .num_rooms = orte_process_info .num_procs * 2 ;
219
+ if (orte_pmix_server_globals .num_rooms < ORTE_PMIX_SERVER_MIN_ROOMS ) {
220
+ orte_pmix_server_globals .num_rooms = ORTE_PMIX_SERVER_MIN_ROOMS ;
221
+ }
222
+ }
208
223
if (OPAL_SUCCESS != (rc = opal_hotel_init (& orte_pmix_server_globals .reqs ,
209
224
orte_pmix_server_globals .num_rooms ,
210
225
orte_event_base , orte_pmix_server_globals .timeout * 1000000 ,
@@ -533,13 +548,15 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
533
548
* condition, so just log the request and we will fill
534
549
* it later */
535
550
req = OBJ_NEW (pmix_server_req_t );
551
+ (void )asprintf (& req -> operation , "DMDX: %s:%d" , __FILE__ , __LINE__ );
536
552
req -> proxy = * sender ;
537
553
req -> target = idreq ;
538
554
req -> remote_room_num = room_num ;
539
555
/* adjust the timeout to reflect the size of the job as it can take some
540
556
* amount of time to start the job */
541
557
ORTE_ADJUST_TIMEOUT (req );
542
558
if (OPAL_SUCCESS != (rc = opal_hotel_checkin (& orte_pmix_server_globals .reqs , req , & req -> room_num ))) {
559
+ ORTE_ERROR_LOG (rc );
543
560
OBJ_RELEASE (req );
544
561
send_error (rc , & idreq , sender );
545
562
}
@@ -558,13 +575,15 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
558
575
/* track the request since the call down to the PMIx server
559
576
* is asynchronous */
560
577
req = OBJ_NEW (pmix_server_req_t );
578
+ (void )asprintf (& req -> operation , "DMDX: %s:%d" , __FILE__ , __LINE__ );
561
579
req -> proxy = * sender ;
562
580
req -> target = idreq ;
563
581
req -> remote_room_num = room_num ;
564
582
/* adjust the timeout to reflect the size of the job as it can take some
565
583
* amount of time to start the job */
566
584
ORTE_ADJUST_TIMEOUT (req );
567
585
if (OPAL_SUCCESS != (rc = opal_hotel_checkin (& orte_pmix_server_globals .reqs , req , & req -> room_num ))) {
586
+ ORTE_ERROR_LOG (rc );
568
587
OBJ_RELEASE (req );
569
588
send_error (rc , & idreq , sender );
570
589
return ;
@@ -696,6 +715,7 @@ OBJ_CLASS_INSTANCE(orte_pmix_server_op_caddy_t,
696
715
697
716
static void rqcon (pmix_server_req_t * p )
698
717
{
718
+ p -> operation = NULL ;
699
719
p -> target = * ORTE_NAME_INVALID ;
700
720
p -> proxy = * ORTE_NAME_INVALID ;
701
721
p -> timeout = orte_pmix_server_globals .timeout ;
@@ -710,6 +730,9 @@ static void rqcon(pmix_server_req_t *p)
710
730
}
711
731
static void rqdes (pmix_server_req_t * p )
712
732
{
733
+ if (NULL != p -> operation ) {
734
+ free (p -> operation );
735
+ }
713
736
if (NULL != p -> jdata ) {
714
737
OBJ_RELEASE (p -> jdata );
715
738
}
0 commit comments