@@ -83,6 +83,8 @@ static void pmix_server_dmdx_resp(int status, orte_process_name_t* sender,
8383 opal_buffer_t * buffer ,
8484 orte_rml_tag_t tg , void * cbdata );
8585
86+ #define ORTE_PMIX_SERVER_MIN_ROOMS 4096
87+
8688pmix_server_globals_t orte_pmix_server_globals = {0 };
8789
8890static opal_pmix_server_module_t pmix_server = {
@@ -122,7 +124,7 @@ void pmix_server_register_params(void)
122124 orte_pmix_server_globals .verbosity );
123125 }
124126 /* specify the size of the hotel - a default of -1 means "not set by the user", in which case pmix_server_init picks the size */
125- orte_pmix_server_globals .num_rooms = 256 ;
127+ orte_pmix_server_globals .num_rooms = -1 ;
126128 (void ) mca_base_var_register ("orte" , "pmix" , NULL , "server_max_reqs" ,
127129 "Maximum number of backlogged PMIx server direct modex requests" ,
128130 MCA_BASE_VAR_TYPE_INT , NULL , 0 , 0 ,
@@ -158,7 +160,7 @@ static void eviction_cbfunc(struct opal_hotel_t *hotel,
158160{
159161 pmix_server_req_t * req = (pmix_server_req_t * )occupant ;
160162 bool timeout = false;
161- int rc ;
163+ int rc = OPAL_ERR_TIMEOUT ;
162164
163165 /* decrement the request timeout */
164166 req -> timeout -= orte_pmix_server_globals .timeout ;
@@ -175,6 +177,8 @@ static void eviction_cbfunc(struct opal_hotel_t *hotel,
175177 }
176178 ORTE_ERROR_LOG (rc );
177179 /* fall thru and return an error so the caller doesn't hang */
180+ } else {
181+ orte_show_help ("help-orted.txt" , "timedout" , true, req -> operation );
178182 }
179183 /* don't let the caller hang */
180184 if (NULL != req -> opcbfunc ) {
@@ -205,6 +209,17 @@ int pmix_server_init(void)
205209
206210 /* setup the server's state variables */
207211 OBJ_CONSTRUCT (& orte_pmix_server_globals .reqs , opal_hotel_t );
212+ /* by the time we init the server, we should know how many nodes we
213+ * have in our environment - with the exception of mpirun. If the
214+ * user specified the size of the hotel, then use that value. Otherwise,
215+ * use twice the number of procs, but never fewer than
216+ * ORTE_PMIX_SERVER_MIN_ROOMS, to avoid running out of rooms on large machines */
217+ if (-1 == orte_pmix_server_globals .num_rooms ) {
218+ orte_pmix_server_globals .num_rooms = orte_process_info .num_procs * 2 ;
219+ if (orte_pmix_server_globals .num_rooms < ORTE_PMIX_SERVER_MIN_ROOMS ) {
220+ orte_pmix_server_globals .num_rooms = ORTE_PMIX_SERVER_MIN_ROOMS ;
221+ }
222+ }
208223 if (OPAL_SUCCESS != (rc = opal_hotel_init (& orte_pmix_server_globals .reqs ,
209224 orte_pmix_server_globals .num_rooms ,
210225 orte_event_base , orte_pmix_server_globals .timeout * 1000000 ,
@@ -533,13 +548,15 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
533548 * condition, so just log the request and we will fill
534549 * it later */
535550 req = OBJ_NEW (pmix_server_req_t );
551+ (void )asprintf (& req -> operation , "DMDX: %s:%d" , __FILE__ , __LINE__ );
536552 req -> proxy = * sender ;
537553 req -> target = idreq ;
538554 req -> remote_room_num = room_num ;
539555 /* adjust the timeout to reflect the size of the job as it can take some
540556 * amount of time to start the job */
541557 ORTE_ADJUST_TIMEOUT (req );
542558 if (OPAL_SUCCESS != (rc = opal_hotel_checkin (& orte_pmix_server_globals .reqs , req , & req -> room_num ))) {
559+ ORTE_ERROR_LOG (rc );
543560 OBJ_RELEASE (req );
544561 send_error (rc , & idreq , sender );
545562 }
@@ -558,13 +575,15 @@ static void pmix_server_dmdx_recv(int status, orte_process_name_t* sender,
558575 /* track the request since the call down to the PMIx server
559576 * is asynchronous */
560577 req = OBJ_NEW (pmix_server_req_t );
578+ (void )asprintf (& req -> operation , "DMDX: %s:%d" , __FILE__ , __LINE__ );
561579 req -> proxy = * sender ;
562580 req -> target = idreq ;
563581 req -> remote_room_num = room_num ;
564582 /* adjust the timeout to reflect the size of the job as it can take some
565583 * amount of time to start the job */
566584 ORTE_ADJUST_TIMEOUT (req );
567585 if (OPAL_SUCCESS != (rc = opal_hotel_checkin (& orte_pmix_server_globals .reqs , req , & req -> room_num ))) {
586+ ORTE_ERROR_LOG (rc );
568587 OBJ_RELEASE (req );
569588 send_error (rc , & idreq , sender );
570589 return ;
@@ -696,6 +715,7 @@ OBJ_CLASS_INSTANCE(orte_pmix_server_op_caddy_t,
696715
697716static void rqcon (pmix_server_req_t * p )
698717{
718+ p -> operation = NULL ;
699719 p -> target = * ORTE_NAME_INVALID ;
700720 p -> proxy = * ORTE_NAME_INVALID ;
701721 p -> timeout = orte_pmix_server_globals .timeout ;
@@ -710,6 +730,9 @@ static void rqcon(pmix_server_req_t *p)
710730}
711731static void rqdes (pmix_server_req_t * p )
712732{
733+ if (NULL != p -> operation ) {
734+ free (p -> operation );
735+ }
713736 if (NULL != p -> jdata ) {
714737 OBJ_RELEASE (p -> jdata );
715738 }
0 commit comments