@@ -95,6 +95,34 @@ void ompi_rte_abort(int error_code, char *fmt, ...)
9595 exit (-1 );
9696}
9797
98+ static size_t handler = SIZE_MAX ; /* event-handler reference: stored by _register_fn, later passed to opal_pmix.deregister_evhandler; SIZE_MAX until assigned */
99+ static bool debugger_register_active = true; /* set to false by _register_fn; ompi_rte_wait_for_debugger hands it to OMPI_WAIT_FOR_COMPLETION */
100+ static bool debugger_event_active = true; /* set to false by _release_fn; ompi_rte_wait_for_debugger hands it to OMPI_WAIT_FOR_COMPLETION */
101+
102+ static void _release_fn (int status , /* event callback handed to opal_pmix.register_evhandler; status is not examined */
103+ const opal_process_name_t * source , /* not used by this callback */
104+ opal_list_t * info , opal_list_t * results , /* neither list is used by this callback */
105+ opal_pmix_notification_complete_fn_t cbfunc , /* completion callback; invoked only when non-NULL */
106+ void * cbdata ) /* passed through unchanged as the last argument of cbfunc */
107+ {
108+ /* must let the notifier know we are done */
109+ if (NULL != cbfunc ) {
110+ cbfunc (ORTE_SUCCESS , NULL , NULL , NULL , cbdata ); /* always reports ORTE_SUCCESS, with no result data */
111+ }
112+ debugger_event_active = false; /* cleared after the acknowledgement; this is the flag ompi_rte_wait_for_debugger waits on */
113+ }
114+
115+ static void _register_fn (int status , /* registration callback handed to opal_pmix.register_evhandler; NOTE(review): status is ignored, so a failed registration is not distinguished from success */
116+ size_t evhandler_ref , /* handler reference; saved in the file-scope 'handler' */
117+ void * cbdata ) /* treated as the opal_list_t* of event codes and released below */
118+ {
119+ opal_list_t * codes = (opal_list_t * )cbdata ;
120+
121+ handler = evhandler_ref ; /* kept so the caller can deregister the handler later */
122+ OPAL_LIST_RELEASE (codes ); /* NOTE(review): presumably releases the list together with its items -- confirm against the macro definition */
123+ debugger_register_active = false; /* cleared last; this is the flag ompi_rte_wait_for_debugger waits on for registration */
124+ }
125+
98126/*
99127 * Wait for a debugger if asked. We support two ways of waiting for
100128 * attaching debuggers -- see big comment in
@@ -103,7 +131,8 @@ void ompi_rte_abort(int error_code, char *fmt, ...)
103131void ompi_rte_wait_for_debugger (void )
104132{
105133 int debugger ;
106- orte_rml_recv_cb_t xfer ;
134+ opal_list_t * codes ;
135+ opal_value_t * kv ;
107136
108137 /* See lengthy comment in orte/tools/orterun/debuggers.c about
109138 orte_in_parallel_debugger */
@@ -133,23 +162,23 @@ void ompi_rte_wait_for_debugger(void)
133162#endif
134163 }
135164 } else {
136- /* only the rank=0 proc waits for either a message from the
137- * HNP or for the debugger to attach - everyone else will just
138- * spin in * the grpcomm barrier in ompi_mpi_init until rank=0
139- * joins them.
140- */
141- if (0 != ORTE_PROC_MY_NAME -> vpid ) {
142- return ;
143- }
144165
145- /* VPID 0 waits for a message from the HNP */
146- OBJ_CONSTRUCT (& xfer , orte_rml_recv_cb_t );
147- xfer .active = true;
148- orte_rml .recv_buffer_nb (OMPI_NAME_WILDCARD ,
149- ORTE_RML_TAG_DEBUGGER_RELEASE ,
150- ORTE_RML_NON_PERSISTENT ,
151- orte_rml_recv_callback , & xfer );
152- /* let the MPI progress engine run while we wait */
153- OMPI_WAIT_FOR_COMPLETION (xfer .active );
166+ /* register an event handler for the ORTE_ERR_DEBUGGER_RELEASE event */
167+ codes = OBJ_NEW (opal_list_t );
168+ kv = OBJ_NEW (opal_value_t );
169+ kv -> key = strdup ("errorcode" );
170+ kv -> type = OPAL_INT ;
171+ kv -> data .integer = ORTE_ERR_DEBUGGER_RELEASE ;
172+ opal_list_append (codes , & kv -> super );
173+
174+ opal_pmix .register_evhandler (codes , NULL , _release_fn , _register_fn , codes );
175+ /* let the MPI progress engine run while we wait for registration to complete */
176+ OMPI_WAIT_FOR_COMPLETION (debugger_register_active );
177+
178+ /* let the MPI progress engine run while we wait for debugger release */
179+ OMPI_WAIT_FOR_COMPLETION (debugger_event_active );
180+
181+ /* deregister the event handler */
182+ opal_pmix .deregister_evhandler (handler , NULL , NULL );
154183 }
155184}
0 commit comments