5252
5353extern ompi_rte_orte_component_t mca_rte_orte_component ;
5454
55- typedef struct {
56- volatile bool active ;
57- int status ;
58- int errhandler ;
59- } errhandler_t ;
60-
61- static void register_cbfunc (int status , int errhndler , void * cbdata )
62- {
63- errhandler_t * cd = (errhandler_t * )cbdata ;
64- cd -> status = status ;
65- cd -> errhandler = errhndler ;
66- cd -> active = false;
67- }
68-
69- static volatile bool wait_for_release = true;
70- static int errhandler = -1 ;
71-
72- static void notify_cbfunc (int status ,
73- opal_list_t * procs ,
74- opal_list_t * info ,
75- opal_pmix_release_cbfunc_t cbfunc ,
76- void * cbdata )
77- {
78- if (NULL != cbfunc ) {
79- cbfunc (cbdata );
80- }
81- wait_for_release = false;
82- }
83-
84-
85- int ompi_rte_init (int * pargc , char * * * pargv )
86- {
87- int rc ;
88- opal_list_t info ;
89- opal_value_t val ;
90- errhandler_t cd ;
91-
92- if (ORTE_SUCCESS != (rc = orte_init (pargc , pargv , ORTE_PROC_MPI ))) {
93- return rc ;
94- }
95-
96- if (!orte_standalone_operation ) {
97- /* register to receive any debugger release */
98- OBJ_CONSTRUCT (& info , opal_list_t );
99- OBJ_CONSTRUCT (& val , opal_value_t );
100- val .key = strdup (OPAL_PMIX_ERROR_NAME );
101- val .type = OPAL_INT ;
102- val .data .integer = OPAL_ERR_DEBUGGER_RELEASE ;
103- opal_list_append (& info , & val .super );
104- cd .status = ORTE_ERROR ;
105- cd .errhandler = -1 ;
106- cd .active = true;
107-
108- opal_pmix .register_errhandler (& info , notify_cbfunc , register_cbfunc , & cd );
109-
110- /* let the MPI progress engine run while we wait for
111- * registration to complete */
112- OMPI_WAIT_FOR_COMPLETION (cd .active );
113- /* safely deconstruct the list */
114- opal_list_remove_first (& info );
115- OBJ_DESTRUCT (& val );
116- OBJ_DESTRUCT (& info );
117- if (OPAL_SUCCESS != cd .status ) {
118- /* ouch - we are doomed */
119- ORTE_ERROR_LOG (cd .status );
120- return OMPI_ERROR ;
121- }
122- errhandler = cd .errhandler ;
123- }
124-
125- return OMPI_SUCCESS ;
126- }
127-
12855void ompi_rte_abort (int error_code , char * fmt , ...)
12956{
13057 va_list arglist ;
@@ -173,10 +100,10 @@ void ompi_rte_abort(int error_code, char *fmt, ...)
173100 * attaching debuggers -- see big comment in
174101 * orte/tools/orterun/debuggers.c explaining the two scenarios.
175102 */
176-
177103void ompi_rte_wait_for_debugger (void )
178104{
179105 int debugger ;
106+ orte_rml_recv_cb_t xfer ;
180107
181108 /* See lengthy comment in orte/tools/orterun/debuggers.c about
182109 orte_in_parallel_debugger */
@@ -186,16 +113,16 @@ void ompi_rte_wait_for_debugger(void)
186113 debugger = 1 ;
187114 }
188115
189- if (!debugger ) {
116+ if (!debugger && NULL == getenv ( "ORTE_TEST_DEBUGGER_ATTACH" ) ) {
190117 /* if not, just return */
191118 return ;
192119 }
120+
193121 /* if we are being debugged, then we need to find
194122 * the correct plug-ins
195123 */
196124 ompi_debugger_setup_dlls ();
197125
198- /* wait for the debugger to attach */
199126 if (orte_standalone_operation ) {
200127 /* spin until debugger attaches and releases us */
201128 while (MPIR_debug_gate == 0 ) {
@@ -206,9 +133,23 @@ void ompi_rte_wait_for_debugger(void)
206133#endif
207134 }
208135 } else {
209- /* now wait for the notification to occur */
210- OMPI_WAIT_FOR_COMPLETION (wait_for_release );
211- /* deregister the errhandler */
212- opal_pmix .deregister_errhandler (errhandler , NULL , NULL );
136+ /* only the rank=0 proc waits for either a message from the
137+ * HNP or for the debugger to attach - everyone else will just
138+ * spin in the grpcomm barrier in ompi_mpi_init until rank=0
139+ * joins them.
140+ */
141+ if (0 != ORTE_PROC_MY_NAME -> vpid ) {
142+ return ;
143+ }
144+
145+ /* VPID 0 waits for a message from the HNP */
146+ OBJ_CONSTRUCT (& xfer , orte_rml_recv_cb_t );
147+ xfer .active = true;
148+ orte_rml .recv_buffer_nb (OMPI_NAME_WILDCARD ,
149+ ORTE_RML_TAG_DEBUGGER_RELEASE ,
150+ ORTE_RML_NON_PERSISTENT ,
151+ orte_rml_recv_callback , & xfer );
152+ /* let the MPI progress engine run while we wait */
153+ OMPI_WAIT_FOR_COMPLETION (xfer .active );
213154 }
214155}
0 commit comments