@@ -95,50 +95,6 @@ static int mca_pml_monitoring_component_open(void)
9595 return OMPI_SUCCESS ;
9696}
9797
98- static int mca_pml_monitoring_component_close (void )
99- {
100- if ( !mca_common_monitoring_enabled ) return OMPI_SUCCESS ;
101-
102- /**
103- * If this component is already active, then we are currently monitoring
104- * the execution and this call to close if the one from MPI_Finalize.
105- * Clean up and release the extra reference on ourselves.
106- */
107- if ( mca_pml_monitoring_active ) { /* Already active, turn off */
108- pml_selected_component .pmlm_version .mca_close_component ();
109- mca_base_component_repository_release ((mca_base_component_t * )& mca_pml_monitoring_component );
110- mca_pml_monitoring_active = 0 ;
111- return OMPI_SUCCESS ;
112- }
113-
114- /**
115- * We are supposed to monitor the execution. Save the winner PML component and
116- * module, and swap it with ourselves. Increase our refcount so that we are
117- * not dlclose.
118- */
119- if ( OPAL_SUCCESS != mca_base_component_repository_retain_component (mca_pml_monitoring_component .pmlm_version .mca_type_name ,
120- mca_pml_monitoring_component .pmlm_version .mca_component_name ) ) {
121- return OMPI_ERROR ;
122- }
123-
124- /* Save a copy of the selected PML */
125- pml_selected_component = mca_pml_base_selected_component ;
126- pml_selected_module = mca_pml ;
127- /* Install our interception layer */
128- mca_pml_base_selected_component = mca_pml_monitoring_component ;
129- mca_pml = mca_pml_monitoring_module ;
130- /* Restore some of the original values: progress, flags, tags and context id */
131- mca_pml .pml_progress = pml_selected_module .pml_progress ;
132- mca_pml .pml_max_contextid = pml_selected_module .pml_max_contextid ;
133- mca_pml .pml_max_tag = pml_selected_module .pml_max_tag ;
134- /* Add MCA_PML_BASE_FLAG_REQUIRE_WORLD flag to ensure the hashtable is properly initialized */
135- mca_pml .pml_flags = pml_selected_module .pml_flags | MCA_PML_BASE_FLAG_REQUIRE_WORLD ;
136-
137- mca_pml_monitoring_active = 1 ;
138-
139- return OMPI_SUCCESS ;
140- }
141-
14298static mca_pml_base_module_t *
14399mca_pml_monitoring_component_init (int * priority ,
144100 bool enable_progress_threads ,
@@ -154,19 +110,72 @@ mca_pml_monitoring_component_init(int* priority,
154110
155111static int mca_pml_monitoring_component_finish (void )
156112{
157- if ( mca_common_monitoring_enabled && mca_pml_monitoring_active ) {
158- /* Free internal data structure */
159- mca_common_monitoring_finalize ();
113+ if ( !mca_common_monitoring_enabled )
114+ return OMPI_SUCCESS ;
115+ if ( !mca_pml_monitoring_active ) {
116+ /* The monitoring component priority is always low to guarantee that the component
117+ * is never selected. Thus, the first time component_finish is called it is right
118+ * after the selection of the best PML was done, and the perfect moment to intercept
119+ * it. At this point we remove ourselves from ompi_pml_base_framework.framework_components
120+ * so that the component never gets closed and unloaded and its VARs are safe for
121+ * the rest of the execution.
122+ */
123+ mca_pml_base_component_t * component = NULL ;
124+ mca_base_component_list_item_t * cli = NULL ;
125+ OPAL_LIST_FOREACH (cli , & ompi_pml_base_framework .framework_components , mca_base_component_list_item_t ) {
126+ component = (mca_pml_base_component_t * ) cli -> cli_component ;
127+
128+ if ( component == & mca_pml_monitoring_component ) {
129+ opal_list_remove_item (& ompi_pml_base_framework .framework_components , (opal_list_item_t * )cli );
130+ OBJ_RELEASE (cli );
131+ break ;
132+ }
133+ }
134+ /**
135+ * We are supposed to monitor the execution. Save the winner PML component and
136+ * module, and swap it with ourselves. Having been removed from the framework's
137+ * component list above, we will not be closed and unloaded behind our back.
138+ */
139+ /* Save a copy of the selected PML */
140+ pml_selected_component = mca_pml_base_selected_component ;
141+ pml_selected_module = mca_pml ;
142+ /* Install our interception layer */
143+ mca_pml_base_selected_component = mca_pml_monitoring_component ;
144+ mca_pml = mca_pml_monitoring_module ;
145+
146+ /* Restore some of the original values: progress, flags, tags and context id */
147+ mca_pml .pml_progress = pml_selected_module .pml_progress ;
148+ mca_pml .pml_max_contextid = pml_selected_module .pml_max_contextid ;
149+ mca_pml .pml_max_tag = pml_selected_module .pml_max_tag ;
150+ /* Add MCA_PML_BASE_FLAG_REQUIRE_WORLD flag to ensure the hashtable is properly initialized */
151+ mca_pml .pml_flags = pml_selected_module .pml_flags | MCA_PML_BASE_FLAG_REQUIRE_WORLD ;
152+
153+ mca_pml_monitoring_active = 1 ;
154+ } else {
155+ /**
156+ * This is the second call to component_finalize, and the component is actively
157+ * intercepting the calls to the best PML. Time to stop and cleanly finalize ourselves.
158+ */
159+
160160 /* Restore the original PML */
161161 mca_pml_base_selected_component = pml_selected_component ;
162162 mca_pml = pml_selected_module ;
163163 /* Redirect the close call to the original PML */
164164 pml_selected_component .pmlm_finalize ();
165+
166+ /* Free internal data structure */
167+ mca_common_monitoring_finalize ();
168+
165169 /**
166- * We should never release the last ref on the current
167- * component or face forever punishement.
170+ * We are in the component code itself, so we need to prevent the dlloader from
171+ * removing the code. This will result in minimal memory leaks, but it is the only
172+ * way to remove most of the references to the component (including the *vars).
168173 */
169- /* mca_base_component_repository_release(&mca_common_monitoring_component.pmlm_version); */
174+ mca_base_component_repository_retain_component (mca_pml_monitoring_component .pmlm_version .mca_type_name ,
175+ mca_pml_monitoring_component .pmlm_version .mca_component_name );
176+ /* Release all memory and be gone. */
177+ mca_base_component_close ((mca_base_component_t * )& mca_pml_monitoring_component ,
178+ ompi_pml_base_framework .framework_output );
170179 }
171180 return OMPI_SUCCESS ;
172181}
@@ -188,7 +197,7 @@ mca_pml_base_component_2_0_0_t mca_pml_monitoring_component = {
188197 .mca_component_name = "monitoring" , /* MCA component name */
189198 MCA_MONITORING_MAKE_VERSION ,
190199 .mca_open_component = mca_pml_monitoring_component_open , /* component open */
191- .mca_close_component = mca_pml_monitoring_component_close , /* component close */
200+ .mca_close_component = NULL , /* component close */
192201 .mca_register_component_params = mca_pml_monitoring_component_register
193202 },
194203 .pmlm_data = {
0 commit comments