Skip to content

Commit 3095fab

Browse files
committed
Fix mca_btl_ofi_finalize clean-up logic
This fix is from John L. Byrne ([email protected]). When OFI Libfabric binds objects to endpoints, before the object can be successfully closed, the endpoint must first be freed. For scalable endpoints, objects can also be bound to transmit and receive contexts, and for objects that are bound to contexts, we need to first free the contexts before freeing the endpoint. We also need to clear the memory registration cache. If we don't clean up properly, then fi\_close may not be able to close the domain because the dom will have a non-zero ref count. Signed-off-by: harumi kuno <[email protected]>
1 parent 77bf3f0 commit 3095fab

File tree

2 files changed

+24
-13
lines changed

2 files changed

+24
-13
lines changed

opal/mca/btl/ofi/btl_ofi_component.c

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -581,21 +581,25 @@ static int mca_btl_ofi_init_device(struct fi_info *info)
581581
fail:
582582
/* clean up */
583583

584+
if (NULL != ep && !module->is_scalable_ep) {
585+
fi_close(&ep->fid);
586+
}
587+
584588
/* if the contexts have not been initiated, num_contexts should
585589
* be zero and we skip this. */
586590
for (int i=0; i < module->num_contexts; i++) {
587591
mca_btl_ofi_context_finalize(&module->contexts[i], module->is_scalable_ep);
588592
}
589593
free(module->contexts);
590594

591-
if (NULL != av) {
592-
fi_close(&av->fid);
593-
}
594-
595595
if (NULL != ep) {
596596
fi_close(&ep->fid);
597597
}
598598

599+
if (NULL != av) {
600+
fi_close(&av->fid);
601+
}
602+
599603
if (NULL != domain) {
600604
fi_close(&domain->fid);
601605
}

opal/mca/btl/ofi/btl_ofi_module.c

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -277,20 +277,32 @@ int mca_btl_ofi_finalize (mca_btl_base_module_t* btl)
277277

278278
assert(btl);
279279

280+
/* clear the rcache */
281+
if (ofi_btl->rcache) {
282+
mca_rcache_base_module_destroy (ofi_btl->rcache);
283+
ofi_btl->rcache = NULL;
284+
}
285+
286+
/* For a standard ep, we need to close the ep first. */
287+
if (NULL != ofi_btl->ofi_endpoint && !ofi_btl->is_scalable_ep) {
288+
fi_close(&ofi_btl->ofi_endpoint->fid);
289+
ofi_btl->ofi_endpoint = NULL;
290+
}
291+
280292
/* loop over all the contexts */
281293
for (i=0; i < ofi_btl->num_contexts; i++) {
282294
mca_btl_ofi_context_finalize(&ofi_btl->contexts[i], ofi_btl->is_scalable_ep);
283295
}
284296
free(ofi_btl->contexts);
285297

286-
if (NULL != ofi_btl->av) {
287-
fi_close(&ofi_btl->av->fid);
288-
}
289-
290298
if (NULL != ofi_btl->ofi_endpoint) {
291299
fi_close(&ofi_btl->ofi_endpoint->fid);
292300
}
293301

302+
if (NULL != ofi_btl->av) {
303+
fi_close(&ofi_btl->av->fid);
304+
}
305+
294306
if (NULL != ofi_btl->domain) {
295307
fi_close(&ofi_btl->domain->fid);
296308
}
@@ -313,11 +325,6 @@ int mca_btl_ofi_finalize (mca_btl_base_module_t* btl)
313325
OBJ_DESTRUCT(&ofi_btl->id_to_endpoint);
314326
OBJ_DESTRUCT(&ofi_btl->module_lock);
315327

316-
if (ofi_btl->rcache) {
317-
mca_rcache_base_module_destroy (ofi_btl->rcache);
318-
ofi_btl->rcache = NULL;
319-
}
320-
321328
free (btl);
322329

323330
return OPAL_SUCCESS;

0 commit comments

Comments
 (0)