Skip to content

Commit d1ecc83

Browse files
authored
Merge pull request #2245 from jjhursey/topic/libnbc-error-path
coll/libnbc: Fix error path on internal error
2 parents 2a9f818 + 8748e54 commit d1ecc83

File tree

2 files changed

+39
-4
lines changed

2 files changed

+39
-4
lines changed

ompi/mca/coll/libnbc/coll_libnbc_component.c

Lines changed: 12 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,9 @@
1313
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
1414
* Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights
1515
* reserved.
16+
* Copyright (c) 2016 Research Organization for Information Science
17+
* and Technology (RIST). All rights reserved.
18+
* Copyright (c) 2016 IBM Corporation. All rights reserved.
1619
* $COPYRIGHT$
1720
*
1821
* Additional copyrights may follow
@@ -234,17 +237,24 @@ int
234237
ompi_coll_libnbc_progress(void)
235238
{
236239
ompi_coll_libnbc_request_t* request, *next;
240+
int res;
237241

238242
if (opal_atomic_trylock(&mca_coll_libnbc_component.progress_lock)) return 0;
239243

240244
OPAL_LIST_FOREACH_SAFE(request, next, &mca_coll_libnbc_component.active_requests,
241245
ompi_coll_libnbc_request_t) {
242-
if (OMPI_SUCCESS == NBC_Progress(request)) {
246+
res = NBC_Progress(request);
247+
if( NBC_CONTINUE != res ) {
243248
/* done, remove and complete */
244249
opal_list_remove_item(&mca_coll_libnbc_component.active_requests,
245250
&request->super.super.super);
246251

247-
request->super.req_status.MPI_ERROR = OMPI_SUCCESS;
252+
if( OMPI_SUCCESS == res || NBC_OK == res || NBC_SUCCESS == res ) {
253+
request->super.req_status.MPI_ERROR = OMPI_SUCCESS;
254+
}
255+
else {
256+
request->super.req_status.MPI_ERROR = res;
257+
}
248258
OPAL_THREAD_LOCK(&ompi_request_lock);
249259
ompi_request_complete(&request->super, true);
250260
OPAL_THREAD_UNLOCK(&ompi_request_lock);

ompi/mca/coll/libnbc/nbc.c

Lines changed: 27 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@
1616
* Author(s): Torsten Hoefler <[email protected]>
1717
*
1818
* Copyright (c) 2012 Oracle and/or its affiliates. All rights reserved.
19+
* Copyright (c) 2016 IBM Corporation. All rights reserved.
1920
*
2021
*/
2122
#include "nbc_internal.h"
@@ -312,6 +313,8 @@ int NBC_Progress(NBC_Handle *handle) {
312313
int flag, res, ret=NBC_CONTINUE;
313314
unsigned long size = 0;
314315
char *delim;
316+
int i;
317+
ompi_status_public_t status;
315318

316319
/* the handle is done if there is no schedule attached */
317320
if (NULL == handle->schedule) {
@@ -325,8 +328,30 @@ int NBC_Progress(NBC_Handle *handle) {
325328
#endif
326329
res = ompi_request_test_all(handle->req_count, handle->req_array, &flag, MPI_STATUSES_IGNORE);
327330
if(res != OMPI_SUCCESS) {
328-
NBC_Error ("MPI Error in MPI_Testall() (%i)", res);
329-
return res;
331+
// Attempt to cancel outstanding requests
332+
for(i = 0; i < handle->req_count; ++i ) {
333+
// If the request is complete, then try to report the error code
334+
if( handle->req_array[i]->req_complete ) {
335+
if( OMPI_SUCCESS != handle->req_array[i]->req_status.MPI_ERROR ) {
336+
NBC_Error ("MPI Error in MPI_Testall() (req %d = %d)", i, handle->req_array[i]->req_status.MPI_ERROR);
337+
}
338+
}
339+
else {
340+
ompi_request_cancel(handle->req_array[i]);
341+
// If the PML actually canceled the request, then wait on it
342+
if( handle->req_array[i]->req_status._cancelled) {
343+
ompi_request_wait(&handle->req_array[i], &status);
344+
}
345+
// Warn the user that we had to leave a PML message outstanding so
346+
// bad things could happen if they continue using nonblocking collectives
347+
else {
348+
NBC_Error ("MPI Error: Not able to cancel the internal request %d. "
349+
"Be aware that continuing to use nonblocking collectives on this communicator may result in undefined behavior.", i);
350+
}
351+
}
352+
}
353+
354+
return OMPI_ERROR;
330355
}
331356
#ifdef NBC_TIMING
332357
Test_time += MPI_Wtime();

0 commit comments

Comments
 (0)