Skip to content

Commit fa3f128

Browse files
committed
ompi/request: fix bugs in MPI_Wait_some and MPI_Wait_any
This commit fixes two bugs in MPI_Wait_any: - If all requests are inactive then the sync wait would hang forever because no requests are attached to the sync. - The request pointer was pointing to the request before the completed request which caused the wrong request to be freed or marked inactive. MPI_Wait_some had a similar issue if all the requests were pending. These issues were identified by MTT. Signed-off-by: Nathan Hjelm <[email protected]> (cherry picked from open-mpi/ompi@0591139) Signed-off-by: Nathan Hjelm <[email protected]>
1 parent 6aa007a commit fa3f128

File tree

1 file changed

+94
-81
lines changed

1 file changed

+94
-81
lines changed

ompi/request/req_wait.c

Lines changed: 94 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
12
/*
23
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
34
* University Research and Technology
@@ -12,7 +13,7 @@
1213
* Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved.
1314
* Copyright (c) 2010-2012 Oracle and/or its affiliates. All rights reserved.
1415
* Copyright (c) 2012 Oak Ridge National Labs. All rights reserved.
15-
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
16+
* Copyright (c) 2016 Los Alamos National Security, LLC. All rights
1617
* reserved.
1718
* $COPYRIGHT$
1819
*
@@ -82,17 +83,17 @@ int ompi_request_default_wait_any(size_t count,
8283
int *index,
8384
ompi_status_public_t * status)
8485
{
85-
size_t i = 0, completed = count, num_requests_null_inactive = 0;
86+
size_t completed = count, num_requests_null_inactive = 0;
8687
int rc = OMPI_SUCCESS;
8788
ompi_request_t **rptr=NULL;
8889
ompi_request_t *request=NULL;
8990
ompi_wait_sync_t sync;
9091

9192
WAIT_SYNC_INIT(&sync, 1);
92-
93+
9394
rptr = requests;
9495
num_requests_null_inactive = 0;
95-
for (i = 0; i < count; i++, rptr++) {
96+
for (size_t i = 0; i < count; i++, rptr++) {
9697
request = *rptr;
9798

9899
/* Sanity test */
@@ -108,18 +109,29 @@ int ompi_request_default_wait_any(size_t count,
108109
num_requests_null_inactive++;
109110
continue;
110111
}
112+
111113
if(!OPAL_ATOMIC_CMPSET_PTR(&request->req_complete, REQUEST_PENDING, &sync)) {
112114
assert(REQUEST_COMPLETE(request));
113115
wait_sync_update( &sync, 1, request->req_status.MPI_ERROR);
114116
completed = i;
115117
break;
116118
}
117119
}
120+
121+
if(num_requests_null_inactive == count) {
122+
*index = MPI_UNDEFINED;
123+
if (MPI_STATUS_IGNORE != status) {
124+
*status = ompi_status_empty;
125+
}
126+
WAIT_SYNC_RELEASE(&sync);
127+
return rc;
128+
}
129+
118130
SYNC_WAIT(&sync);
119131

120132
/* recheck the complete status and clean up the sync primitives */
121133
rptr = requests;
122-
for(i = 0; i < completed; i++, rptr++) {
134+
for(size_t i = 0, pending_count = completed; i < pending_count ; i++, rptr++) {
123135
request = *rptr;
124136

125137
if( request->req_state == OMPI_REQUEST_INACTIVE ) {
@@ -131,39 +143,38 @@ int ompi_request_default_wait_any(size_t count,
131143
* marked as REQUEST_COMPLETE.
132144
*/
133145
OPAL_ATOMIC_CMPSET_PTR(&request->req_complete, &sync, REQUEST_PENDING);
146+
if (REQUEST_COMPLETE(request)) {
147+
completed = i;
148+
}
134149
}
135150

136-
if(num_requests_null_inactive == count) {
137-
*index = MPI_UNDEFINED;
138-
if (MPI_STATUS_IGNORE != status) {
139-
*status = ompi_status_empty;
140-
}
141-
} else {
142-
assert( REQUEST_COMPLETE(request) );
143-
/* Per note above, we have to call gen request query_fn even
144-
if STATUS_IGNORE was provided */
145-
if (OMPI_REQUEST_GEN == request->req_type) {
146-
rc = ompi_grequest_invoke_query(request, &request->req_status);
147-
}
148-
if (MPI_STATUS_IGNORE != status) {
149-
/* Do *NOT* set status->MPI_ERROR here! See MPI-1.1 doc,
150-
sec 3.2.5, p.22 */
151-
int old_error = status->MPI_ERROR;
152-
*status = request->req_status;
153-
status->MPI_ERROR = old_error;
154-
}
155-
rc = request->req_status.MPI_ERROR;
156-
if( request->req_persistent ) {
157-
request->req_state = OMPI_REQUEST_INACTIVE;
158-
} else if (MPI_SUCCESS == rc) {
159-
/* Only free the request if there is no error on it */
160-
/* If there's an error while freeing the request,
161-
assume that the request is still there. Otherwise,
162-
Bad Things will happen later! */
163-
rc = ompi_request_free(rptr);
164-
}
165-
*index = completed;
151+
rptr = requests + completed;
152+
request = *rptr;
153+
154+
assert( REQUEST_COMPLETE(request) );
155+
/* Per note above, we have to call gen request query_fn even
156+
if STATUS_IGNORE was provided */
157+
if (OMPI_REQUEST_GEN == request->req_type) {
158+
rc = ompi_grequest_invoke_query(request, &request->req_status);
159+
}
160+
if (MPI_STATUS_IGNORE != status) {
161+
/* Do *NOT* set status->MPI_ERROR here! See MPI-1.1 doc,
162+
sec 3.2.5, p.22 */
163+
int old_error = status->MPI_ERROR;
164+
*status = request->req_status;
165+
status->MPI_ERROR = old_error;
166+
}
167+
rc = request->req_status.MPI_ERROR;
168+
if( request->req_persistent ) {
169+
request->req_state = OMPI_REQUEST_INACTIVE;
170+
} else if (MPI_SUCCESS == rc) {
171+
/* Only free the request if there is no error on it */
172+
/* If there's an error while freeing the request,
173+
assume that the request is still there. Otherwise,
174+
Bad Things will happen later! */
175+
rc = ompi_request_free(rptr);
166176
}
177+
*index = completed;
167178

168179
WAIT_SYNC_RELEASE(&sync);
169180
return rc;
@@ -327,7 +338,7 @@ int ompi_request_default_wait_some(size_t count,
327338
int * indices,
328339
ompi_status_public_t * statuses)
329340
{
330-
size_t i, num_requests_null_inactive=0, num_requests_done=0;
341+
size_t num_requests_null_inactive=0, num_requests_done=0;
331342
int rc = MPI_SUCCESS;
332343
ompi_request_t **rptr = NULL;
333344
ompi_request_t *request = NULL;
@@ -344,7 +355,7 @@ int ompi_request_default_wait_some(size_t count,
344355
rptr = requests;
345356
num_requests_null_inactive = 0;
346357
num_requests_done = 0;
347-
for (i = 0; i < count; i++, rptr++) {
358+
for (size_t i = 0; i < count; i++, rptr++) {
348359
request = *rptr;
349360
/*
350361
* Check for null or completed persistent request.
@@ -360,68 +371,70 @@ int ompi_request_default_wait_some(size_t count,
360371
num_requests_done++;
361372
}
362373
}
374+
if(num_requests_null_inactive == count) {
375+
*outcount = MPI_UNDEFINED;
376+
WAIT_SYNC_RELEASE(&sync);
377+
return rc;
378+
}
379+
363380
if( 0 != num_requests_done ) {
364381
/* As we only expect one trigger update the sync with count 1 */
365382
wait_sync_update(&sync, 1, request->req_status.MPI_ERROR);
366383
}
367384
SYNC_WAIT(&sync);
368385

369-
if(num_requests_null_inactive == count) {
370-
*outcount = MPI_UNDEFINED;
371-
} else {
372-
373-
/* Do the final counting and */
374-
/* Clean up the synchronization primitives */
386+
/* Do the final counting and */
387+
/* Clean up the synchronization primitives */
375388

376-
rptr = requests;
377-
num_requests_done = 0;
378-
for (i = 0; i < count; i++, rptr++) {
379-
request = *rptr;
389+
rptr = requests;
390+
num_requests_done = 0;
391+
for (size_t i = 0; i < count; i++, rptr++) {
392+
request = *rptr;
380393

381-
if( request->req_state == OMPI_REQUEST_INACTIVE ) {
382-
continue;
383-
}
394+
if( request->req_state == OMPI_REQUEST_INACTIVE ) {
395+
continue;
396+
}
384397

385-
if( !OPAL_ATOMIC_CMPSET_PTR(&request->req_complete, &sync, REQUEST_PENDING) ) {
386-
assert(REQUEST_COMPLETE(request));
387-
indices[num_requests_done] = i;
388-
num_requests_done++;
389-
}
398+
if( !OPAL_ATOMIC_CMPSET_PTR(&request->req_complete, &sync, REQUEST_PENDING) ) {
399+
assert(REQUEST_COMPLETE(request));
400+
indices[num_requests_done] = i;
401+
num_requests_done++;
390402
}
403+
}
391404

392-
WAIT_SYNC_RELEASE(&sync);
405+
WAIT_SYNC_RELEASE(&sync);
393406

394-
*outcount = num_requests_done;
407+
*outcount = num_requests_done;
395408

396-
for (i = 0; i < num_requests_done; i++) {
397-
request = requests[indices[i]];
398-
assert( REQUEST_COMPLETE(request) );
399-
/* Per note above, we have to call gen request query_fn even
400-
if STATUS_IGNORE was provided */
401-
if (OMPI_REQUEST_GEN == request->req_type) {
402-
ompi_grequest_invoke_query(request, &request->req_status);
403-
}
404-
if (MPI_STATUSES_IGNORE != statuses) {
405-
statuses[i] = request->req_status;
406-
}
409+
for (size_t i = 0; i < num_requests_done; i++) {
410+
request = requests[indices[i]];
411+
assert( REQUEST_COMPLETE(request) );
412+
/* Per note above, we have to call gen request query_fn even
413+
if STATUS_IGNORE was provided */
414+
if (OMPI_REQUEST_GEN == request->req_type) {
415+
ompi_grequest_invoke_query(request, &request->req_status);
416+
}
417+
if (MPI_STATUSES_IGNORE != statuses) {
418+
statuses[i] = request->req_status;
419+
}
407420

408-
if (MPI_SUCCESS != request->req_status.MPI_ERROR) {
409-
rc = MPI_ERR_IN_STATUS;
410-
}
421+
if (MPI_SUCCESS != request->req_status.MPI_ERROR) {
422+
rc = MPI_ERR_IN_STATUS;
423+
}
411424

412-
if( request->req_persistent ) {
413-
request->req_state = OMPI_REQUEST_INACTIVE;
414-
} else {
415-
/* Only free the request if there was no error */
416-
if (MPI_SUCCESS == request->req_status.MPI_ERROR) {
417-
int tmp;
418-
tmp = ompi_request_free(&(requests[indices[i]]));
419-
if (OMPI_SUCCESS != tmp) {
420-
return tmp;
421-
}
425+
if( request->req_persistent ) {
426+
request->req_state = OMPI_REQUEST_INACTIVE;
427+
} else {
428+
/* Only free the request if there was no error */
429+
if (MPI_SUCCESS == request->req_status.MPI_ERROR) {
430+
int tmp;
431+
tmp = ompi_request_free(&(requests[indices[i]]));
432+
if (OMPI_SUCCESS != tmp) {
433+
return tmp;
422434
}
423435
}
424436
}
425437
}
438+
426439
return rc;
427440
}

0 commit comments

Comments
 (0)