  * Copyright (c) 2018-2019 Triad National Security, LLC. All rights
  *                         reserved.
  * Copyright (c) 2022      IBM Corporation. All rights reserved.
+ * Copyright (c) 2024      Google, LLC. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -1110,6 +1111,12 @@ mca_pml_ob1_send_request_schedule_once(mca_pml_ob1_send_request_t* sendreq) |
 
     range = get_send_range(sendreq);
 
+    if (NULL != sendreq->rdma_frag) {
+        /* this request was first attempted with RDMA but is now using send/recv */
+        MCA_PML_OB1_RDMA_FRAG_RETURN(sendreq->rdma_frag);
+        sendreq->rdma_frag = NULL;
+    }
+
     while(range && (false == sendreq->req_throttle_sends ||
           sendreq->req_pipeline_depth < mca_pml_ob1.send_pipeline_depth)) {
         mca_pml_ob1_frag_hdr_t* hdr;
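
Aside: the hunk above plugs a leak on the RDMA-to-send/recv fallback path. If the request first attempted an RDMA protocol, sendreq->rdma_frag may still hold a fragment that would otherwise never be returned to the free list once the request is rescheduled as pipelined send/recv. Below is a minimal standalone sketch of the same guard; frag_return and schedule_once are invented stand-ins for the real MCA_PML_OB1_RDMA_FRAG_RETURN macro and scheduling loop, not ob1's actual API.

    /* Hedged sketch only: invented types/names; mirrors the guard above. */
    #include <stddef.h>
    #include <stdio.h>

    struct frag    { int pooled; };
    struct sendreq { struct frag *rdma_frag; };

    /* stands in for the free-list return macro */
    static void frag_return(struct frag *f) { f->pooled = 1; }

    static void schedule_once(struct sendreq *req)
    {
        /* A leftover fragment from an abandoned RDMA attempt must be
         * returned before scheduling send/recv, or it leaks. */
        if (NULL != req->rdma_frag) {
            frag_return(req->rdma_frag);
            req->rdma_frag = NULL;
        }
        /* ... schedule pipelined send/recv fragments here ... */
    }

    int main(void)
    {
        struct frag f = { .pooled = 0 };
        struct sendreq req = { .rdma_frag = &f };
        schedule_once(&req);
        printf("frag returned: %d, rdma_frag: %p\n", f.pooled, (void *)req.rdma_frag);
        return 0;
    }
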
@@ -1268,30 +1275,31 @@ static void mca_pml_ob1_send_request_put_frag_failed (mca_pml_ob1_rdma_frag_t *f |
     mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
     mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
 
-    if (++frag->retries < mca_pml_ob1.rdma_retries_limit && OMPI_ERR_OUT_OF_RESOURCE == rc) {
+    if (frag->retries < mca_pml_ob1.rdma_retries_limit && OMPI_ERR_OUT_OF_RESOURCE == rc) {
         /* queue the frag for later if there was a resource error */
         OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
         opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
         OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
-    } else {
+        return;
+    }
+
 #if OPAL_ENABLE_FT
-        if(!ompi_proc_is_active(sendreq->req_send.req_base.req_proc)) {
-            return;
-        }
-#endif /* OPAL_ENABLE_FT */
-        /* tell receiver to deregister memory */
-        mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl,
-                              frag->rdma_hdr.hdr_rdma.hdr_frag, 0, MCA_BTL_NO_ORDER,
-                              OPAL_ERR_TEMP_OUT_OF_RESOURCE);
-
-        /* send fragment by copy in/out */
-        mca_pml_ob1_send_request_copy_in_out(sendreq, frag->rdma_hdr.hdr_rdma.hdr_rdma_offset,
-                                             frag->rdma_length);
-        /* if a pointer to a receive request is not set it means that
-         * ACK was not yet received. Don't schedule sends before ACK */
-        if (NULL != sendreq->req_recv.pval)
-            mca_pml_ob1_send_request_schedule (sendreq);
+    if(!ompi_proc_is_active(sendreq->req_send.req_base.req_proc)) {
+        return;
     }
+#endif /* OPAL_ENABLE_FT */
+    /* tell receiver to deregister memory */
+    mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl,
+                          frag->rdma_hdr.hdr_rdma.hdr_frag, 0, MCA_BTL_NO_ORDER,
+                          OPAL_ERR_TEMP_OUT_OF_RESOURCE);
+
+    /* send fragment by copy in/out */
+    mca_pml_ob1_send_request_copy_in_out(sendreq, frag->rdma_hdr.hdr_rdma.hdr_rdma_offset,
+                                         frag->rdma_length);
+    /* if a pointer to a receive request is not set it means that
+     * ACK was not yet received. Don't schedule sends before ACK */
+    if (NULL != sendreq->req_recv.pval)
+        mca_pml_ob1_send_request_schedule (sendreq);
 }
 
 /**
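
Aside: the second hunk flattens the else branch into guard clauses: the transient OMPI_ERR_OUT_OF_RESOURCE case requeues the fragment and returns early, so the copy-in/out fallback is no longer nested. It also drops the ++ from frag->retries in the test; presumably the counter is incremented where the retry is actually performed, but that is not visible in this hunk. Below is a compilable sketch of the resulting control-flow shape, with all names invented for illustration.

    /* Hedged sketch only: illustrates the guard-clause shape, not ob1's API. */
    #include <stdio.h>

    #define RETRIES_LIMIT       4
    #define ERR_OUT_OF_RESOURCE (-2)

    struct frag { int retries; };

    static void requeue(struct frag *f)   { (void)f; puts("requeued for later"); }
    static void fall_back(struct frag *f) { (void)f; puts("copy-in/out fallback"); }

    static void put_frag_failed(struct frag *f, int rc)
    {
        /* Guard clause: transient resource exhaustion under the retry
         * limit is requeued; any other failure falls through. */
        if (f->retries < RETRIES_LIMIT && ERR_OUT_OF_RESOURCE == rc) {
            requeue(f);
            return;
        }
        fall_back(f);
    }

    int main(void)
    {
        struct frag f = { .retries = 0 };
        put_frag_failed(&f, ERR_OUT_OF_RESOURCE); /* prints "requeued for later" */
        f.retries = RETRIES_LIMIT;
        put_frag_failed(&f, ERR_OUT_OF_RESOURCE); /* prints "copy-in/out fallback" */
        return 0;
    }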