From 59bb79b46fb805211738aa35a794d3aca0e2d9fb Mon Sep 17 00:00:00 2001
From: George Bosilca <bosilca@icl.utk.edu>
Date: Sat, 14 Apr 2018 15:55:48 -0400
Subject: [PATCH] Add/remove OB1 and CUDA progress. Provide support for
 dynamically adding and removing the progress function for OB1 and CUDA.

Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
---
 ompi/mca/pml/ob1/pml_ob1.h          |  2 +-
 ompi/mca/pml/ob1/pml_ob1_progress.c | 29 ++++++++++++++++++++---------
 ompi/mca/pml/ob1/pml_ob1_recvreq.c  |  1 +
 ompi/mca/pml/ob1/pml_ob1_sendreq.c  | 18 ++++++++++++++----
 ompi/mca/pml/ob1/pml_ob1_sendreq.h  |  3 +--
 5 files changed, 37 insertions(+), 16 deletions(-)

diff --git a/ompi/mca/pml/ob1/pml_ob1.h b/ompi/mca/pml/ob1/pml_ob1.h
index 1f4bfbb5899..4ec8dbf3905 100644
--- a/ompi/mca/pml/ob1/pml_ob1.h
+++ b/ompi/mca/pml/ob1/pml_ob1.h
@@ -302,7 +302,7 @@ void mca_pml_ob1_process_pending_rdma(void);
         if(opal_list_get_size(&mca_pml_ob1.recv_pending))       \
             mca_pml_ob1_recv_request_process_pending();         \
         if(opal_list_get_size(&mca_pml_ob1.send_pending))       \
-            mca_pml_ob1_send_request_process_pending(bml_btl);  \
+            (void)mca_pml_ob1_send_request_process_pending(bml_btl);  \
         if(opal_list_get_size(&mca_pml_ob1.rdma_pending))       \
             mca_pml_ob1_process_pending_rdma();                 \
     } while (0)
diff --git a/ompi/mca/pml/ob1/pml_ob1_progress.c b/ompi/mca/pml/ob1/pml_ob1_progress.c
index e1f84e796b4..4a31edac05a 100644
--- a/ompi/mca/pml/ob1/pml_ob1_progress.c
+++ b/ompi/mca/pml/ob1/pml_ob1_progress.c
@@ -49,6 +49,9 @@ static inline int mca_pml_ob1_process_pending_cuda_async_copies(void)
     } while (progress > 0);
     /* Consider progressing dtoh events here in future */
 
+    /* Update the number of potential pending events */
+    mca_pml_ob1_enable_progress(-count);
+
     return count;
 }
 #endif /* OPAL_CUDA_SUPPORT */
@@ -56,12 +59,22 @@ static inline int mca_pml_ob1_process_pending_cuda_async_copies(void)
 static int mca_pml_ob1_progress_needed = 0;
+/* Add count to mca_pml_ob1_progress_needed; (un)register mca_pml_ob1_progress when
+ * the counter leaves/reaches zero. Returns 1 on a (un)registration, 0 otherwise. */
 int mca_pml_ob1_enable_progress(int32_t count)
 {
+    if( 0 == count ) return 0;  /* nothing to do */
     int32_t progress_count = OPAL_ATOMIC_ADD_FETCH32(&mca_pml_ob1_progress_needed, count);
+    assert(progress_count >= 0);
     if( 1 < progress_count )
-        return 0;  /* progress was already on */
+        if( progress_count != count ) return 0;  /* counter was non-zero before: already on, no change */
 
-    opal_progress_register(mca_pml_ob1_progress);
-    return 1;
+    if( 0 == progress_count ) {  /* only way to get here is if count is negative */
+        opal_progress_unregister(mca_pml_ob1_progress);
+    } else if( count > 0 ) {  /* counter just left zero (also when count > 1): turn progress on */
+        opal_progress_register(mca_pml_ob1_progress);
+    } else {
+        return 0;  /* count was negative and the counter is still positive */
+    }
+    return 1;
 }
 
 int mca_pml_ob1_progress(void)
@@ -87,11 +100,11 @@ int mca_pml_ob1_progress(void)
         switch(pending_type) {
         case MCA_PML_OB1_SEND_PENDING_NONE:
             assert(0);
-            return 0;
+            goto update_pending_and_return;
         case MCA_PML_OB1_SEND_PENDING_SCHEDULE:
             if( mca_pml_ob1_send_request_schedule_exclusive(sendreq) ==
                 OMPI_ERR_OUT_OF_RESOURCE ) {
-                return 0;
+                goto update_pending_and_return;
             }
             completed_requests++;
             break;
@@ -118,11 +131,9 @@ int mca_pml_ob1_progress(void)
         }
     }
 
+  update_pending_and_return:
     if( 0 != completed_requests ) {
-        j = OPAL_ATOMIC_ADD_FETCH32(&mca_pml_ob1_progress_needed, -completed_requests);
-        if( 0 == j ) {
-            opal_progress_unregister(mca_pml_ob1_progress);
-        }
+        mca_pml_ob1_enable_progress(-completed_requests);
     }
 
     return completed_requests;
diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c
index 9ccb27e1af4..5038e7b623a 100644
--- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c
+++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c
@@ -831,6 +831,7 @@ void mca_pml_ob1_recv_request_progress_rndv( mca_pml_ob1_recv_request_t* recvreq
         (btl->btl_flags & MCA_BTL_FLAGS_CUDA_COPY_ASYNC_RECV)) {
         void *strm = mca_common_cuda_get_htod_stream();
         opal_cuda_set_copy_function_async(&recvreq->req_recv.req_base.req_convertor, strm);
+        mca_pml_ob1_enable_progress(1);
     }
 #endif
 
diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c
index a2aecae09ac..39af7c12755 100644
--- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c
+++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c
@@ -43,9 +43,10 @@
 OBJ_CLASS_INSTANCE(mca_pml_ob1_send_range_t, opal_free_list_item_t,
         NULL, NULL);
 
-void mca_pml_ob1_send_request_process_pending(mca_bml_base_btl_t *bml_btl)
+int mca_pml_ob1_send_request_process_pending(mca_bml_base_btl_t *bml_btl)
 {
     int rc, i, s = opal_list_get_size(&mca_pml_ob1.send_pending);
+    int completed_requests = 0;
 
     /* advance pending requests */
     for(i = 0; i < s; i++) {
@@ -61,8 +62,9 @@ void mca_pml_ob1_send_request_process_pending(mca_bml_base_btl_t *bml_btl)
         case MCA_PML_OB1_SEND_PENDING_SCHEDULE:
             rc = mca_pml_ob1_send_request_schedule_exclusive(sendreq);
             if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
-                return;
+                goto update_pending_and_return;
             }
+            completed_requests++;
             break;
         case MCA_PML_OB1_SEND_PENDING_START:
             send_dst = mca_bml_base_btl_array_find(
@@ -79,8 +81,9 @@ void mca_pml_ob1_send_request_process_pending(mca_bml_base_btl_t *bml_btl)
                      * list to minimize reordering and give up for now. */
                     add_request_to_send_pending(sendreq,
                             MCA_PML_OB1_SEND_PENDING_START, false);
-                    return;
+                    goto update_pending_and_return;
                 }
+                completed_requests++;
             }
             break;
         default:
@@ -89,6 +92,12 @@ void mca_pml_ob1_send_request_process_pending(mca_bml_base_btl_t *bml_btl)
             break;
         }
     }
+
+ update_pending_and_return:
+    if( 0 != completed_requests ) {
+        mca_pml_ob1_enable_progress(-completed_requests);
+    }
+    return completed_requests;
 }
 
 /*
@@ -938,7 +947,8 @@ mca_pml_ob1_send_request_schedule_once(mca_pml_ob1_send_request_t* sendreq)
         if( OPAL_UNLIKELY(num_fail == range->range_btl_cnt) ) {
             /*TODO : assert(sendreq->req_pending == MCA_PML_OB1_SEND_PENDING_NONE); */
             add_request_to_send_pending(sendreq,
-                    MCA_PML_OB1_SEND_PENDING_SCHEDULE, true);
+                                        MCA_PML_OB1_SEND_PENDING_SCHEDULE, true);
+            mca_pml_ob1_enable_progress(1);
             /* Note that request remains locked. send_request_process_pending()
              * function will call shedule_exclusive() directly without taking
              * the lock */
diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.h b/ompi/mca/pml/ob1/pml_ob1_sendreq.h
index be36c3f2ac4..014d1c9ced7 100644
--- a/ompi/mca/pml/ob1/pml_ob1_sendreq.h
+++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.h
@@ -99,7 +99,6 @@ add_request_to_send_pending(mca_pml_ob1_send_request_t* sendreq,
         opal_list_prepend(&mca_pml_ob1.send_pending, item);
 
     OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
-    mca_pml_ob1_enable_progress(1);
 }
 
 static inline mca_pml_ob1_send_request_t*
@@ -508,7 +507,7 @@ int mca_pml_ob1_send_request_put_frag(mca_pml_ob1_rdma_frag_t* frag);
  * available. bml_btl passed to the function doesn't represents sendreq
  * destination, it represents BTL on which resource was freed, so only this BTL
  * should be considered for sending packets */
-void mca_pml_ob1_send_request_process_pending(mca_bml_base_btl_t *bml_btl);
+int mca_pml_ob1_send_request_process_pending(mca_bml_base_btl_t *bml_btl);
 
 void mca_pml_ob1_send_request_copy_in_out(mca_pml_ob1_send_request_t *sendreq,
                 uint64_t send_offset, uint64_t send_length);