|
75 | 75 | #include "orte/mca/oob/tcp/oob_tcp_common.h"
|
76 | 76 | #include "orte/mca/oob/tcp/oob_tcp_connection.h"
|
77 | 77 |
|
| 78 | +#define OOB_SEND_MAX_RETRIES 3 |
| 79 | + |
78 | 80 | void mca_oob_tcp_queue_msg(int sd, short args, void *cbdata)
|
79 | 81 | {
|
80 | 82 | mca_oob_tcp_send_t *snd = (mca_oob_tcp_send_t*)cbdata;
|
@@ -105,7 +107,7 @@ void mca_oob_tcp_queue_msg(int sd, short args, void *cbdata)
|
105 | 107 | static int send_msg(mca_oob_tcp_peer_t* peer, mca_oob_tcp_send_t* msg)
|
106 | 108 | {
|
107 | 109 | struct iovec iov[2];
|
108 |
| - int iov_count; |
| 110 | + int iov_count, retries = 0; |
109 | 111 | ssize_t remain = msg->sdbytes, rc;
|
110 | 112 |
|
111 | 113 | OPAL_TIMING_EVENT((&tm_oob, "to %s %d bytes",
|
@@ -146,12 +148,20 @@ static int send_msg(mca_oob_tcp_peer_t* peer, mca_oob_tcp_send_t* msg)
|
146 | 148 | * but let the event lib cycle so other messages
|
147 | 149 | * can progress while this socket is busy
|
148 | 150 | */
|
| 151 | + ++retries; |
| 152 | + if (retries < OOB_SEND_MAX_RETRIES) { |
| 153 | + goto retry; |
| 154 | + } |
149 | 155 | return ORTE_ERR_RESOURCE_BUSY;
|
150 | 156 | } else if (opal_socket_errno == EWOULDBLOCK) {
|
151 | 157 | /* tell the caller to keep this message on active,
|
152 | 158 | * but let the event lib cycle so other messages
|
153 | 159 | * can progress while this socket is busy
|
154 | 160 | */
|
| 161 | + ++retries; |
| 162 | + if (retries < OOB_SEND_MAX_RETRIES) { |
| 163 | + goto retry; |
| 164 | + } |
155 | 165 | return ORTE_ERR_WOULD_BLOCK;
|
156 | 166 | } else {
|
157 | 167 | /* we hit an error and cannot progress this message */
|
|
0 commit comments