Skip to content

Commit c667719

Browse files
authored
Merge pull request #3955 from mohanasudhan/master
Btl tcp: Improved diagnostic output and failure mode
2 parents b67b1e8 + fc32ae4 commit c667719

File tree

7 files changed

+492
-101
lines changed

7 files changed

+492
-101
lines changed

opal/mca/btl/tcp/btl_tcp.c

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,12 +31,14 @@
3131
#include "opal/mca/mpool/base/base.h"
3232
#include "opal/mca/mpool/mpool.h"
3333
#include "opal/mca/btl/base/btl_base_error.h"
34+
#include "opal/opal_socket_errno.h"
3435

3536
#include "btl_tcp.h"
3637
#include "btl_tcp_frag.h"
3738
#include "btl_tcp_proc.h"
3839
#include "btl_tcp_endpoint.h"
3940

41+
4042
mca_btl_tcp_module_t mca_btl_tcp_module = {
4143
.super = {
4244
.btl_component = &mca_btl_tcp_component.super,
@@ -531,3 +533,68 @@ void mca_btl_tcp_dump(struct mca_btl_base_module_t* base_btl,
531533
}
532534
#endif /* OPAL_ENABLE_DEBUG && WANT_PEER_DUMP */
533535
}
536+
537+
538+
/*
539+
* A blocking recv for both blocking and non-blocking socket.
540+
* Used to receive the small amount of connection information
541+
* that identifies the endpoints
542+
*
543+
* when the socket is blocking (the caller introduces timeout)
544+
* which happens during initial handshake otherwise socket is
545+
* non-blocking most of the time.
546+
*/
547+
548+
int mca_btl_tcp_recv_blocking(int sd, void* data, size_t size)
549+
{
550+
unsigned char* ptr = (unsigned char*)data;
551+
size_t cnt = 0;
552+
while (cnt < size) {
553+
int retval = recv(sd, ((char *)ptr) + cnt, size - cnt, 0);
554+
/* remote closed connection */
555+
if (0 == retval) {
556+
BTL_ERROR(("remote peer unexpectedly closed connection while I was waiting for blocking message"));
557+
return -1;
558+
}
559+
560+
/* socket is non-blocking so handle errors */
561+
if (retval < 0) {
562+
if (opal_socket_errno != EINTR &&
563+
opal_socket_errno != EAGAIN &&
564+
opal_socket_errno != EWOULDBLOCK) {
565+
BTL_ERROR(("recv(%d) failed: %s (%d)", sd, strerror(opal_socket_errno), opal_socket_errno));
566+
return -1;
567+
}
568+
continue;
569+
}
570+
cnt += retval;
571+
}
572+
return cnt;
573+
}
574+
575+
576+
/*
577+
* A blocking send on a non-blocking socket. Used to send the small
578+
* amount of connection information that identifies the endpoints
579+
* endpoint.
580+
*/
581+
582+
int mca_btl_tcp_send_blocking(int sd, const void* data, size_t size)
583+
{
584+
unsigned char* ptr = (unsigned char*)data;
585+
size_t cnt = 0;
586+
while(cnt < size) {
587+
int retval = send(sd, ((const char *)ptr) + cnt, size - cnt, 0);
588+
if (retval < 0) {
589+
if (opal_socket_errno != EINTR &&
590+
opal_socket_errno != EAGAIN &&
591+
opal_socket_errno != EWOULDBLOCK) {
592+
BTL_ERROR(("send() failed: %s (%d)", strerror(opal_socket_errno), opal_socket_errno));
593+
return -1;
594+
}
595+
continue;
596+
}
597+
cnt += retval;
598+
}
599+
return cnt;
600+
}

opal/mca/btl/tcp/btl_tcp.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -351,5 +351,23 @@ mca_btl_tcp_dump(struct mca_btl_base_module_t* btl,
351351
*/
352352
int mca_btl_tcp_ft_event(int state);
353353

354+
/*
355+
* A blocking send on a non-blocking socket. Used to send the small
356+
* amount of connection information that identifies the
357+
* endpoint.
358+
*/
359+
/* Returns the number of bytes sent (equal to size) on success, or -1
 * if send() fails with an error other than EINTR/EAGAIN/EWOULDBLOCK. */
int mca_btl_tcp_send_blocking(int sd, const void* data, size_t size);
360+
361+
/*
362+
* A blocking recv for both blocking and non-blocking socket.
363+
* Used to receive the small amount of connection information
364+
* that identifies the endpoints
365+
*
366+
* The socket is blocking (with a timeout introduced by the caller)
367+
* during the initial handshake; otherwise it is non-blocking most
368+
* of the time.
369+
*/
370+
/* Returns the number of bytes received (equal to size) on success, or
 * -1 if the peer closes the connection or recv() fails with an error
 * other than EINTR/EAGAIN/EWOULDBLOCK. */
int mca_btl_tcp_recv_blocking(int sd, void* data, size_t size);
371+
354372
END_C_DECLS
355373
#endif

0 commit comments

Comments
 (0)