Skip to content

Commit ec4a235

Browse files
committed
Allow a TCP proc release during the create.
This is mostly for error cases, where we need to release the newly created proc. Currently the code deadlocks because the endpoint lock is held at the release and the lock is not recursive. Also added some code to print the IP addresses that don't match during the TCP connection step. Signed-off-by: George Bosilca <[email protected]>
1 parent 05b568a commit ec4a235

File tree

1 file changed

+37
-9
lines changed

1 file changed

+37
-9
lines changed

opal/mca/btl/tcp/btl_tcp_proc.c

Lines changed: 37 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -125,16 +125,18 @@ mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc)
125125
return btl_proc;
126126
}
127127

128-
do {
128+
do { /* This loop is only necessary so that we can break out of the serial code */
129129
btl_proc = OBJ_NEW(mca_btl_tcp_proc_t);
130130
if(NULL == btl_proc) {
131131
rc = OPAL_ERR_OUT_OF_RESOURCE;
132132
break;
133133
}
134134

135-
btl_proc->proc_opal = proc;
136-
137-
OBJ_RETAIN(btl_proc->proc_opal);
135+
/* Retain the proc, but don't store the ref into the btl_proc just yet. This
136+
* provides a way to release the btl_proc in case of failure without having to
137+
* unlock the mutex.
138+
*/
139+
OBJ_RETAIN(proc);
138140

139141
/* lookup tcp parameters exported by this proc */
140142
OPAL_MODEX_RECV(rc, &mca_btl_tcp_component.super.btl_version,
@@ -180,12 +182,14 @@ mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc)
180182
} while (0);
181183

182184
if (OPAL_SUCCESS == rc) {
185+
btl_proc->proc_opal = proc; /* link with the proc */
183186
/* add to hash table of all proc instance. */
184187
opal_proc_table_set_value(&mca_btl_tcp_component.tcp_procs,
185188
proc->proc_name, btl_proc);
186189
} else {
187190
if (btl_proc) {
188-
OBJ_RELEASE(btl_proc);
191+
OBJ_RELEASE(btl_proc); /* release the local proc */
192+
OBJ_RELEASE(proc); /* and the ref on the OMPI proc */
189193
btl_proc = NULL;
190194
}
191195
}
@@ -820,12 +824,36 @@ void mca_btl_tcp_proc_accept(mca_btl_tcp_proc_t* btl_proc, struct sockaddr* addr
820824
OPAL_THREAD_UNLOCK(&btl_proc->proc_lock);
821825
return;
822826
}
823-
OPAL_THREAD_UNLOCK(&btl_proc->proc_lock);
824-
opal_output(0, "btl: tcp: Incoming connection from %s does not match known addresses for peer %s. Drop !\n",
825-
opal_net_get_hostname((struct sockaddr*)addr),
826-
OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name));
827827
/* No further use of this socket. Close it */
828828
CLOSE_THE_SOCKET(sd);
829+
{
830+
size_t len = 1024;
831+
char* addr_str = (char*)malloc(len);
832+
if( NULL != addr_str ) {
833+
memset(addr_str, 0, len);
834+
for (size_t i = 0; i < btl_proc->proc_endpoint_count; i++) {
835+
mca_btl_base_endpoint_t* btl_endpoint = btl_proc->proc_endpoints[i];
836+
if (btl_endpoint->endpoint_addr->addr_family != addr->sa_family) {
837+
continue;
838+
}
839+
840+
if (addr_str[0] != '\0') {
841+
strncat(addr_str, ", ", len);
842+
len -= 2;
843+
}
844+
strncat(addr_str, inet_ntop(AF_INET6, (void*)(struct in6_addr*)&btl_endpoint->endpoint_addr->addr_inet,
845+
addr_str + 1024 - len, INET6_ADDRSTRLEN), len);
846+
len = 1024 - strlen(addr_str);
847+
}
848+
}
849+
opal_output(0, "btl: tcp: Incoming connection from %s does not match known addresses for peer %s [hostname=%s addr=%s]. Drop !\n",
850+
opal_net_get_hostname((struct sockaddr*)addr),
851+
OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name),
852+
btl_proc->proc_opal->proc_hostname,
853+
addr_str);
854+
free(addr_str);
855+
}
856+
OPAL_THREAD_UNLOCK(&btl_proc->proc_lock);
829857
}
830858

831859
/*

0 commit comments

Comments
 (0)