Skip to content

Commit 03f4a85

Browse files
committed
btl/tcp: fix add_procs race condition
This commit fixes a race between a thread calling the tcp btl's add_procs and a thread processing an incomming connection. The race occured because the add_procs thread adds a newly created proc object to the hash table *before* the object is fully initialized. The connection thread then attempts to use the object before the endpoints array on the object has beeen allocation. The fix is to only add the proc to the hash table after it has been completely initialized. Signed-off-by: Nathan Hjelm <[email protected]>
1 parent 01c90d4 commit 03f4a85

File tree

1 file changed

+55
-40
lines changed

1 file changed

+55
-40
lines changed

opal/mca/btl/tcp/btl_tcp_proc.c

Lines changed: 55 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
1515
* Copyright (c) 2014-2015 Research Organization for Information Science
1616
* and Technology (RIST). All rights reserved.
17-
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
17+
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
1818
* reserved.
1919
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
2020
* $COPYRIGHT$
@@ -122,52 +122,53 @@ mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc)
122122
return btl_proc;
123123
}
124124

125-
btl_proc = OBJ_NEW(mca_btl_tcp_proc_t);
126-
if(NULL == btl_proc)
127-
return NULL;
128-
btl_proc->proc_opal = proc;
129-
OBJ_RETAIN(btl_proc->proc_opal);
125+
do {
126+
btl_proc = OBJ_NEW(mca_btl_tcp_proc_t);
127+
if(NULL == btl_proc) {
128+
rc = OPAL_ERR_OUT_OF_RESOURCE;
129+
break;
130+
}
130131

131-
/* add to hash table of all proc instance */
132-
opal_proc_table_set_value(&mca_btl_tcp_component.tcp_procs,
133-
proc->proc_name, btl_proc);
134-
OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock);
132+
btl_proc->proc_opal = proc;
135133

136-
/* lookup tcp parameters exported by this proc */
137-
OPAL_MODEX_RECV(rc, &mca_btl_tcp_component.super.btl_version,
138-
&proc->proc_name, (uint8_t**)&btl_proc->proc_addrs, &size);
139-
if(rc != OPAL_SUCCESS) {
140-
if(OPAL_ERR_NOT_FOUND != rc)
141-
BTL_ERROR(("opal_modex_recv: failed with return value=%d", rc));
142-
OBJ_RELEASE(btl_proc);
143-
return NULL;
144-
}
145-
if(0 != (size % sizeof(mca_btl_tcp_addr_t))) {
146-
BTL_ERROR(("opal_modex_recv: invalid size %lu: btl-size: %lu\n",
147-
(unsigned long) size, (unsigned long)sizeof(mca_btl_tcp_addr_t)));
148-
return NULL;
149-
}
150-
btl_proc->proc_addr_count = size / sizeof(mca_btl_tcp_addr_t);
151-
152-
/* allocate space for endpoint array - one for each exported address */
153-
btl_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
154-
malloc((1 + btl_proc->proc_addr_count) *
155-
sizeof(mca_btl_base_endpoint_t*));
156-
if(NULL == btl_proc->proc_endpoints) {
157-
OBJ_RELEASE(btl_proc);
158-
return NULL;
159-
}
134+
OBJ_RETAIN(btl_proc->proc_opal);
135+
136+
/* lookup tcp parameters exported by this proc */
137+
OPAL_MODEX_RECV(rc, &mca_btl_tcp_component.super.btl_version,
138+
&proc->proc_name, (uint8_t**)&btl_proc->proc_addrs, &size);
139+
if(rc != OPAL_SUCCESS) {
140+
if(OPAL_ERR_NOT_FOUND != rc)
141+
BTL_ERROR(("opal_modex_recv: failed with return value=%d", rc));
142+
break;
143+
}
144+
145+
if(0 != (size % sizeof(mca_btl_tcp_addr_t))) {
146+
BTL_ERROR(("opal_modex_recv: invalid size %lu: btl-size: %lu\n",
147+
(unsigned long) size, (unsigned long)sizeof(mca_btl_tcp_addr_t)));
148+
rc = OPAL_ERROR;
149+
break;
150+
}
151+
152+
btl_proc->proc_addr_count = size / sizeof(mca_btl_tcp_addr_t);
153+
154+
/* allocate space for endpoint array - one for each exported address */
155+
btl_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
156+
malloc((1 + btl_proc->proc_addr_count) *
157+
sizeof(mca_btl_base_endpoint_t*));
158+
if(NULL == btl_proc->proc_endpoints) {
159+
rc = OPAL_ERR_OUT_OF_RESOURCE;
160+
break;
161+
}
162+
163+
if(NULL == mca_btl_tcp_component.tcp_local && (proc == opal_proc_local_get())) {
164+
mca_btl_tcp_component.tcp_local = btl_proc;
165+
}
160166

161-
if(NULL == mca_btl_tcp_component.tcp_local && (proc == opal_proc_local_get())) {
162-
mca_btl_tcp_component.tcp_local = btl_proc;
163-
}
164-
{
165167
/* convert the OPAL addr_family field to OS constants,
166168
* so we can check for AF_INET (or AF_INET6) and don't have
167169
* to deal with byte ordering anymore.
168170
*/
169-
unsigned int i;
170-
for (i = 0; i < btl_proc->proc_addr_count; i++) {
171+
for (unsigned int i = 0; i < btl_proc->proc_addr_count; i++) {
171172
if (MCA_BTL_TCP_AF_INET == btl_proc->proc_addrs[i].addr_family) {
172173
btl_proc->proc_addrs[i].addr_family = AF_INET;
173174
}
@@ -177,7 +178,21 @@ mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc)
177178
}
178179
#endif
179180
}
181+
} while (0);
182+
183+
if (OPAL_SUCCESS == rc) {
184+
/* add to hash table of all proc instance. */
185+
opal_proc_table_set_value(&mca_btl_tcp_component.tcp_procs,
186+
proc->proc_name, btl_proc);
187+
} else {
188+
if (btl_proc) {
189+
OBJ_RELEASE(btl_proc);
190+
btl_proc = NULL;
191+
}
180192
}
193+
194+
OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock);
195+
181196
return btl_proc;
182197
}
183198

0 commit comments

Comments
 (0)