Skip to content

Commit 7e80679

Browse files
committed
rcache/udreg: bug fixes
This commit fixes bugs that caused hangs or crashes when running out of registration resources.

Signed-off-by: Nathan Hjelm <[email protected]>
1 parent d787492 commit 7e80679

File tree

1 file changed

+50
-35
lines changed

1 file changed

+50
-35
lines changed

opal/mca/rcache/udreg/rcache_udreg_module.c

Lines changed: 50 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ static void *mca_rcache_udreg_reg_func (void *addr, uint64_t size, void *reg_con
144144
udreg_reg->rcache = reg_context;
145145
udreg_reg->base = OPAL_DOWN_ALIGN_PTR(addr, page_size, unsigned char *);
146146
udreg_reg->bound = OPAL_ALIGN_PTR((intptr_t) addr + size, page_size, unsigned char *) - 1;
147+
udreg_reg->ref_count = 0;
147148

148149
addr = (void *) udreg_reg->base;
149150
size = (uint64_t) (udreg_reg->bound - udreg_reg->base + 1);
@@ -161,7 +162,8 @@ static void *mca_rcache_udreg_reg_func (void *addr, uint64_t size, void *reg_con
161162
opal_output_verbose (MCA_BASE_VERBOSE_WARN, opal_rcache_base_framework.framework_output,
162163
"rcache/udreg: could not register memory. rc: %d", rc);
163164
opal_free_list_return (&rcache_udreg->reg_list, item);
164-
udreg_reg = NULL;
165+
/* NTH: this is the only way to get UDReg_Register to recognize a failure */
166+
udreg_reg = UDREG_DEVICE_REG_FAILED;
165167
}
166168

167169
return udreg_reg;
@@ -173,13 +175,9 @@ static uint32_t mca_rcache_udreg_dereg_func (void *device_data, void *dreg_conte
173175
mca_rcache_base_registration_t *udreg_reg = (mca_rcache_base_registration_t *) device_data;
174176
int rc;
175177

176-
if (udreg_reg->ref_count) {
177-
/* there are still users of this registration. leave it alone */
178-
return 0;
179-
}
178+
assert (udreg_reg->ref_count == 0);
180179

181180
rc = rcache_udreg->resources.base.deregister_mem (rcache_udreg->resources.base.reg_data, udreg_reg);
182-
183181
if (OPAL_LIKELY(OPAL_SUCCESS == rc)) {
184182
opal_free_list_return (&rcache_udreg->reg_list,
185183
(opal_free_list_item_t *) udreg_reg);
@@ -208,8 +206,9 @@ static int mca_rcache_udreg_register(mca_rcache_base_module_t *rcache, void *add
208206
mca_rcache_udreg_module_t *rcache_udreg = (mca_rcache_udreg_module_t *) rcache;
209207
mca_rcache_base_registration_t *udreg_reg, *old_reg;
210208
bool bypass_cache = !!(flags & MCA_RCACHE_FLAGS_CACHE_BYPASS);
211-
udreg_entry_t *udreg_entry;
212-
udreg_return_t urc;
209+
const unsigned int page_size = opal_getpagesize ();
210+
unsigned char *base, *bound;
211+
udreg_entry_t *udreg_entry = NULL;
213212

214213
*reg = NULL;
215214

@@ -219,60 +218,76 @@ static int mca_rcache_udreg_register(mca_rcache_base_module_t *rcache, void *add
219218
rcache_udreg->requested_access_flags = access_flags;
220219
rcache_udreg->requested_flags = flags;
221220

221+
base = OPAL_DOWN_ALIGN_PTR(addr, page_size, unsigned char *);
222+
bound = OPAL_ALIGN_PTR((intptr_t) addr + size, page_size, unsigned char *) - 1;
223+
224+
addr = base;
225+
size = (size_t) (uintptr_t) (bound - base) + 1;
226+
222227
if (false == bypass_cache) {
223228
/* Get a udreg entry for this region */
224229
do {
225230
opal_output_verbose (MCA_BASE_VERBOSE_INFO, opal_rcache_base_framework.framework_output,
226-
"rcache/udreg: registering region {%p, %p} with udreg", addr, (void *)((intptr_t) addr + size));
227-
while (UDREG_RC_SUCCESS !=
228-
(urc = UDREG_Register (rcache_udreg->udreg_handle, addr, size, &udreg_entry))) {
231+
"rcache/udreg: XXX registering region {%p, %p} with udreg", addr, (void *)((intptr_t) addr + size));
232+
while (UDREG_RC_SUCCESS != UDREG_Register (rcache_udreg->udreg_handle, addr, size, &udreg_entry)) {
229233
/* try to remove one unused reg and retry */
234+
opal_output_verbose (MCA_BASE_VERBOSE_INFO, opal_rcache_base_framework.framework_output,
235+
"calling evict!");
230236
if (!mca_rcache_udreg_evict (rcache)) {
231237
opal_output_verbose (MCA_BASE_VERBOSE_INFO, opal_rcache_base_framework.framework_output,
232-
"rcache/udreg: could not register memory with udreg. udreg rc: %d", urc);
238+
"rcache/udreg: could not register memory with udreg");
233239
OPAL_THREAD_UNLOCK(&rcache_udreg->lock);
234240
return OPAL_ERR_OUT_OF_RESOURCE;
235241
}
236242
}
237243

238244
udreg_reg = (mca_rcache_base_registration_t *) udreg_entry->device_data;
239-
240-
if ((udreg_reg->access_flags & access_flags) == access_flags) {
245+
if (NULL != udreg_reg && (udreg_reg->access_flags & access_flags) == access_flags) {
241246
/* sufficient access */
242247
break;
243248
}
244249

245250
old_reg = udreg_reg;
246251

247-
/* to not confuse udreg make sure the new registration covers the same address
248-
* range as the old one. */
249-
addr = old_reg->base;
250-
size = (size_t)((intptr_t) old_reg->bound - (intptr_t) old_reg->base);
252+
if (old_reg) {
253+
/* to not confuse udreg make sure the new registration covers the same address
254+
* range as the old one. */
255+
addr = old_reg->base;
256+
size = (size_t)((intptr_t) old_reg->bound - (intptr_t) old_reg->base);
257+
258+
/* make the new access flags more permissive */
259+
access_flags |= old_reg->access_flags;
260+
261+
if (!old_reg->ref_count) {
262+
/* deregister the region before attempting to re-register */
263+
mca_rcache_udreg_dereg_func (old_reg, rcache);
264+
udreg_entry->device_data = NULL;
265+
old_reg = NULL;
266+
} else {
267+
/* ensure that mca_rcache_udreg_deregister does not call into udreg since
268+
* we are forcefully evicting the registration here */
269+
old_reg->flags |= MCA_RCACHE_FLAGS_CACHE_BYPASS | MCA_RCACHE_FLAGS_INVALID;
270+
}
271+
}
251272

252-
/* make the new access flags more permissive */
253-
rcache_udreg->requested_access_flags = access_flags | old_reg->access_flags;
273+
rcache_udreg->requested_access_flags = access_flags;
254274

255275
/* get a new registration */
256-
udreg_reg = mca_rcache_udreg_reg_func (addr, size, rcache);
257-
if (NULL == udreg_reg) {
258-
OPAL_THREAD_UNLOCK(&rcache_udreg->lock);
259-
return OPAL_ERR_OUT_OF_RESOURCE;
276+
while (UDREG_DEVICE_REG_FAILED == (udreg_reg = mca_rcache_udreg_reg_func (addr, size, rcache))) {
277+
if (!mca_rcache_udreg_evict (rcache)) {
278+
opal_output_verbose (MCA_BASE_VERBOSE_INFO, opal_rcache_base_framework.framework_output,
279+
"rcache/udreg: could not register memory with udreg");
280+
OPAL_THREAD_UNLOCK(&rcache_udreg->lock);
281+
return OPAL_ERR_OUT_OF_RESOURCE;
282+
}
260283
}
261284

262285
/* update the device data with the new registration */
263286
udreg_entry->device_data = udreg_reg;
264-
265-
/* ensure that mca_rcache_udreg_deregister does not call into udreg since
266-
* we are forcefully evicting the registration here */
267-
old_reg->flags |= MCA_RCACHE_FLAGS_CACHE_BYPASS | MCA_RCACHE_FLAGS_INVALID;
268-
269-
mca_rcache_udreg_dereg_func (old_reg, rcache);
270287
} while (0);
271-
272-
udreg_reg->rcache_context = udreg_entry;
273288
} else {
274289
/* if cache bypass is requested don't use the udreg cache */
275-
while (NULL == (udreg_reg = mca_rcache_udreg_reg_func (addr, size, rcache))) {
290+
while (UDREG_DEVICE_REG_FAILED == (udreg_reg = mca_rcache_udreg_reg_func (addr, size, rcache))) {
276291
/* try to remove one unused reg and retry */
277292
if (!mca_rcache_udreg_evict (rcache)) {
278293
opal_output_verbose (MCA_BASE_VERBOSE_INFO, opal_rcache_base_framework.framework_output,
@@ -281,13 +296,13 @@ static int mca_rcache_udreg_register(mca_rcache_base_module_t *rcache, void *add
281296
return OPAL_ERR_OUT_OF_RESOURCE;
282297
}
283298
}
284-
udreg_reg->rcache_context = NULL;
285299
}
286300

287301
OPAL_THREAD_UNLOCK(&rcache_udreg->lock);
288302

289303
*reg = udreg_reg;
290304
++udreg_reg->ref_count;
305+
udreg_reg->rcache_context = udreg_entry;
291306

292307
return OPAL_SUCCESS;
293308
}
@@ -312,7 +327,7 @@ static int mca_rcache_udreg_deregister(mca_rcache_base_module_t *rcache,
312327
OPAL_THREAD_LOCK(&rcache_udreg->lock);
313328
UDREG_DecrRefcount (rcache_udreg->udreg_handle, reg->rcache_context);
314329
OPAL_THREAD_UNLOCK(&rcache_udreg->lock);
315-
} else {
330+
} else if (!reg->ref_count) {
316331
mca_rcache_udreg_dereg_func (reg, rcache);
317332
}
318333

0 commit comments

Comments
 (0)