Skip to content
This repository was archived by the owner on Sep 30, 2022. It is now read-only.

Commit 3e4ff53

Browse files
committed
mpool/rgpusm: update for rcache threading fixes
This commit brings the rgpusm mpool in line with the changes made to the rcache to make it thread-safe. There is no master equivalent of this commit. Signed-off-by: Nathan Hjelm <[email protected]>
1 parent cedc56e commit 3e4ff53

File tree

1 file changed

+39
-69
lines changed

1 file changed

+39
-69
lines changed

opal/mca/mpool/rgpusm/mpool_rgpusm_module.c

Lines changed: 39 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
* Copyright (c) 2007 Mellanox Technologies. All rights reserved.
1616
* Copyright (c) 2010 IBM Corporation. All rights reserved.
1717
* Copyright (c) 2012-2015 NVIDIA Corporation. All rights reserved.
18-
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
18+
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
1919
* reserved.
2020
*
2121
* $COPYRIGHT$
@@ -113,11 +113,11 @@ static inline bool mca_mpool_rgpusm_deregister_lru (mca_mpool_base_module_t *mpo
113113
mpool->rcache->rcache_delete(mpool->rcache, old_reg);
114114

115115
/* Drop the rcache lock while we deregister the memory */
116-
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
116+
opal_mutex_unlock (&mpool->rcache->lock);
117117
assert(old_reg->ref_count == 0);
118118
rc = mpool_rgpusm->resources.deregister_mem(mpool_rgpusm->resources.reg_data,
119119
old_reg);
120-
OPAL_THREAD_LOCK(&mpool->rcache->lock);
120+
opal_mutex_lock (&mpool->rcache->lock);
121121

122122
/* This introduces a potential leak of registrations if
123123
the deregistration fails to occur as we no longer have
@@ -242,7 +242,7 @@ int mca_mpool_rgpusm_register (mca_mpool_base_module_t *mpool, void *addr,
242242
}
243243

244244
/* Check to see if memory is registered and stored in the cache. */
245-
OPAL_THREAD_LOCK(&mpool->rcache->lock);
245+
opal_mutex_lock (&mpool->rcache->lock);
246246
mpool->rcache->rcache_find(mpool->rcache, addr, size, reg);
247247

248248
/* If *reg is not NULL, we have a registration. Let us see if the
@@ -306,7 +306,7 @@ int mca_mpool_rgpusm_register (mca_mpool_base_module_t *mpool, void *addr,
306306
(opal_list_item_t*)(*reg));
307307
}
308308
(*reg)->ref_count++;
309-
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
309+
opal_mutex_unlock (&mpool->rcache->lock);
310310
opal_output(-1, "reg->ref_count=%d", (int)(*reg)->ref_count);
311311
opal_output_verbose(80, mca_mpool_rgpusm_component.output,
312312
"RGPUSM: Found entry in cache addr=%p, size=%d", addr, (int)size);
@@ -322,7 +322,7 @@ int mca_mpool_rgpusm_register (mca_mpool_base_module_t *mpool, void *addr,
322322

323323
item = opal_free_list_get (&mpool_rgpusm->reg_list);
324324
if(NULL == item) {
325-
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
325+
opal_mutex_unlock (&mpool->rcache->lock);
326326
return OPAL_ERR_OUT_OF_RESOURCE;
327327
}
328328
rgpusm_reg = (mca_mpool_common_cuda_reg_t*)item;
@@ -399,7 +399,7 @@ int mca_mpool_rgpusm_register (mca_mpool_base_module_t *mpool, void *addr,
399399
}
400400

401401
if(rc != OPAL_SUCCESS) {
402-
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
402+
opal_mutex_unlock (&mpool->rcache->lock);
403403
opal_free_list_return (&mpool_rgpusm->reg_list, item);
404404
return rc;
405405
}
@@ -439,7 +439,7 @@ int mca_mpool_rgpusm_register (mca_mpool_base_module_t *mpool, void *addr,
439439
}
440440

441441
if(rc != OPAL_SUCCESS) {
442-
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
442+
opal_mutex_unlock (&mpool->rcache->lock);
443443
opal_free_list_return (&mpool_rgpusm->reg_list, item);
444444
/* We cannot recover from this. We can be here if the size of
445445
* the cache is smaller than the amount of memory we are
@@ -454,10 +454,8 @@ int mca_mpool_rgpusm_register (mca_mpool_base_module_t *mpool, void *addr,
454454

455455
rgpusm_reg->base.ref_count++;
456456
*reg = (mca_mpool_base_registration_t *)rgpusm_reg;
457-
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
457+
opal_mutex_unlock (&mpool->rcache->lock);
458458

459-
/* Cleanup any vmas that we have deferred deletion on */
460-
mpool->rcache->rcache_clean(mpool->rcache);
461459
return OPAL_SUCCESS;
462460
}
463461

@@ -483,7 +481,7 @@ int mca_mpool_rgpusm_find(struct mca_mpool_base_module_t *mpool, void *addr,
483481
base = addr;
484482
bound = base + size - 1; /* To keep cache hits working correctly */
485483

486-
OPAL_THREAD_LOCK(&mpool->rcache->lock);
484+
opal_mutex_lock (&mpool->rcache->lock);
487485
opal_output(-1, "Looking for addr=%p, size=%d", addr, (int)size);
488486
rc = mpool->rcache->rcache_find(mpool->rcache, addr, size, reg);
489487
if(*reg != NULL && mca_mpool_rgpusm_component.leave_pinned) {
@@ -495,12 +493,12 @@ int mca_mpool_rgpusm_find(struct mca_mpool_base_module_t *mpool, void *addr,
495493
} else {
496494
mpool_rgpusm->stat_cache_notfound++;
497495
}
498-
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
496+
opal_mutex_unlock (&mpool->rcache->lock);
499497

500498
return rc;
501499
}
502500

503-
static inline bool registration_is_cachebale(mca_mpool_base_registration_t *reg)
501+
static inline bool registration_is_cacheable(mca_mpool_base_registration_t *reg)
504502
{
505503
return !(reg->flags &
506504
(MCA_MPOOL_FLAGS_CACHE_BYPASS |
@@ -514,14 +512,14 @@ int mca_mpool_rgpusm_deregister(struct mca_mpool_base_module_t *mpool,
514512
int rc = OPAL_SUCCESS;
515513
assert(reg->ref_count > 0);
516514

517-
OPAL_THREAD_LOCK(&mpool->rcache->lock);
515+
opal_mutex_lock (&mpool->rcache->lock);
518516
reg->ref_count--;
519517
opal_output(-1, "Deregister: reg->ref_count=%d", (int)reg->ref_count);
520518
if(reg->ref_count > 0) {
521-
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
519+
opal_mutex_unlock (&mpool->rcache->lock);
522520
return OPAL_SUCCESS;
523521
}
524-
if(mca_mpool_rgpusm_component.leave_pinned && registration_is_cachebale(reg))
522+
if(mca_mpool_rgpusm_component.leave_pinned && registration_is_cacheable(reg))
525523
{
526524
/* if leave_pinned is set don't deregister memory, but put it
527525
* on LRU list for future use */
@@ -535,7 +533,7 @@ int mca_mpool_rgpusm_deregister(struct mca_mpool_base_module_t *mpool,
535533
mpool->rcache->rcache_delete(mpool->rcache, reg);
536534

537535
/* Drop the rcache lock before deregistering the memory */
538-
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
536+
opal_mutex_unlock (&mpool->rcache->lock);
539537

540538
{
541539
mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t *)mpool;
@@ -545,17 +543,14 @@ int mca_mpool_rgpusm_deregister(struct mca_mpool_base_module_t *mpool,
545543
reg);
546544
}
547545

548-
OPAL_THREAD_LOCK(&mpool->rcache->lock);
546+
opal_mutex_lock (&mpool->rcache->lock);
549547

550548
if(OPAL_SUCCESS == rc) {
551549
opal_free_list_return (&mpool_rgpusm->reg_list,
552550
(opal_free_list_item_t*)reg);
553551
}
554552
}
555-
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
556-
557-
/* Cleanup any vmas that we have deferred deletion on */
558-
mpool->rcache->rcache_clean(mpool->rcache);
553+
opal_mutex_unlock (&mpool->rcache->lock);
559554

560555
return rc;
561556
}
@@ -572,7 +567,7 @@ int mca_mpool_rgpusm_deregister_no_lock(struct mca_mpool_base_module_t *mpool,
572567
if(reg->ref_count > 0) {
573568
return OPAL_SUCCESS;
574569
}
575-
if(mca_mpool_rgpusm_component.leave_pinned && registration_is_cachebale(reg))
570+
if(mca_mpool_rgpusm_component.leave_pinned && registration_is_cacheable(reg))
576571
{
577572
/* if leave_pinned is set don't deregister memory, but put it
578573
* on LRU list for future use */
@@ -599,15 +594,28 @@ int mca_mpool_rgpusm_deregister_no_lock(struct mca_mpool_base_module_t *mpool,
599594
return rc;
600595
}
601596

602-
#define RGPUSM_MPOOL_NREGS 100
597+
static int iterate_dereg_finalize (mca_mpool_base_registration_t *rgpusm_reg, void *ctx)
598+
{
599+
mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t *) ctx;
600+
601+
if ((mca_mpool_base_module_t *) mpool_rgpusm != rgpusm_reg->mpool) {
602+
return 0;
603+
}
604+
605+
if (registration_is_cacheable (rgpusm_reg)) {
606+
opal_list_remove_item (&mpool_rgpusm->lru_list, (opal_list_item_t *) rgpusm_reg);
607+
}
608+
609+
/* set the reference count to 0; otherwise dereg will fail on assert */
610+
rgpusm_reg->ref_count = 0;
611+
(void) mpool_rgpusm->resources.deregister_mem (mpool_rgpusm->resources.reg_data, rgpusm_reg);
612+
613+
return 0;
614+
}
603615

604616
void mca_mpool_rgpusm_finalize(struct mca_mpool_base_module_t *mpool)
605617
{
606618
mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t*)mpool;
607-
mca_mpool_base_registration_t *reg;
608-
mca_mpool_base_registration_t *regs[RGPUSM_MPOOL_NREGS];
609-
int reg_cnt, i;
610-
int rc;
611619

612620
/* Statistic */
613621
if(true == mca_mpool_rgpusm_component.print_stats) {
@@ -619,49 +627,11 @@ void mca_mpool_rgpusm_finalize(struct mca_mpool_base_module_t *mpool)
619627
mpool_rgpusm->stat_evicted);
620628
}
621629

622-
OPAL_THREAD_LOCK(&mpool->rcache->lock);
623-
do {
624-
reg_cnt = mpool->rcache->rcache_find_all(mpool->rcache, 0, (size_t)-1,
625-
regs, RGPUSM_MPOOL_NREGS);
626-
opal_output(-1, "Registration size at finalize = %d", reg_cnt);
627-
628-
for(i = 0; i < reg_cnt; i++) {
629-
reg = regs[i];
630630

631-
if(reg->ref_count) {
632-
reg->ref_count = 0; /* otherway dereg will fail on assert */
633-
} else if (mca_mpool_rgpusm_component.leave_pinned) {
634-
opal_list_remove_item(&mpool_rgpusm->lru_list,
635-
(opal_list_item_t*)reg);
636-
}
637-
638-
/* Remove from rcache first */
639-
mpool->rcache->rcache_delete(mpool->rcache, reg);
640-
641-
/* Drop lock before deregistering memory */
642-
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
643-
assert(reg->ref_count == 0);
644-
rc = mpool_rgpusm->resources.deregister_mem(mpool_rgpusm->resources.reg_data,
645-
reg);
646-
OPAL_THREAD_LOCK(&mpool->rcache->lock);
647-
648-
if(rc != OPAL_SUCCESS) {
649-
/* Potentially lose track of registrations
650-
do we have to put it back? */
651-
continue;
652-
}
653-
654-
opal_free_list_return (&mpool_rgpusm->reg_list,
655-
(opal_free_list_item_t *) reg);
656-
}
657-
} while(reg_cnt == RGPUSM_MPOOL_NREGS);
631+
(void) mpool->rcache->rcache_iterate (mpool->rcache, NULL, (size_t) -1,
632+
iterate_dereg_finalize, (void *) mpool);
658633

659634
OBJ_DESTRUCT(&mpool_rgpusm->lru_list);
660635
OBJ_DESTRUCT(&mpool_rgpusm->reg_list);
661-
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
662-
663-
/* Cleanup any vmas that we have deferred deletion on */
664-
mpool->rcache->rcache_clean(mpool->rcache);
665-
666636
}
667637

0 commit comments

Comments
 (0)