Skip to content
This repository was archived by the owner on Sep 30, 2022. It is now read-only.

Commit fb636c7

Browse files
committed
Merge pull request #1218 from hjelmn/v2.x_rcache
mpool/rgpusm: update for rcache threading fixes
2 parents 8ce09f1 + 3e4ff53 commit fb636c7

File tree

1 file changed

+39
-69
lines changed

1 file changed

+39
-69
lines changed

opal/mca/mpool/rgpusm/mpool_rgpusm_module.c

Lines changed: 39 additions & 69 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@
1515
* Copyright (c) 2007 Mellanox Technologies. All rights reserved.
1616
* Copyright (c) 2010 IBM Corporation. All rights reserved.
1717
* Copyright (c) 2012-2015 NVIDIA Corporation. All rights reserved.
18-
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
18+
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
1919
* reserved.
2020
*
2121
* $COPYRIGHT$
@@ -113,11 +113,11 @@ static inline bool mca_mpool_rgpusm_deregister_lru (mca_mpool_base_module_t *mpo
113113
mpool->rcache->rcache_delete(mpool->rcache, old_reg);
114114

115115
/* Drop the rcache lock while we deregister the memory */
116-
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
116+
opal_mutex_unlock (&mpool->rcache->lock);
117117
assert(old_reg->ref_count == 0);
118118
rc = mpool_rgpusm->resources.deregister_mem(mpool_rgpusm->resources.reg_data,
119119
old_reg);
120-
OPAL_THREAD_LOCK(&mpool->rcache->lock);
120+
opal_mutex_lock (&mpool->rcache->lock);
121121

122122
/* This introduces a potential leak of registrations if
123123
the deregistration fails to occur as we no longer have
@@ -242,7 +242,7 @@ int mca_mpool_rgpusm_register (mca_mpool_base_module_t *mpool, void *addr,
242242
}
243243

244244
/* Check to see if memory is registered and stored in the cache. */
245-
OPAL_THREAD_LOCK(&mpool->rcache->lock);
245+
opal_mutex_lock (&mpool->rcache->lock);
246246
mpool->rcache->rcache_find(mpool->rcache, addr, size, reg);
247247

248248
/* If *reg is not NULL, we have a registration. Let us see if the
@@ -306,7 +306,7 @@ int mca_mpool_rgpusm_register (mca_mpool_base_module_t *mpool, void *addr,
306306
(opal_list_item_t*)(*reg));
307307
}
308308
(*reg)->ref_count++;
309-
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
309+
opal_mutex_unlock (&mpool->rcache->lock);
310310
opal_output(-1, "reg->ref_count=%d", (int)(*reg)->ref_count);
311311
opal_output_verbose(80, mca_mpool_rgpusm_component.output,
312312
"RGPUSM: Found entry in cache addr=%p, size=%d", addr, (int)size);
@@ -322,7 +322,7 @@ int mca_mpool_rgpusm_register (mca_mpool_base_module_t *mpool, void *addr,
322322

323323
item = opal_free_list_get (&mpool_rgpusm->reg_list);
324324
if(NULL == item) {
325-
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
325+
opal_mutex_unlock (&mpool->rcache->lock);
326326
return OPAL_ERR_OUT_OF_RESOURCE;
327327
}
328328
rgpusm_reg = (mca_mpool_common_cuda_reg_t*)item;
@@ -399,7 +399,7 @@ int mca_mpool_rgpusm_register (mca_mpool_base_module_t *mpool, void *addr,
399399
}
400400

401401
if(rc != OPAL_SUCCESS) {
402-
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
402+
opal_mutex_unlock (&mpool->rcache->lock);
403403
opal_free_list_return (&mpool_rgpusm->reg_list, item);
404404
return rc;
405405
}
@@ -439,7 +439,7 @@ int mca_mpool_rgpusm_register (mca_mpool_base_module_t *mpool, void *addr,
439439
}
440440

441441
if(rc != OPAL_SUCCESS) {
442-
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
442+
opal_mutex_unlock (&mpool->rcache->lock);
443443
opal_free_list_return (&mpool_rgpusm->reg_list, item);
444444
/* We cannot recover from this. We can be here if the size of
445445
* the cache is smaller than the amount of memory we are
@@ -454,10 +454,8 @@ int mca_mpool_rgpusm_register (mca_mpool_base_module_t *mpool, void *addr,
454454

455455
rgpusm_reg->base.ref_count++;
456456
*reg = (mca_mpool_base_registration_t *)rgpusm_reg;
457-
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
457+
opal_mutex_unlock (&mpool->rcache->lock);
458458

459-
/* Cleanup any vmas that we have deferred deletion on */
460-
mpool->rcache->rcache_clean(mpool->rcache);
461459
return OPAL_SUCCESS;
462460
}
463461

@@ -483,7 +481,7 @@ int mca_mpool_rgpusm_find(struct mca_mpool_base_module_t *mpool, void *addr,
483481
base = addr;
484482
bound = base + size - 1; /* To keep cache hits working correctly */
485483

486-
OPAL_THREAD_LOCK(&mpool->rcache->lock);
484+
opal_mutex_lock (&mpool->rcache->lock);
487485
opal_output(-1, "Looking for addr=%p, size=%d", addr, (int)size);
488486
rc = mpool->rcache->rcache_find(mpool->rcache, addr, size, reg);
489487
if(*reg != NULL && mca_mpool_rgpusm_component.leave_pinned) {
@@ -495,12 +493,12 @@ int mca_mpool_rgpusm_find(struct mca_mpool_base_module_t *mpool, void *addr,
495493
} else {
496494
mpool_rgpusm->stat_cache_notfound++;
497495
}
498-
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
496+
opal_mutex_unlock (&mpool->rcache->lock);
499497

500498
return rc;
501499
}
502500

503-
static inline bool registration_is_cachebale(mca_mpool_base_registration_t *reg)
501+
static inline bool registration_is_cacheable(mca_mpool_base_registration_t *reg)
504502
{
505503
return !(reg->flags &
506504
(MCA_MPOOL_FLAGS_CACHE_BYPASS |
@@ -514,14 +512,14 @@ int mca_mpool_rgpusm_deregister(struct mca_mpool_base_module_t *mpool,
514512
int rc = OPAL_SUCCESS;
515513
assert(reg->ref_count > 0);
516514

517-
OPAL_THREAD_LOCK(&mpool->rcache->lock);
515+
opal_mutex_lock (&mpool->rcache->lock);
518516
reg->ref_count--;
519517
opal_output(-1, "Deregister: reg->ref_count=%d", (int)reg->ref_count);
520518
if(reg->ref_count > 0) {
521-
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
519+
opal_mutex_unlock (&mpool->rcache->lock);
522520
return OPAL_SUCCESS;
523521
}
524-
if(mca_mpool_rgpusm_component.leave_pinned && registration_is_cachebale(reg))
522+
if(mca_mpool_rgpusm_component.leave_pinned && registration_is_cacheable(reg))
525523
{
526524
/* if leave_pinned is set don't deregister memory, but put it
527525
* on LRU list for future use */
@@ -535,7 +533,7 @@ int mca_mpool_rgpusm_deregister(struct mca_mpool_base_module_t *mpool,
535533
mpool->rcache->rcache_delete(mpool->rcache, reg);
536534

537535
/* Drop the rcache lock before deregistering the memory */
538-
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
536+
opal_mutex_unlock (&mpool->rcache->lock);
539537

540538
{
541539
mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t *)mpool;
@@ -545,17 +543,14 @@ int mca_mpool_rgpusm_deregister(struct mca_mpool_base_module_t *mpool,
545543
reg);
546544
}
547545

548-
OPAL_THREAD_LOCK(&mpool->rcache->lock);
546+
opal_mutex_lock (&mpool->rcache->lock);
549547

550548
if(OPAL_SUCCESS == rc) {
551549
opal_free_list_return (&mpool_rgpusm->reg_list,
552550
(opal_free_list_item_t*)reg);
553551
}
554552
}
555-
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
556-
557-
/* Cleanup any vmas that we have deferred deletion on */
558-
mpool->rcache->rcache_clean(mpool->rcache);
553+
opal_mutex_unlock (&mpool->rcache->lock);
559554

560555
return rc;
561556
}
@@ -572,7 +567,7 @@ int mca_mpool_rgpusm_deregister_no_lock(struct mca_mpool_base_module_t *mpool,
572567
if(reg->ref_count > 0) {
573568
return OPAL_SUCCESS;
574569
}
575-
if(mca_mpool_rgpusm_component.leave_pinned && registration_is_cachebale(reg))
570+
if(mca_mpool_rgpusm_component.leave_pinned && registration_is_cacheable(reg))
576571
{
577572
/* if leave_pinned is set don't deregister memory, but put it
578573
* on LRU list for future use */
@@ -599,15 +594,28 @@ int mca_mpool_rgpusm_deregister_no_lock(struct mca_mpool_base_module_t *mpool,
599594
return rc;
600595
}
601596

602-
#define RGPUSM_MPOOL_NREGS 100
597+
/*
 * Per-registration callback used while tearing down an rgpusm mpool module.
 * mca_mpool_rgpusm_finalize() hands this function to rcache_iterate() with
 * the module pointer as the context argument.
 *
 * rgpusm_reg  registration currently being visited
 * ctx         the mca_mpool_rgpusm_module_t being finalized, passed as void *
 *
 * Registrations whose mpool field does not match the module in ctx are left
 * untouched. For matching registrations the reference count is forced to 0
 * and the module's deregister_mem resource callback is invoked; its return
 * code is deliberately discarded. The registration is not handed back to
 * reg_list here.
 *
 * Always returns 0 (presumably "keep iterating" -- confirm against the
 * rcache_iterate contract).
 */
static int iterate_dereg_finalize (mca_mpool_base_registration_t *rgpusm_reg, void *ctx)
598+
{
599+
mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t *) ctx;
600+
601+
/* skip registrations that belong to a different mpool module */
if ((mca_mpool_base_module_t *) mpool_rgpusm != rgpusm_reg->mpool) {
602+
return 0;
603+
}
604+
605+
/* unlink cacheable registrations from this module's LRU list.
 * NOTE(review): the removal is not conditioned on ref_count or on
 * leave_pinned -- confirm that every cacheable registration reaching this
 * point is actually linked on lru_list. */
if (registration_is_cacheable (rgpusm_reg)) {
606+
opal_list_remove_item (&mpool_rgpusm->lru_list, (opal_list_item_t *) rgpusm_reg);
607+
}
608+
609+
/* set the reference count to 0 otherwise dereg will fail on assert */
610+
rgpusm_reg->ref_count = 0;
611+
/* best effort: the deregistration result is intentionally ignored */
(void) mpool_rgpusm->resources.deregister_mem (mpool_rgpusm->resources.reg_data, rgpusm_reg);
612+
613+
return 0;
614+
}
603615

604616
void mca_mpool_rgpusm_finalize(struct mca_mpool_base_module_t *mpool)
605617
{
606618
mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t*)mpool;
607-
mca_mpool_base_registration_t *reg;
608-
mca_mpool_base_registration_t *regs[RGPUSM_MPOOL_NREGS];
609-
int reg_cnt, i;
610-
int rc;
611619

612620
/* Statistic */
613621
if(true == mca_mpool_rgpusm_component.print_stats) {
@@ -619,49 +627,11 @@ void mca_mpool_rgpusm_finalize(struct mca_mpool_base_module_t *mpool)
619627
mpool_rgpusm->stat_evicted);
620628
}
621629

622-
OPAL_THREAD_LOCK(&mpool->rcache->lock);
623-
do {
624-
reg_cnt = mpool->rcache->rcache_find_all(mpool->rcache, 0, (size_t)-1,
625-
regs, RGPUSM_MPOOL_NREGS);
626-
opal_output(-1, "Registration size at finalize = %d", reg_cnt);
627-
628-
for(i = 0; i < reg_cnt; i++) {
629-
reg = regs[i];
630630

631-
if(reg->ref_count) {
632-
reg->ref_count = 0; /* otherway dereg will fail on assert */
633-
} else if (mca_mpool_rgpusm_component.leave_pinned) {
634-
opal_list_remove_item(&mpool_rgpusm->lru_list,
635-
(opal_list_item_t*)reg);
636-
}
637-
638-
/* Remove from rcache first */
639-
mpool->rcache->rcache_delete(mpool->rcache, reg);
640-
641-
/* Drop lock before deregistering memory */
642-
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
643-
assert(reg->ref_count == 0);
644-
rc = mpool_rgpusm->resources.deregister_mem(mpool_rgpusm->resources.reg_data,
645-
reg);
646-
OPAL_THREAD_LOCK(&mpool->rcache->lock);
647-
648-
if(rc != OPAL_SUCCESS) {
649-
/* Potentially lose track of registrations
650-
do we have to put it back? */
651-
continue;
652-
}
653-
654-
opal_free_list_return (&mpool_rgpusm->reg_list,
655-
(opal_free_list_item_t *) reg);
656-
}
657-
} while(reg_cnt == RGPUSM_MPOOL_NREGS);
631+
(void) mpool->rcache->rcache_iterate (mpool->rcache, NULL, (size_t) -1,
632+
iterate_dereg_finalize, (void *) mpool);
658633

659634
OBJ_DESTRUCT(&mpool_rgpusm->lru_list);
660635
OBJ_DESTRUCT(&mpool_rgpusm->reg_list);
661-
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
662-
663-
/* Cleanup any vmas that we have deferred deletion on */
664-
mpool->rcache->rcache_clean(mpool->rcache);
665-
666636
}
667637

0 commit comments

Comments
 (0)