Skip to content

Commit e843ebf

Browse files
author
rhc54
committed
Merge pull request open-mpi#781 from rolfv/pr/cuda-memlimit-fix
Add the ability to empty the rgpusm cache when full if the user requests it
2 parents e605566 + a7d2b07 commit e843ebf

File tree

3 files changed

+41
-8
lines changed

3 files changed

+41
-8
lines changed

ompi/mca/mpool/rgpusm/mpool_rgpusm.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
1212
* Copyright (c) 2006 Voltaire. All rights reserved.
13-
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
13+
* Copyright (c) 2012-2015 NVIDIA Corporation. All rights reserved.
1414
*
1515
* $COPYRIGHT$
1616
*
@@ -38,6 +38,7 @@ struct mca_mpool_rgpusm_component_t {
3838
bool print_stats;
3939
int leave_pinned;
4040
int output;
41+
bool empty_cache;
4142
};
4243
typedef struct mca_mpool_rgpusm_component_t mca_mpool_rgpusm_component_t;
4344

ompi/mca/mpool/rgpusm/mpool_rgpusm_component.c

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
* All rights reserved.
1212
* Copyright (c) 2006 Voltaire. All rights reserved.
1313
* Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved.
14-
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
14+
* Copyright (c) 2012-2015 NVIDIA Corporation. All rights reserved.
1515
*
1616
* $COPYRIGHT$
1717
*
@@ -126,6 +126,15 @@ static int rgpusm_register(void)
126126
MCA_BASE_VAR_SCOPE_READONLY,
127127
&ompi_mpool_rgpusm_verbose);
128128

129+
/* Force emptying of entire registration cache when it gets full */
130+
mca_mpool_rgpusm_component.empty_cache = false;
131+
(void) mca_base_component_var_register(&mca_mpool_rgpusm_component.super.mpool_version,
132+
"empty_cache", "When set, empty entire registration cache when it is full",
133+
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
134+
OPAL_INFO_LVL_5,
135+
MCA_BASE_VAR_SCOPE_READONLY,
136+
&mca_mpool_rgpusm_component.empty_cache);
137+
129138
return OMPI_SUCCESS;
130139
}
131140

ompi/mca/mpool/rgpusm/mpool_rgpusm_module.c

Lines changed: 29 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -412,12 +412,35 @@ int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr,
412412
opal_output_verbose(80, mca_mpool_rgpusm_component.output,
413413
"RGPUSM: About to insert in rgpusm cache addr=%p, size=%d", addr, (int)size);
414414
SET_PAGE_ALIGNMENT_TO_ZERO();
415-
while((rc = mpool->rcache->rcache_insert(mpool->rcache, (mca_mpool_base_registration_t *)rgpusm_reg,
416-
mca_mpool_rgpusm_component.rcache_size_limit)) ==
417-
OMPI_ERR_TEMP_OUT_OF_RESOURCE) {
418-
opal_output(-1, "No room in the cache - boot one out");
419-
if (!mca_mpool_rgpusm_deregister_lru(mpool)) {
420-
break;
415+
rc = mpool->rcache->rcache_insert(mpool->rcache, (mca_mpool_base_registration_t *)rgpusm_reg,
416+
mca_mpool_rgpusm_component.rcache_size_limit);
417+
if (OPAL_ERR_TEMP_OUT_OF_RESOURCE == rc) {
418+
opal_output_verbose(40, mca_mpool_rgpusm_component.output,
419+
"RGPUSM: No room in the cache - boot the first one out");
420+
(void)mca_mpool_rgpusm_deregister_lru(mpool);
421+
if (mca_mpool_rgpusm_component.empty_cache) {
422+
int remNum = 1;
423+
/* Empty out every registration from LRU until it is empty */
424+
opal_output_verbose(40, mca_mpool_rgpusm_component.output,
425+
"RGPUSM: About to delete all the unused entries in the cache");
426+
while (mca_mpool_rgpusm_deregister_lru(mpool)) {
427+
remNum++;
428+
}
429+
opal_output_verbose(40, mca_mpool_rgpusm_component.output,
430+
"RGPUSM: Deleted and deregistered %d entries", remNum);
431+
rc = mpool->rcache->rcache_insert(mpool->rcache, (mca_mpool_base_registration_t *)rgpusm_reg,
432+
mca_mpool_rgpusm_component.rcache_size_limit);
433+
} else {
434+
/* Check for room after one removal. If not, remove another one until there is space */
435+
while((rc = mpool->rcache->rcache_insert(mpool->rcache, (mca_mpool_base_registration_t *)rgpusm_reg,
436+
mca_mpool_rgpusm_component.rcache_size_limit)) ==
437+
OPAL_ERR_TEMP_OUT_OF_RESOURCE) {
438+
opal_output_verbose(40, mca_mpool_rgpusm_component.output,
439+
"RGPUSM: No room in the cache - boot one out");
440+
if (!mca_mpool_rgpusm_deregister_lru(mpool)) {
441+
break;
442+
}
443+
}
421444
}
422445
}
423446
RESTORE_PAGE_ALIGNMENT();

0 commit comments

Comments
 (0)