Skip to content

Commit c2528da

Browse files
committed
Merge pull request open-mpi#595 from rolfv/pr/add-cache-empty-code-2.x
Add ability for user to empty the CUDA IPC registration cache when it is full
2 parents 97ba09f + aa5ddfe commit c2528da

File tree

3 files changed, with 42 additions and 9 deletions.

opal/mca/mpool/rgpusm/mpool_rgpusm.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
* Copyright (c) 2004-2005 The Regents of the University of California.
1212
* All rights reserved.
1313
* Copyright (c) 2006 Voltaire. All rights reserved.
14-
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
14+
* Copyright (c) 2012-2015 NVIDIA Corporation. All rights reserved.
1515
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
1616
* reserved.
1717
*
@@ -41,6 +41,7 @@ struct mca_mpool_rgpusm_component_t {
4141
bool print_stats;
4242
int leave_pinned;
4343
int output;
44+
bool empty_cache;
4445
};
4546
typedef struct mca_mpool_rgpusm_component_t mca_mpool_rgpusm_component_t;
4647

opal/mca/mpool/rgpusm/mpool_rgpusm_component.c

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* All rights reserved.
1313
* Copyright (c) 2006 Voltaire. All rights reserved.
1414
* Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved.
15-
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
15+
* Copyright (c) 2012-2015 NVIDIA Corporation. All rights reserved.
1616
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
1717
* reserved.
1818
*
@@ -127,6 +127,15 @@ static int rgpusm_register(void)
127127
MCA_BASE_VAR_SCOPE_READONLY,
128128
&opal_mpool_rgpusm_verbose);
129129

130+
/* Force emptying of entire registration cache when it gets full */
131+
mca_mpool_rgpusm_component.empty_cache = false;
132+
(void) mca_base_component_var_register(&mca_mpool_rgpusm_component.super.mpool_version,
133+
"empty_cache", "When set, empty entire registration cache when it is full",
134+
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
135+
OPAL_INFO_LVL_5,
136+
MCA_BASE_VAR_SCOPE_READONLY,
137+
&mca_mpool_rgpusm_component.empty_cache);
138+
130139
return OPAL_SUCCESS;
131140
}
132141

opal/mca/mpool/rgpusm/mpool_rgpusm_module.c

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
* Copyright (c) 2006 Voltaire. All rights reserved.
1515
* Copyright (c) 2007 Mellanox Technologies. All rights reserved.
1616
* Copyright (c) 2010 IBM Corporation. All rights reserved.
17-
* Copyright (c) 2012-2014 NVIDIA Corporation. All rights reserved.
17+
* Copyright (c) 2012-2015 NVIDIA Corporation. All rights reserved.
1818
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
1919
* reserved.
2020
*
@@ -406,12 +406,35 @@ int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr,
406406

407407
opal_output_verbose(80, mca_mpool_rgpusm_component.output,
408408
"RGPUSM: About to insert in rgpusm cache addr=%p, size=%d", addr, (int)size);
409-
while((rc = mpool->rcache->rcache_insert(mpool->rcache, (mca_mpool_base_registration_t *)rgpusm_reg,
410-
mca_mpool_rgpusm_component.rcache_size_limit)) ==
411-
OPAL_ERR_TEMP_OUT_OF_RESOURCE) {
412-
opal_output(-1, "No room in the cache - boot one out");
413-
if (!mca_mpool_rgpusm_deregister_lru(mpool)) {
414-
break;
409+
rc = mpool->rcache->rcache_insert(mpool->rcache, (mca_mpool_base_registration_t *)rgpusm_reg,
410+
mca_mpool_rgpusm_component.rcache_size_limit);
411+
if (OPAL_ERR_TEMP_OUT_OF_RESOURCE == rc) {
412+
opal_output_verbose(40, mca_mpool_rgpusm_component.output,
413+
"RGPUSM: No room in the cache - boot the first one out");
414+
(void)mca_mpool_rgpusm_deregister_lru(mpool);
415+
if (mca_mpool_rgpusm_component.empty_cache) {
416+
int remNum = 1;
417+
/* Empty out every registration from LRU until it is empty */
418+
opal_output_verbose(40, mca_mpool_rgpusm_component.output,
419+
"RGPUSM: About to delete all the unused entries in the cache");
420+
while (mca_mpool_rgpusm_deregister_lru(mpool)) {
421+
remNum++;
422+
}
423+
opal_output_verbose(40, mca_mpool_rgpusm_component.output,
424+
"RGPUSM: Deleted and deregistered %d entries", remNum);
425+
rc = mpool->rcache->rcache_insert(mpool->rcache, (mca_mpool_base_registration_t *)rgpusm_reg,
426+
mca_mpool_rgpusm_component.rcache_size_limit);
427+
} else {
428+
/* Check for room after one removal. If not, remove another one until there is space */
429+
while((rc = mpool->rcache->rcache_insert(mpool->rcache, (mca_mpool_base_registration_t *)rgpusm_reg,
430+
mca_mpool_rgpusm_component.rcache_size_limit)) ==
431+
OPAL_ERR_TEMP_OUT_OF_RESOURCE) {
432+
opal_output_verbose(40, mca_mpool_rgpusm_component.output,
433+
"RGPUSM: No room in the cache - boot one out");
434+
if (!mca_mpool_rgpusm_deregister_lru(mpool)) {
435+
break;
436+
}
437+
}
415438
}
416439
}
417440

0 commit comments

Comments
 (0)