Skip to content

Commit aa5ddfe

Browse files
author
Rolf vandeVaart
committed
Add ability for user to empty the CUDA IPC registration cache when it is full
(cherry picked from commit open-mpi/ompi@7da614c)
1 parent 97ba09f commit aa5ddfe

File tree

3 files changed

+42
-9
lines changed

3 files changed

+42
-9
lines changed

opal/mca/mpool/rgpusm/mpool_rgpusm.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* Copyright (c) 2004-2005 The Regents of the University of California.
1212
* All rights reserved.
1313
* Copyright (c) 2006 Voltaire. All rights reserved.
14-
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
14+
* Copyright (c) 2012-2015 NVIDIA Corporation. All rights reserved.
1515
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
1616
* reserved.
1717
*
@@ -41,6 +41,7 @@ struct mca_mpool_rgpusm_component_t {
4141
bool print_stats;
4242
int leave_pinned;
4343
int output;
44+
bool empty_cache;
4445
};
4546
typedef struct mca_mpool_rgpusm_component_t mca_mpool_rgpusm_component_t;
4647

opal/mca/mpool/rgpusm/mpool_rgpusm_component.c

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* All rights reserved.
1313
* Copyright (c) 2006 Voltaire. All rights reserved.
1414
* Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved.
15-
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
15+
* Copyright (c) 2012-2015 NVIDIA Corporation. All rights reserved.
1616
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
1717
* reserved.
1818
*
@@ -127,6 +127,15 @@ static int rgpusm_register(void)
127127
MCA_BASE_VAR_SCOPE_READONLY,
128128
&opal_mpool_rgpusm_verbose);
129129

130+
/* Force emptying of entire registration cache when it gets full */
131+
mca_mpool_rgpusm_component.empty_cache = false;
132+
(void) mca_base_component_var_register(&mca_mpool_rgpusm_component.super.mpool_version,
133+
"empty_cache", "When set, empty entire registration cache when it is full",
134+
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
135+
OPAL_INFO_LVL_5,
136+
MCA_BASE_VAR_SCOPE_READONLY,
137+
&mca_mpool_rgpusm_component.empty_cache);
138+
130139
return OPAL_SUCCESS;
131140
}
132141

opal/mca/mpool/rgpusm/mpool_rgpusm_module.c

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
* Copyright (c) 2006 Voltaire. All rights reserved.
1515
* Copyright (c) 2007 Mellanox Technologies. All rights reserved.
1616
* Copyright (c) 2010 IBM Corporation. All rights reserved.
17-
* Copyright (c) 2012-2014 NVIDIA Corporation. All rights reserved.
17+
* Copyright (c) 2012-2015 NVIDIA Corporation. All rights reserved.
1818
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
1919
* reserved.
2020
*
@@ -406,12 +406,35 @@ int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr,
406406

407407
opal_output_verbose(80, mca_mpool_rgpusm_component.output,
408408
"RGPUSM: About to insert in rgpusm cache addr=%p, size=%d", addr, (int)size);
409-
while((rc = mpool->rcache->rcache_insert(mpool->rcache, (mca_mpool_base_registration_t *)rgpusm_reg,
410-
mca_mpool_rgpusm_component.rcache_size_limit)) ==
411-
OPAL_ERR_TEMP_OUT_OF_RESOURCE) {
412-
opal_output(-1, "No room in the cache - boot one out");
413-
if (!mca_mpool_rgpusm_deregister_lru(mpool)) {
414-
break;
409+
rc = mpool->rcache->rcache_insert(mpool->rcache, (mca_mpool_base_registration_t *)rgpusm_reg,
410+
mca_mpool_rgpusm_component.rcache_size_limit);
411+
if (OPAL_ERR_TEMP_OUT_OF_RESOURCE == rc) {
412+
opal_output_verbose(40, mca_mpool_rgpusm_component.output,
413+
"RGPUSM: No room in the cache - boot the first one out");
414+
(void)mca_mpool_rgpusm_deregister_lru(mpool);
415+
if (mca_mpool_rgpusm_component.empty_cache) {
416+
int remNum = 1;
417+
/* Evict registrations from the LRU list one at a time until the list is empty */
418+
opal_output_verbose(40, mca_mpool_rgpusm_component.output,
419+
"RGPUSM: About to delete all the unused entries in the cache");
420+
while (mca_mpool_rgpusm_deregister_lru(mpool)) {
421+
remNum++;
422+
}
423+
opal_output_verbose(40, mca_mpool_rgpusm_component.output,
424+
"RGPUSM: Deleted and deregistered %d entries", remNum);
425+
rc = mpool->rcache->rcache_insert(mpool->rcache, (mca_mpool_base_registration_t *)rgpusm_reg,
426+
mca_mpool_rgpusm_component.rcache_size_limit);
427+
} else {
428+
/* Retry the insert after the first removal; while it still reports no room, evict another entry and retry */
429+
while((rc = mpool->rcache->rcache_insert(mpool->rcache, (mca_mpool_base_registration_t *)rgpusm_reg,
430+
mca_mpool_rgpusm_component.rcache_size_limit)) ==
431+
OPAL_ERR_TEMP_OUT_OF_RESOURCE) {
432+
opal_output_verbose(40, mca_mpool_rgpusm_component.output,
433+
"RGPUSM: No room in the cache - boot one out");
434+
if (!mca_mpool_rgpusm_deregister_lru(mpool)) {
435+
break;
436+
}
437+
}
415438
}
416439
}
417440

0 commit comments

Comments
 (0)