
Commit a652a19

btl/vader: reduce memory footprint when using xpmem

The vader btl kept a per-peer registration cache to keep track of attachments. This is not really a problem with small numbers of local ranks but can be a problem on large SMP machines. To reduce the footprint there is now one registration cache for all xpmem attachments. This will probably increase the lookup time for large transfers but it is a worthwhile trade-off.

Signed-off-by: Nathan Hjelm <[email protected]>
1 parent 8cc3f28 commit a652a19
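
The core of the change, visible in the diffs below, is that the component now owns a single mca_rcache_base_vma_module_t instead of one per endpoint, and every xpmem registration is tagged with the owning peer's rank (stashed in reg->alloc_base) so lookups and per-endpoint cleanup can filter by peer. The standalone C sketch below models that pattern with simplified, hypothetical types (reg_cache_t, registration_t, and a linked list stand in for the real rcache VMA tree); it is not Open MPI code, only an illustration of how one shared cache with a per-peer tag can replace N per-peer caches.

/* Simplified, hypothetical model of the shared-cache-with-peer-tag pattern.
 * The real code uses opal's rcache VMA module; a linked list stands in here. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct registration {
    struct registration *next;
    uintptr_t base, bound;   /* attached address range */
    int peer_rank;           /* owning peer (the diff stores this in reg->alloc_base) */
    int ref_count;
} registration_t;

typedef struct {
    registration_t *head;    /* one cache shared by all endpoints */
} reg_cache_t;

/* Look up a cached attachment that covers [base, bound) for one peer. */
static registration_t *cache_find (reg_cache_t *cache, int peer_rank,
                                   uintptr_t base, uintptr_t bound)
{
    for (registration_t *reg = cache->head ; reg ; reg = reg->next) {
        if (reg->peer_rank != peer_rank) {
            continue;               /* registration belongs to another peer */
        }
        if (base >= reg->base && bound <= reg->bound) {
            ++reg->ref_count;       /* reuse the existing attachment */
            return reg;
        }
    }
    return NULL;                    /* caller would attach and insert a new entry */
}

/* Drop every registration owned by one peer (endpoint teardown). */
static void cache_cleanup_peer (reg_cache_t *cache, int peer_rank)
{
    registration_t **prev = &cache->head;
    while (*prev) {
        registration_t *reg = *prev;
        if (reg->peer_rank == peer_rank) {
            *prev = reg->next;
            free (reg);             /* the real code detaches xpmem here */
        } else {
            prev = &reg->next;
        }
    }
}

int main (void)
{
    reg_cache_t cache = {.head = NULL};

    /* insert one fake registration for peer 3 covering [0x1000, 0x3000) */
    registration_t *reg = calloc (1, sizeof (*reg));
    reg->peer_rank = 3;
    reg->base = 0x1000;
    reg->bound = 0x3000;
    reg->ref_count = 1;
    cache.head = reg;

    printf ("hit: %p\n", (void *) cache_find (&cache, 3, 0x1800, 0x2000));
    printf ("miss (wrong peer): %p\n", (void *) cache_find (&cache, 4, 0x1800, 0x2000));

    cache_cleanup_peer (&cache, 3);
    printf ("after cleanup: %p\n", (void *) cache.head);
    return 0;
}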

File tree

5 files changed: +112 −84 lines


opal/mca/btl/vader/btl_vader.h

Lines changed: 2 additions & 0 deletions
@@ -48,6 +48,7 @@
 #include "opal/sys/atomic.h"
 #include "opal/mca/btl/btl.h"
 #include "opal/mca/rcache/rcache.h"
+#include "opal/mca/rcache/base/rcache_base_vma.h"
 #include "opal/mca/btl/base/base.h"
 #include "opal/mca/rcache/rcache.h"
 #include "opal/mca/rcache/base/base.h"
@@ -103,6 +104,7 @@ struct mca_btl_vader_component_t {
     int vader_free_list_inc; /**< number of elements to alloc when growing free lists */
 #if OPAL_BTL_VADER_HAVE_XPMEM
     xpmem_segid_t my_seg_id; /**< this rank's xpmem segment id */
+    mca_rcache_base_vma_module_t *vma_module; /**< registration cache for xpmem segments */
 #endif
     opal_shmem_ds_t seg_ds; /**< this rank's shared memory segment (when not using xpmem) */
 

opal/mca/btl/vader/btl_vader_endpoint.h

Lines changed: 0 additions & 2 deletions
@@ -28,7 +28,6 @@
 
 #include "opal_config.h"
 #include "btl_vader_xpmem.h"
-#include "opal/mca/rcache/base/rcache_base_vma.h"
 
 #define MCA_BTL_VADER_FBOX_ALIGNMENT 32
 #define MCA_BTL_VADER_FBOX_ALIGNMENT_MASK (MCA_BTL_VADER_FBOX_ALIGNMENT - 1)
@@ -75,7 +74,6 @@ typedef struct mca_btl_base_endpoint_t {
     union {
 #if OPAL_BTL_VADER_HAVE_XPMEM
         struct {
-            mca_rcache_base_vma_module_t *vma_module;
            xpmem_apid_t apid; /**< xpmem apid for remote peer */
         } xpmem;
 #endif

opal/mca/btl/vader/btl_vader_module.c

Lines changed: 13 additions & 22 deletions
@@ -145,6 +145,12 @@ static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n)
     /* set flag indicating btl has been inited */
     vader_btl->btl_inited = true;
 
+#if OPAL_BTL_VADER_HAVE_XPMEM
+    if (MCA_BTL_VADER_XPMEM == mca_btl_vader_component.single_copy_mechanism) {
+        mca_btl_vader_component.vma_module = mca_rcache_base_vma_module_alloc ();
+    }
+#endif
+
     return OPAL_SUCCESS;
 }
 
@@ -171,7 +177,6 @@ static int init_vader_endpoint (struct mca_btl_base_endpoint_t *ep, struct opal_
     if (MCA_BTL_VADER_XPMEM == mca_btl_vader_component.single_copy_mechanism) {
         /* always use xpmem if it is available */
         ep->segment_data.xpmem.apid = xpmem_get (modex->xpmem.seg_id, XPMEM_RDWR, XPMEM_PERMIT_MODE, (void *) 0666);
-        ep->segment_data.xpmem.vma_module = mca_rcache_base_vma_module_alloc ();
         (void) vader_get_registation (ep, modex->xpmem.segment_base, mca_btl_vader_component.segment_size,
                                       MCA_RCACHE_FLAGS_PERSIST, (void **) &ep->segment_base);
     } else {
@@ -354,6 +359,12 @@ static int vader_finalize(struct mca_btl_base_module_t *btl)
         opal_shmem_segment_detach (&mca_btl_vader_component.seg_ds);
     }
 
+#if OPAL_BTL_VADER_HAVE_XPMEM
+    if (NULL != mca_btl_vader_component.vma_module) {
+        OBJ_RELEASE(mca_btl_vader_component.vma_module);
+    }
+#endif
+
     return OPAL_SUCCESS;
 }
 
@@ -540,14 +551,6 @@ static void mca_btl_vader_endpoint_constructor (mca_btl_vader_endpoint_t *ep)
 }
 
 #if OPAL_BTL_VADER_HAVE_XPMEM
-static int mca_btl_vader_endpoint_rcache_cleanup (mca_rcache_base_registration_t *reg, void *ctx)
-{
-    mca_rcache_base_vma_module_t *vma_module = (mca_rcache_base_vma_module_t *) ctx;
-    /* otherwise dereg will fail on assert */
-    reg->ref_count = 0;
-    (void) mca_rcache_base_vma_delete (vma_module, reg);
-    return OPAL_SUCCESS;
-}
 #endif
 
 static void mca_btl_vader_endpoint_destructor (mca_btl_vader_endpoint_t *ep)
@@ -557,19 +560,7 @@ static void mca_btl_vader_endpoint_destructor (mca_btl_vader_endpoint_t *ep)
 
 #if OPAL_BTL_VADER_HAVE_XPMEM
     if (MCA_BTL_VADER_XPMEM == mca_btl_vader_component.single_copy_mechanism) {
-        if (ep->segment_data.xpmem.vma_module) {
-            /* clean out the registration cache */
-            (void) mca_rcache_base_vma_iterate (ep->segment_data.xpmem.vma_module,
-                                                NULL, (size_t) -1,
-                                                mca_btl_vader_endpoint_rcache_cleanup,
-                                                (void *) ep->segment_data.xpmem.vma_module);
-            OBJ_RELEASE(ep->segment_data.xpmem.vma_module);
-        }
-
-        if (ep->segment_base) {
-            xpmem_release (ep->segment_data.xpmem.apid);
-            ep->segment_data.xpmem.apid = 0;
-        }
+        mca_btl_vader_xpmem_cleanup_endpoint (ep);
     } else
 #endif
     if (ep->segment_data.other.seg_ds) {

opal/mca/btl/vader/btl_vader_xpmem.c

Lines changed: 95 additions & 60 deletions
@@ -32,118 +32,153 @@ int mca_btl_vader_xpmem_init (void)
     return OPAL_SUCCESS;
 }
 
+struct vader_check_reg_ctx_t {
+    mca_rcache_base_vma_module_t *vma_module;
+    mca_btl_base_endpoint_t *ep;
+    mca_rcache_base_registration_t **reg;
+    uintptr_t base;
+    uintptr_t bound;
+};
+typedef struct vader_check_reg_ctx_t vader_check_reg_ctx_t;
+
+static int vader_check_reg (mca_rcache_base_registration_t *reg, void *ctx)
+{
+    vader_check_reg_ctx_t *vader_ctx = (vader_check_reg_ctx_t *) ctx;
+
+    if ((intptr_t) reg->alloc_base != vader_ctx->ep->peer_smp_rank ||
+        (reg->flags & MCA_RCACHE_FLAGS_PERSIST)) {
+        /* ignore this registration */
+        return OPAL_SUCCESS;
+    }
+
+    vader_ctx->reg[0] = reg;
+
+    if (vader_ctx->bound <= (uintptr_t) reg->bound && vader_ctx->base >= (uintptr_t) reg->base) {
+        (void)opal_atomic_add (&reg->ref_count, 1);
+        return 1;
+    }
+
+    /* remove this pointer from the rcache and decrement its reference count
+       (so it is detached later) */
+    mca_rcache_base_vma_delete (vader_ctx->vma_module, reg);
+
+    return 2;
+}
+
 /* look up the remote pointer in the peer rcache and attach if
  * necessary */
 mca_rcache_base_registration_t *vader_get_registation (struct mca_btl_base_endpoint_t *ep, void *rem_ptr,
                                                        size_t size, int flags, void **local_ptr)
 {
-    mca_rcache_base_vma_module_t *vma_module = ep->segment_data.xpmem.vma_module;
-    mca_rcache_base_registration_t *regs[10], *reg = NULL;
+    mca_rcache_base_vma_module_t *vma_module = mca_btl_vader_component.vma_module;
+    uint64_t attach_align = 1 << mca_btl_vader_component.log_attach_align;
+    mca_rcache_base_registration_t *reg = NULL;
+    vader_check_reg_ctx_t check_ctx = {.ep = ep, .reg = &reg, .vma_module = vma_module};
     xpmem_addr_t xpmem_addr;
     uintptr_t base, bound;
-    uint64_t attach_align = 1 << mca_btl_vader_component.log_attach_align;
     int rc, i;
 
-    /* protect rcache access */
-    OPAL_THREAD_LOCK(&ep->lock);
-
-    /* use btl/self for self communication */
-    assert (ep->peer_smp_rank != MCA_BTL_VADER_LOCAL_RANK);
-
     base = OPAL_DOWN_ALIGN((uintptr_t) rem_ptr, attach_align, uintptr_t);
     bound = OPAL_ALIGN((uintptr_t) rem_ptr + size - 1, attach_align, uintptr_t) + 1;
     if (OPAL_UNLIKELY(bound > VADER_MAX_ADDRESS)) {
         bound = VADER_MAX_ADDRESS;
     }
 
-    /* several segments may match the base pointer */
-    rc = mca_rcache_base_vma_find_all (vma_module, (void *) base, bound - base, regs, 10);
-    for (i = 0 ; i < rc ; ++i) {
-        if (bound <= (uintptr_t)regs[i]->bound && base >= (uintptr_t)regs[i]->base) {
-            (void)opal_atomic_add (&regs[i]->ref_count, 1);
-            reg = regs[i];
-            goto reg_found;
-        }
-
-        if (regs[i]->flags & MCA_RCACHE_FLAGS_PERSIST) {
-            continue;
-        }
-
-        /* remove this pointer from the rcache and decrement its reference count
-           (so it is detached later) */
-        rc = mca_rcache_base_vma_delete (vma_module, regs[i]);
-        if (OPAL_UNLIKELY(0 != rc)) {
-            /* someone beat us to it? */
-            break;
-        }
+    check_ctx.base = base;
+    check_ctx.bound = bound;
 
+    /* several segments may match the base pointer */
+    rc = mca_rcache_base_vma_iterate (vma_module, (void *) base, bound - base, vader_check_reg, &check_ctx);
+    if (2 == rc) {
         /* start the new segment from the lower of the two bases */
-        base = (uintptr_t) regs[i]->base < base ? (uintptr_t) regs[i]->base : base;
-
-        (void)opal_atomic_add (&regs[i]->ref_count, -1);
+        base = (uintptr_t) reg->base < base ? (uintptr_t) reg->base : base;
 
-        if (OPAL_LIKELY(0 == regs[i]->ref_count)) {
+        if (OPAL_LIKELY(0 == opal_atomic_add_32 (&reg->ref_count, -1))) {
             /* this pointer is not in use */
-            (void) xpmem_detach (regs[i]->rcache_context);
-            OBJ_RELEASE(regs[i]);
+            (void) xpmem_detach (reg->rcache_context);
+            OBJ_RELEASE(reg);
         }
 
-        break;
+        reg = NULL;
     }
 
-    reg = OBJ_NEW(mca_rcache_base_registration_t);
-    if (OPAL_LIKELY(NULL != reg)) {
-        /* stick around for awhile */
-        reg->ref_count = 2;
-        reg->base = (unsigned char *) base;
-        reg->bound = (unsigned char *) bound;
-        reg->flags = flags;
+    if (NULL == reg) {
+        reg = OBJ_NEW(mca_rcache_base_registration_t);
+        if (OPAL_LIKELY(NULL != reg)) {
+            /* stick around for awhile */
+            reg->ref_count = 2;
+            reg->base = (unsigned char *) base;
+            reg->bound = (unsigned char *) bound;
+            reg->flags = flags;
+            reg->alloc_base = (void *) (intptr_t) ep->peer_smp_rank;
 
 #if defined(HAVE_SN_XPMEM_H)
-        xpmem_addr.id = ep->segment_data.xpmem.apid;
+            xpmem_addr.id = ep->segment_data.xpmem.apid;
 #else
-        xpmem_addr.apid = ep->segment_data.xpmem.apid;
+            xpmem_addr.apid = ep->segment_data.xpmem.apid;
 #endif
-        xpmem_addr.offset = base;
+            xpmem_addr.offset = base;
 
-        reg->rcache_context = xpmem_attach (xpmem_addr, bound - base, NULL);
-        if (OPAL_UNLIKELY((void *)-1 == reg->rcache_context)) {
-            OPAL_THREAD_UNLOCK(&ep->lock);
-            OBJ_RELEASE(reg);
-            return NULL;
-        }
+            reg->rcache_context = xpmem_attach (xpmem_addr, bound - base, NULL);
+            if (OPAL_UNLIKELY((void *)-1 == reg->rcache_context)) {
+                OBJ_RELEASE(reg);
+                return NULL;
+            }
 
-        opal_memchecker_base_mem_defined (reg->rcache_context, bound - base);
+            opal_memchecker_base_mem_defined (reg->rcache_context, bound - base);
 
-        mca_rcache_base_vma_insert (vma_module, reg, 0);
+            mca_rcache_base_vma_insert (vma_module, reg, 0);
+        }
     }
 
-reg_found:
     opal_atomic_wmb ();
     *local_ptr = (void *) ((uintptr_t) reg->rcache_context +
                            (ptrdiff_t)((uintptr_t) rem_ptr - (uintptr_t) reg->base));
 
-    OPAL_THREAD_UNLOCK(&ep->lock);
-
     return reg;
 }
 
 void vader_return_registration (mca_rcache_base_registration_t *reg, struct mca_btl_base_endpoint_t *ep)
 {
-    mca_rcache_base_vma_module_t *vma_module = ep->segment_data.xpmem.vma_module;
+    mca_rcache_base_vma_module_t *vma_module = mca_btl_vader_component.vma_module;
    int32_t ref_count;
 
     ref_count = opal_atomic_add_32 (&reg->ref_count, -1);
     if (OPAL_UNLIKELY(0 == ref_count && !(reg->flags & MCA_RCACHE_FLAGS_PERSIST))) {
         /* protect rcache access */
-        OPAL_THREAD_LOCK(&ep->lock);
         mca_rcache_base_vma_delete (vma_module, reg);
-        OPAL_THREAD_UNLOCK(&ep->lock);
 
         opal_memchecker_base_mem_noaccess (reg->rcache_context, (uintptr_t)(reg->bound - reg->base));
         (void)xpmem_detach (reg->rcache_context);
         OBJ_RELEASE (reg);
     }
 }
 
+static int mca_btl_vader_endpoint_xpmem_rcache_cleanup (mca_rcache_base_registration_t *reg, void *ctx)
+{
+    mca_rcache_base_vma_module_t *vma_module = mca_btl_vader_component.vma_module;
+    mca_btl_vader_endpoint_t *ep = (mca_btl_vader_endpoint_t *) ctx;
+    if ((intptr_t) reg->alloc_base == ep->peer_smp_rank) {
+        /* otherwise dereg will fail on assert */
+        reg->ref_count = 0;
+        (void) mca_rcache_base_vma_delete (vma_module, reg);
+        OBJ_RELEASE(reg);
+    }
+
+    return OPAL_SUCCESS;
+}
+
+void mca_btl_vader_xpmem_cleanup_endpoint (struct mca_btl_base_endpoint_t *ep)
+{
+    /* clean out the registration cache */
+    (void) mca_rcache_base_vma_iterate (mca_btl_vader_component.vma_module,
+                                        NULL, (size_t) -1,
+                                        mca_btl_vader_endpoint_xpmem_rcache_cleanup,
+                                        (void *) ep);
+    if (ep->segment_base) {
+        xpmem_release (ep->segment_data.xpmem.apid);
+        ep->segment_data.xpmem.apid = 0;
+    }
+}
+
 #endif /* OPAL_BTL_VADER_HAVE_XPMEM */
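
The rewritten lookup in vader_get_registation drives everything through mca_rcache_base_vma_iterate with a small context struct instead of mca_rcache_base_vma_find_all plus a loop over up to ten results. The sketch below is a simplified, hypothetical model of that callback protocol (the entry type and vma_iterate_model are stand-ins, not the rcache API); it assumes, as the diff's usage implies, that iteration stops at the first nonzero callback return and hands that value back to the caller: 0 means keep scanning, 1 means a covering registration was found and reused, and 2 means an overlapping stale entry was pulled out so a wider attachment can replace it.

/* Hypothetical model of the iterate-with-callback lookup used in the diff.
 * vma_iterate_model() stands in for mca_rcache_base_vma_iterate: it walks the
 * entries overlapping [base, bound) and stops at the first nonzero return. */
#include <stdint.h>
#include <stdio.h>

typedef struct {
    uintptr_t base, bound;
    int peer_rank;
    int persist;            /* MCA_RCACHE_FLAGS_PERSIST stand-in */
} entry_t;

typedef struct {
    int peer_rank;          /* which peer we are looking up */
    uintptr_t base, bound;  /* aligned range we need attached */
    entry_t *match;         /* out: covering or overlapping entry */
} check_ctx_t;

/* 0: ignore and continue, 1: covering entry found, 2: overlapping stale entry */
static int check_entry (entry_t *e, void *ctx)
{
    check_ctx_t *c = ctx;
    if (e->peer_rank != c->peer_rank || e->persist) {
        return 0;
    }
    c->match = e;
    if (c->base >= e->base && c->bound <= e->bound) {
        return 1;           /* reuse: the real code bumps ref_count here */
    }
    return 2;               /* replace: the real code deletes it from the cache */
}

static int vma_iterate_model (entry_t *entries, int n, uintptr_t base, uintptr_t bound,
                              int (*cb)(entry_t *, void *), void *ctx)
{
    for (int i = 0 ; i < n ; ++i) {
        if (entries[i].bound <= base || entries[i].base >= bound) {
            continue;       /* no overlap with the requested range */
        }
        int rc = cb (&entries[i], ctx);
        if (rc != 0) {
            return rc;      /* assumed semantics: stop and propagate */
        }
    }
    return 0;
}

int main (void)
{
    entry_t cache[] = {
        {0x1000, 0x2000, /* peer */ 1, 0},   /* covers only part of the request */
        {0x0000, 0x8000, /* peer */ 2, 0},   /* wrong peer, ignored */
    };
    check_ctx_t ctx = {.peer_rank = 1, .base = 0x1800, .bound = 0x3000};
    int rc = vma_iterate_model (cache, 2, ctx.base, ctx.bound, check_entry, &ctx);
    printf ("rc=%d (2 => widen and re-attach from base 0x%lx)\n",
            rc, (unsigned long) (ctx.match ? ctx.match->base : 0));
    return 0;
}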

opal/mca/btl/vader/btl_vader_xpmem.h

Lines changed: 2 additions & 0 deletions
@@ -39,13 +39,15 @@
 #define VADER_MAX_ADDRESS XPMEM_MAXADDR_SIZE
 #endif
 
+struct mca_btl_base_endpoint_t;
 
 int mca_btl_vader_xpmem_init (void);
 
 mca_rcache_base_registration_t *vader_get_registation (struct mca_btl_base_endpoint_t *endpoint, void *rem_ptr,
                                                        size_t size, int flags, void **local_ptr);
 
 void vader_return_registration (mca_rcache_base_registration_t *reg, struct mca_btl_base_endpoint_t *endpoint);
+void mca_btl_vader_xpmem_cleanup_endpoint (struct mca_btl_base_endpoint_t *ep);
 
 #else
 