Skip to content

Commit 9d92075

Browse files
hjelmn authored and bosilca committed
btl/self: rewrite to decrease memory usage (#2307)
This commit rewrites much of the btl/self component to fix a long-standing memory usage bug. Before this commit the prepare_src path would always allocate a max send fragment (256kB). This caused the rank to allocate 32 * 256k useless buffers from one send. This commit makes the following changes:
- Add the MCA_BTL_FLAGS_GET flag by default. No reason not to set it.
- Reduce the eager limit, max send size, buffers per allocation, and maximum buffer count per fragment size. These changes should have no noticeable effect on performance but should greatly reduce the memory usage of the component.
- Implement the sendi function. This should reduce self send latency somewhat.
- Rewrite prepare_src to never allocate an eager or max send fragment for contiguous data.
- add_procs needs to return something in the peer array for the proc self, not just set the reachability bit. Now stores (void *) 1.
- Various cleanups. Removed an unused file.
Signed-off-by: Nathan Hjelm <[email protected]>
1 parent 83e3323 commit 9d92075

File tree

6 files changed

+237
-431
lines changed

6 files changed

+237
-431
lines changed

opal/mca/btl/self/btl_self.c

Lines changed: 132 additions & 144 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
* Copyright (c) 2004-2005 The Regents of the University of California.
1212
* All rights reserved.
1313
* Copyright (c) 2012-2013 Inria. All rights reserved.
14-
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
14+
* Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights
1515
* reserved.
1616
* $COPYRIGHT$
1717
*
@@ -24,68 +24,55 @@
2424

2525
#include <string.h>
2626
#include <stdlib.h>
27-
#include <sys/types.h>
28-
#include <sys/stat.h>
29-
#include <fcntl.h>
30-
#include <errno.h>
3127

3228
#include "opal/class/opal_bitmap.h"
3329
#include "opal/datatype/opal_convertor.h"
34-
#include "opal/sys/atomic.h"
35-
#include "opal/mca/btl/btl.h"
36-
#include "opal/mca/mpool/base/base.h"
3730
#include "btl_self.h"
3831
#include "btl_self_frag.h"
3932
#include "opal/util/proc.h"
4033

41-
static int mca_btl_self_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
42-
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
43-
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
44-
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
45-
46-
static int mca_btl_self_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
47-
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
48-
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
49-
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
50-
51-
mca_btl_base_module_t mca_btl_self = {
52-
.btl_component = &mca_btl_self_component.super,
53-
.btl_add_procs = mca_btl_self_add_procs,
54-
.btl_del_procs = mca_btl_self_del_procs,
55-
.btl_finalize = mca_btl_self_finalize,
56-
.btl_alloc = mca_btl_self_alloc,
57-
.btl_free = mca_btl_self_free,
58-
.btl_prepare_src = mca_btl_self_prepare_src,
59-
.btl_send = mca_btl_self_send,
60-
.btl_put = mca_btl_self_put,
61-
.btl_get = mca_btl_self_get,
62-
.btl_dump = mca_btl_base_dump,
63-
.btl_ft_event = mca_btl_self_ft_event,
64-
};
65-
66-
67-
int mca_btl_self_add_procs( struct mca_btl_base_module_t* btl,
68-
size_t nprocs,
69-
struct opal_proc_t **procs,
70-
struct mca_btl_base_endpoint_t **peers,
71-
opal_bitmap_t* reachability )
34+
/**
35+
* PML->BTL notification of change in the process list.
36+
* PML->BTL Notification that a receive fragment has been matched.
37+
* Called for a message that is sent from a process with the virtual
38+
* address of the shared memory segment being different than that of
39+
* the receiver.
40+
*
41+
* @param btl (IN)
42+
* @param proc (IN)
43+
* @param peer (OUT)
44+
* @return OPAL_SUCCESS or error status on failure.
45+
*
46+
*/
47+
static int mca_btl_self_add_procs (struct mca_btl_base_module_t *btl, size_t nprocs,
48+
struct opal_proc_t **procs,
49+
struct mca_btl_base_endpoint_t **peers,
50+
opal_bitmap_t* reachability)
7251
{
73-
int i;
74-
75-
for( i = 0; i < (int)nprocs; i++ ) {
52+
for (int i = 0; i < (int)nprocs; i++ ) {
7653
if( 0 == opal_compare_proc(procs[i]->proc_name, OPAL_PROC_MY_NAME) ) {
7754
opal_bitmap_set_bit( reachability, i );
55+
/* need to return something to keep the bml from ignoring us */
56+
peers[i] = (struct mca_btl_base_endpoint_t *) 1;
7857
break; /* there will always be only one ... */
7958
}
8059
}
60+
8161
return OPAL_SUCCESS;
8262
}
8363

84-
85-
int mca_btl_self_del_procs( struct mca_btl_base_module_t* btl,
86-
size_t nprocs,
87-
struct opal_proc_t **procs,
88-
struct mca_btl_base_endpoint_t **peers )
64+
/**
65+
* PML->BTL notification of change in the process list.
66+
*
67+
* @param btl (IN) BTL instance
68+
* @param proc (IN) Peer process
69+
* @param peer (IN) Peer addressing information.
70+
* @return Status indicating if cleanup was successful
71+
*
72+
*/
73+
static int mca_btl_self_del_procs (struct mca_btl_base_module_t *btl, size_t nprocs,
74+
struct opal_proc_t **procs,
75+
struct mca_btl_base_endpoint_t **peers)
8976
{
9077
return OPAL_SUCCESS;
9178
}
@@ -104,7 +91,7 @@ int mca_btl_self_del_procs( struct mca_btl_base_module_t* btl,
10491
*
10592
*/
10693

107-
int mca_btl_self_finalize(struct mca_btl_base_module_t* btl)
94+
static int mca_btl_self_finalize(struct mca_btl_base_module_t* btl)
10895
{
10996
return OPAL_SUCCESS;
11097
}
@@ -116,29 +103,29 @@ int mca_btl_self_finalize(struct mca_btl_base_module_t* btl)
116103
* @param btl (IN) BTL module
117104
* @param size (IN) Request segment size.
118105
*/
119-
mca_btl_base_descriptor_t* mca_btl_self_alloc(
120-
struct mca_btl_base_module_t* btl,
121-
struct mca_btl_base_endpoint_t* endpoint,
122-
uint8_t order,
123-
size_t size,
124-
uint32_t flags)
106+
static mca_btl_base_descriptor_t *mca_btl_self_alloc (struct mca_btl_base_module_t *btl,
107+
struct mca_btl_base_endpoint_t *endpoint,
108+
uint8_t order, size_t size, uint32_t flags)
125109
{
126-
mca_btl_self_frag_t* frag = NULL;
110+
mca_btl_self_frag_t *frag = NULL;
127111

128-
if(size <= mca_btl_self.btl_eager_limit) {
112+
if (size <= MCA_BTL_SELF_MAX_INLINE_SIZE) {
113+
MCA_BTL_SELF_FRAG_ALLOC_RDMA(frag);
114+
} else if (size <= mca_btl_self.btl_eager_limit) {
129115
MCA_BTL_SELF_FRAG_ALLOC_EAGER(frag);
130116
} else if (size <= btl->btl_max_send_size) {
131117
MCA_BTL_SELF_FRAG_ALLOC_SEND(frag);
132118
}
119+
133120
if( OPAL_UNLIKELY(NULL == frag) ) {
134121
return NULL;
135122
}
136123

137-
frag->segment.seg_len = size;
138-
frag->base.des_flags = flags;
139-
frag->base.des_segments = &(frag->segment);
124+
frag->segments[0].seg_len = size;
140125
frag->base.des_segment_count = 1;
141-
return (mca_btl_base_descriptor_t*)frag;
126+
frag->base.des_flags = flags;
127+
128+
return &frag->base;
142129
}
143130

144131
/**
@@ -147,90 +134,57 @@ mca_btl_base_descriptor_t* mca_btl_self_alloc(
147134
* @param btl (IN) BTL module
148135
* @param segment (IN) Allocated segment.
149136
*/
150-
int mca_btl_self_free( struct mca_btl_base_module_t* btl,
151-
mca_btl_base_descriptor_t* des )
137+
static int mca_btl_self_free (struct mca_btl_base_module_t *btl, mca_btl_base_descriptor_t *des)
152138
{
153-
mca_btl_self_frag_t* frag = (mca_btl_self_frag_t*)des;
154-
155-
frag->base.des_segments = NULL;
156-
frag->base.des_segment_count = 0;
139+
MCA_BTL_SELF_FRAG_RETURN((mca_btl_self_frag_t *) des);
157140

158-
if(frag->size == mca_btl_self.btl_eager_limit) {
159-
MCA_BTL_SELF_FRAG_RETURN_EAGER(frag);
160-
} else if (frag->size == mca_btl_self.btl_max_send_size) {
161-
MCA_BTL_SELF_FRAG_RETURN_SEND(frag);
162-
} else {
163-
MCA_BTL_SELF_FRAG_RETURN_RDMA(frag);
164-
}
165141
return OPAL_SUCCESS;
166142
}
167143

168144

169145
/**
170-
* Prepare data for send/put
146+
* Prepare data for send
171147
*
172148
* @param btl (IN) BTL module
173149
*/
174-
struct mca_btl_base_descriptor_t*
175-
mca_btl_self_prepare_src( struct mca_btl_base_module_t* btl,
176-
struct mca_btl_base_endpoint_t* endpoint,
177-
struct opal_convertor_t* convertor,
178-
uint8_t order,
179-
size_t reserve,
180-
size_t* size,
181-
uint32_t flags )
150+
static struct mca_btl_base_descriptor_t *mca_btl_self_prepare_src (struct mca_btl_base_module_t* btl,
151+
struct mca_btl_base_endpoint_t *endpoint,
152+
struct opal_convertor_t *convertor,
153+
uint8_t order, size_t reserve,
154+
size_t *size, uint32_t flags)
182155
{
183-
mca_btl_self_frag_t* frag;
184-
struct iovec iov;
185-
uint32_t iov_count = 1;
186-
size_t max_data = *size;
187-
int rc;
188-
189-
/* non-contiguous data */
190-
if( opal_convertor_need_buffers(convertor) ||
191-
max_data < mca_btl_self.btl_max_send_size ||
192-
reserve != 0 ) {
156+
bool inline_send = !opal_convertor_need_buffers(convertor);
157+
size_t buffer_len = reserve + (inline_send ? 0 : *size);
158+
mca_btl_self_frag_t *frag;
193159

194-
MCA_BTL_SELF_FRAG_ALLOC_SEND(frag);
195-
if(OPAL_UNLIKELY(NULL == frag)) {
196-
return NULL;
197-
}
160+
frag = (mca_btl_self_frag_t *) mca_btl_self_alloc (btl, endpoint, order, buffer_len, flags);
161+
if (OPAL_UNLIKELY(NULL == frag)) {
162+
return NULL;
163+
}
198164

199-
if(reserve + max_data > frag->size) {
200-
max_data = frag->size - reserve;
201-
}
202-
iov.iov_len = max_data;
203-
iov.iov_base = (IOVBASE_TYPE*)((unsigned char*)(frag+1) + reserve);
165+
/* non-contiguous data */
166+
if (OPAL_UNLIKELY(!inline_send)) {
167+
struct iovec iov = {.iov_len = *size, .iov_base = (IOVBASE_TYPE *) ((uintptr_t) frag->data + reserve)};
168+
size_t max_data = *size;
169+
uint32_t iov_count = 1;
170+
int rc;
204171

205-
rc = opal_convertor_pack(convertor, &iov, &iov_count, &max_data );
172+
rc = opal_convertor_pack (convertor, &iov, &iov_count, &max_data);
206173
if(rc < 0) {
207-
MCA_BTL_SELF_FRAG_RETURN_SEND(frag);
174+
mca_btl_self_free (btl, &frag->base);
208175
return NULL;
209176
}
210-
frag->segment.seg_addr.pval = frag+1;
211-
frag->segment.seg_len = reserve + max_data;
177+
212178
*size = max_data;
213179
} else {
214-
MCA_BTL_SELF_FRAG_ALLOC_RDMA(frag);
215-
if(OPAL_UNLIKELY(NULL == frag)) {
216-
return NULL;
217-
}
218-
iov.iov_len = max_data;
219-
iov.iov_base = NULL;
180+
void *data_ptr;
220181

221-
/* convertor should return offset into users buffer */
222-
rc = opal_convertor_pack(convertor, &iov, &iov_count, &max_data );
223-
if(rc < 0) {
224-
MCA_BTL_SELF_FRAG_RETURN_RDMA(frag);
225-
return NULL;
226-
}
227-
frag->segment.seg_addr.lval = (uint64_t)(uintptr_t) iov.iov_base;
228-
frag->segment.seg_len = max_data;
229-
*size = max_data;
182+
opal_convertor_get_current_pointer (convertor, &data_ptr);
183+
184+
frag->segments[1].seg_addr.pval = data_ptr;
185+
frag->segments[1].seg_len = *size;
186+
frag->base.des_segment_count = 2;
230187
}
231-
frag->base.des_flags = flags;
232-
frag->base.des_segments = &frag->segment;
233-
frag->base.des_segment_count = 1;
234188

235189
return &frag->base;
236190
}
@@ -242,10 +196,10 @@ mca_btl_self_prepare_src( struct mca_btl_base_module_t* btl,
242196
* @param peer (IN) BTL peer addressing
243197
*/
244198

245-
int mca_btl_self_send( struct mca_btl_base_module_t* btl,
246-
struct mca_btl_base_endpoint_t* endpoint,
247-
struct mca_btl_base_descriptor_t* des,
248-
mca_btl_base_tag_t tag )
199+
static int mca_btl_self_send (struct mca_btl_base_module_t *btl,
200+
struct mca_btl_base_endpoint_t *endpoint,
201+
struct mca_btl_base_descriptor_t *des,
202+
mca_btl_base_tag_t tag)
249203
{
250204
mca_btl_active_message_callback_t* reg;
251205
int btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
@@ -264,6 +218,39 @@ int mca_btl_self_send( struct mca_btl_base_module_t* btl,
264218
return 1;
265219
}
266220

221+
static int mca_btl_self_sendi (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
222+
struct opal_convertor_t *convertor, void *header, size_t header_size,
223+
size_t payload_size, uint8_t order, uint32_t flags, mca_btl_base_tag_t tag,
224+
mca_btl_base_descriptor_t **descriptor)
225+
{
226+
mca_btl_base_descriptor_t *frag;
227+
228+
if (!payload_size || !opal_convertor_need_buffers(convertor)) {
229+
void *data_ptr = NULL;
230+
if (payload_size) {
231+
opal_convertor_get_current_pointer (convertor, &data_ptr);
232+
}
233+
234+
mca_btl_base_segment_t segments[2] = {{.seg_addr.pval = header, .seg_len = header_size},
235+
{.seg_addr.pval = data_ptr, .seg_len = payload_size}};
236+
mca_btl_base_descriptor_t des = {.des_segments = segments, .des_segment_count = payload_size ? 2 : 1,
237+
.des_flags = 0};
238+
239+
(void) mca_btl_self_send (btl, endpoint, &des, tag);
240+
return OPAL_SUCCESS;
241+
}
242+
243+
frag = mca_btl_self_prepare_src (btl, endpoint, convertor, order, header_size, &payload_size,
244+
flags | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
245+
if (NULL == frag) {
246+
*descriptor = NULL;
247+
return OPAL_ERR_OUT_OF_RESOURCE;
248+
}
249+
250+
memcpy (frag->des_segments[0].seg_addr.pval, header, header_size);
251+
(void) mca_btl_self_send (btl, endpoint, frag, tag);
252+
return OPAL_SUCCESS;
253+
}
267254

268255
static int mca_btl_self_put (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address,
269256
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
@@ -289,22 +276,23 @@ static int mca_btl_self_get (mca_btl_base_module_t *btl, struct mca_btl_base_end
289276
return OPAL_SUCCESS;
290277
}
291278

292-
int mca_btl_self_ft_event(int state) {
293-
if(OPAL_CRS_CHECKPOINT == state) {
294-
;
295-
}
296-
else if(OPAL_CRS_CONTINUE == state) {
297-
;
298-
}
299-
else if(OPAL_CRS_RESTART == state) {
300-
;
301-
}
302-
else if(OPAL_CRS_TERM == state ) {
303-
;
304-
}
305-
else {
306-
;
307-
}
308-
279+
static int mca_btl_self_ft_event(int state) {
309280
return OPAL_SUCCESS;
310281
}
282+
283+
/* btl self module */
284+
mca_btl_base_module_t mca_btl_self = {
285+
.btl_component = &mca_btl_self_component.super,
286+
.btl_add_procs = mca_btl_self_add_procs,
287+
.btl_del_procs = mca_btl_self_del_procs,
288+
.btl_finalize = mca_btl_self_finalize,
289+
.btl_alloc = mca_btl_self_alloc,
290+
.btl_free = mca_btl_self_free,
291+
.btl_prepare_src = mca_btl_self_prepare_src,
292+
.btl_send = mca_btl_self_send,
293+
.btl_sendi = mca_btl_self_sendi,
294+
.btl_put = mca_btl_self_put,
295+
.btl_get = mca_btl_self_get,
296+
.btl_dump = mca_btl_base_dump,
297+
.btl_ft_event = mca_btl_self_ft_event,
298+
};

0 commit comments

Comments
 (0)