Skip to content

Commit 46ddf54

Browse files
committed
Clean and sync the pack and unpack functions.
- optimize handling of contiguous with gaps datatypes. - fixes a performance issue for all datatypes with a count of 1. - optimize the pack/unpack of contiguous with gaps datatype. - optimize the case of blocklen == 1 Signed-off-by: George Bosilca <[email protected]>
1 parent d335eea commit 46ddf54

8 files changed

+314
-351
lines changed

opal/datatype/opal_convertor_raw.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,8 @@
3131
#endif /* OPAL_ENABLE_DEBUG */
3232

3333
/* Take a new iovec (base + len) and try to merge it with what we already
34-
* have. If we succeed return 0 and move forward, if not save it into a new
35-
* iovec location. If we need to go to a new position and we reach the end
34+
* have. If we succeed return 0 and move forward, otherwise save it into a new
35+
* iovec location. If we need to advance position and we reach the end
3636
* of the iovec array, return 1 to signal we did not saved the last iovec.
3737
*/
3838
static inline int
@@ -46,7 +46,7 @@ opal_convertor_merge_iov( struct iovec* iov, uint32_t* iov_count,
4646
return 0;
4747
} /* cannot merge, move to the next position */
4848
*idx = *idx + 1;
49-
if( *idx == *iov_count ) return 1; /* do not overwrite outside the iove array boundaries */
49+
if( *idx == *iov_count ) return 1; /* do not overwrite outside the iovec array boundaries */
5050
}
5151
iov[*idx].iov_base = base;
5252
iov[*idx].iov_len = len;

opal/datatype/opal_datatype_copy.h

Lines changed: 14 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,9 @@ static inline void _predefined_data( const dt_elem_desc_t* ELEM,
5151
const ddt_elem_desc_t* _elem = &((ELEM)->elem);
5252
unsigned char* _source = (SOURCE) + _elem->disp;
5353
unsigned char* _destination = (DESTINATION) + _elem->disp;
54-
size_t total_count = _elem->count * _elem->blocklen;
55-
size_t do_now, do_now_bytes;
54+
size_t do_now = _elem->count, do_now_bytes;
5655

57-
assert( (COUNT) == total_count);
58-
assert( total_count <= ((*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size) );
56+
assert( (COUNT) == (do_now * _elem->blocklen));
5957

6058
/* We don't a prologue and epilogue here as we are __always__ working
6159
* with full copies of the data description.
@@ -64,21 +62,19 @@ static inline void _predefined_data( const dt_elem_desc_t* ELEM,
6462
/**
6563
* Compute how many full blocklen we need to do and do them.
6664
*/
67-
do_now = _elem->count;
68-
if( 0 != do_now ) {
69-
do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size;
70-
for(size_t _i = 0; _i < do_now; _i++ ) {
71-
OPAL_DATATYPE_SAFEGUARD_POINTER( _source, do_now_bytes, (SOURCE_BASE),
72-
(DATATYPE), (TOTAL_COUNT) );
73-
DO_DEBUG( opal_output( 0, "copy %s( %p, %p, %" PRIsize_t " ) => space %" PRIsize_t "\n",
74-
STRINGIFY(MEM_OP_NAME), (void*)_destination, (void*)_source, do_now_bytes, *(SPACE) ); );
75-
MEM_OP( _destination, _source, do_now_bytes );
76-
_destination += _elem->extent;
77-
_source += _elem->extent;
78-
*(SPACE) -= do_now_bytes;
79-
}
80-
(COUNT) -= total_count;
65+
do_now_bytes = _elem->blocklen * opal_datatype_basicDatatypes[_elem->common.type]->size;
66+
assert( (do_now * do_now_bytes) <= (*SPACE) );
67+
68+
for(size_t _i = 0; _i < do_now; _i++ ) {
69+
OPAL_DATATYPE_SAFEGUARD_POINTER( _source, do_now_bytes, (SOURCE_BASE),
70+
(DATATYPE), (TOTAL_COUNT) );
71+
DO_DEBUG( opal_output( 0, "copy %s( %p, %p, %" PRIsize_t " ) => space %" PRIsize_t "\n",
72+
STRINGIFY(MEM_OP_NAME), (void*)_destination, (void*)_source, do_now_bytes, *(SPACE) - _i * do_now_bytes ); );
73+
MEM_OP( _destination, _source, do_now_bytes );
74+
_destination += _elem->extent;
75+
_source += _elem->extent;
8176
}
77+
*(SPACE) -= (do_now_bytes * do_now);
8278
}
8379

8480
static inline void _contiguous_loop( const dt_elem_desc_t* ELEM,

opal/datatype/opal_datatype_module.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,6 @@ int32_t opal_datatype_init( void )
252252
OPAL_DATATYPE_FLAG_CONTIGUOUS |
253253
OPAL_DATATYPE_FLAG_NO_GAPS;
254254
datatype->desc.desc[0].elem.common.type = i;
255-
/* datatype->desc.desc[0].elem.blocklen XXX not set at the moment, it will be needed later */
256255
datatype->desc.desc[0].elem.count = 1;
257256
datatype->desc.desc[0].elem.blocklen = 1;
258257
datatype->desc.desc[0].elem.disp = 0;

opal/datatype/opal_datatype_pack.c

Lines changed: 88 additions & 130 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2004-2016 The University of Tennessee and The University
6+
* Copyright (c) 2004-2019 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
@@ -53,8 +53,6 @@
5353
#endif /* defined(CHECKSUM) */
5454

5555

56-
#define IOVEC_MEM_LIMIT 8192
57-
5856
/* the contig versions does not use the stack. They can easily retrieve
5957
* the status with just the informations from pConvertor->bConverted.
6058
*/
@@ -68,9 +66,8 @@ opal_pack_homogeneous_contig_function( opal_convertor_t* pConv,
6866
unsigned char *source_base = NULL;
6967
uint32_t iov_count;
7068
size_t length = pConv->local_size - pConv->bConverted, initial_amount = pConv->bConverted;
71-
ptrdiff_t initial_displ = pConv->use_desc->desc[pConv->use_desc->used].end_loop.first_elem_disp;
7269

73-
source_base = (pConv->pBaseBuf + initial_displ + pStack[0].disp + pStack[1].disp);
70+
source_base = (pConv->pBaseBuf + pConv->pDesc->true_lb + pStack[0].disp + pStack[1].disp);
7471

7572
/* There are some optimizations that can be done if the upper level
7673
* does not provide a buffer.
@@ -111,155 +108,116 @@ opal_pack_homogeneous_contig_with_gaps_function( opal_convertor_t* pConv,
111108
uint32_t* out_size,
112109
size_t* max_data )
113110
{
111+
size_t remaining, length, initial_bytes_converted = pConv->bConverted;
114112
const opal_datatype_t* pData = pConv->pDesc;
115113
dt_stack_t* stack = pConv->pStack;
114+
ptrdiff_t extent = pData->ub - pData->lb;
116115
unsigned char *user_memory, *packed_buffer;
117-
uint32_t iov_count, index;
116+
uint32_t idx;
118117
size_t i;
119-
size_t bConverted, remaining, length, initial_bytes_converted = pConv->bConverted;
120-
ptrdiff_t extent= pData->ub - pData->lb;
121-
ptrdiff_t initial_displ = pConv->use_desc->desc[pConv->use_desc->used].end_loop.first_elem_disp;
122118

119+
/* The memory layout is contiguous with gaps in the begining and at the end. The datatype true_lb
120+
* is the initial displacement, the size the length of the contiguous area and the extent represent
121+
* how much we should jump between elements.
122+
*/
123123
assert( (pData->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) && ((ptrdiff_t)pData->size != extent) );
124124
DO_DEBUG( opal_output( 0, "pack_homogeneous_contig( pBaseBuf %p, iov_count %d )\n",
125125
(void*)pConv->pBaseBuf, *out_size ); );
126126
if( stack[1].type != opal_datatype_uint1.id ) {
127127
stack[1].count *= opal_datatype_basicDatatypes[stack[1].type]->size;
128-
stack[1].type = opal_datatype_uint1.id;
128+
stack[1].type = opal_datatype_uint1.id;
129+
}
130+
/* We can provide directly the pointers in the user buffers (like the convertor_raw) */
131+
if( NULL == iov[0].iov_base ) {
132+
user_memory = pConv->pBaseBuf + pData->true_lb;
133+
134+
for( idx = 0; (idx < (*out_size)) && stack[0].count; idx++ ) {
135+
iov[idx].iov_base = user_memory + stack[0].disp + stack[1].disp;
136+
iov[idx].iov_len = stack[1].count;
137+
COMPUTE_CSUM( iov[idx].iov_base, iov[idx].iov_len, pConv );
138+
139+
pConv->bConverted += stack[1].count;
140+
141+
stack[0].disp += extent;
142+
stack[0].count--;
143+
stack[1].disp = 0;
144+
stack[1].count = pData->size; /* we might need this to update the partial
145+
* length for the first iteration */
146+
}
147+
goto update_status_and_return;
129148
}
130149

131-
/* There are some optimizations that can be done if the upper level
132-
* does not provide a buffer.
133-
*/
134-
for( iov_count = 0; iov_count < (*out_size); iov_count++ ) {
150+
for( idx = 0; idx < (*out_size); idx++ ) {
135151
/* Limit the amount of packed data to the data left over on this convertor */
136152
remaining = pConv->local_size - pConv->bConverted;
137153
if( 0 == remaining ) break; /* we're done this time */
138-
if( remaining > iov[iov_count].iov_len )
139-
remaining = iov[iov_count].iov_len;
140-
packed_buffer = (unsigned char *)iov[iov_count].iov_base;
141-
bConverted = remaining; /* how much will get unpacked this time */
142-
user_memory = pConv->pBaseBuf + initial_displ + stack[0].disp + stack[1].disp;
143-
i = pConv->count - stack[0].count; /* how many we already packed */
144-
assert(i == (pConv->bConverted / pData->size));
145-
146-
if( packed_buffer == NULL ) {
147-
/* special case for small data. We avoid allocating memory if we
148-
* can fill the iovec directly with the address of the remaining
149-
* data.
150-
*/
151-
if( stack->count < (size_t)((*out_size) - iov_count) ) {
152-
stack[1].count = pData->size - (pConv->bConverted % pData->size);
153-
for( index = iov_count; i < pConv->count; i++, index++ ) {
154-
iov[index].iov_base = (IOVBASE_TYPE *) user_memory;
155-
iov[index].iov_len = stack[1].count;
156-
stack[0].disp += extent;
157-
pConv->bConverted += stack[1].count;
158-
stack[1].disp = 0; /* reset it for the next round */
159-
stack[1].count = pData->size;
160-
user_memory = pConv->pBaseBuf + initial_displ + stack[0].disp;
161-
COMPUTE_CSUM( iov[index].iov_base, iov[index].iov_len, pConv );
162-
}
163-
*out_size = iov_count + index;
164-
*max_data = (pConv->bConverted - initial_bytes_converted);
165-
pConv->flags |= CONVERTOR_COMPLETED;
166-
return 1; /* we're done */
167-
}
168-
/* now special case for big contiguous data with gaps around */
169-
if( pData->size >= IOVEC_MEM_LIMIT ) {
170-
/* as we dont have to copy any data, we can simply fill the iovecs
171-
* with data from the user data description.
172-
*/
173-
for( index = iov_count; (i < pConv->count) && (index < (*out_size));
174-
i++, index++ ) {
175-
if( remaining < pData->size ) {
176-
iov[index].iov_base = (IOVBASE_TYPE *) user_memory;
177-
iov[index].iov_len = remaining;
178-
remaining = 0;
179-
COMPUTE_CSUM( iov[index].iov_base, iov[index].iov_len, pConv );
180-
break;
181-
} else {
182-
iov[index].iov_base = (IOVBASE_TYPE *) user_memory;
183-
iov[index].iov_len = pData->size;
184-
user_memory += extent;
185-
COMPUTE_CSUM( iov[index].iov_base, (size_t)iov[index].iov_len, pConv );
186-
}
187-
remaining -= iov[index].iov_len;
188-
pConv->bConverted += iov[index].iov_len;
189-
}
190-
*out_size = index;
191-
*max_data = (pConv->bConverted - initial_bytes_converted);
192-
if( pConv->bConverted == pConv->local_size ) {
193-
pConv->flags |= CONVERTOR_COMPLETED;
194-
return 1;
195-
}
196-
return 0;
154+
if( remaining > iov[idx].iov_len )
155+
remaining = iov[idx].iov_len;
156+
packed_buffer = (unsigned char *)iov[idx].iov_base;
157+
pConv->bConverted += remaining;
158+
user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp + stack[1].disp;
159+
160+
DO_DEBUG( opal_output( 0, "pack_homogeneous_contig( user_memory %p, packed_buffer %p length %" PRIsize_t "\n",
161+
(void*)user_memory, (void*)packed_buffer, remaining ); );
162+
163+
length = (0 == pConv->stack_pos ? 0 : stack[1].count); /* left over from the last pack */
164+
/* data left from last round and enough space in the buffer */
165+
if( (pData->size != length) && (length <= remaining)) {
166+
/* copy the partial left-over from the previous round */
167+
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, length, pConv->pBaseBuf,
168+
pData, pConv->count );
169+
DO_DEBUG( opal_output( 0, "pack dest %p src %p length %" PRIsize_t " [prologue]\n",
170+
(void*)user_memory, (void*)packed_buffer, length ); );
171+
MEMCPY_CSUM( packed_buffer, user_memory, length, pConv );
172+
packed_buffer += length;
173+
remaining -= length;
174+
stack[1].count -= length;
175+
stack[1].disp += length; /* just in case, we overwrite this below */
176+
if( 0 == stack[1].count) { /* one completed element */
177+
stack[0].count--;
178+
stack[0].disp += extent;
179+
if( 0 == stack[0].count ) /* not yet done */
180+
break;
181+
stack[1].count = pData->size;
182+
stack[1].disp = 0;
197183
}
184+
user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp + stack[1].disp;
198185
}
199186

200-
{
201-
DO_DEBUG( opal_output( 0, "pack_homogeneous_contig( user_memory %p, packed_buffer %p length %lu\n",
202-
(void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); );
203-
204-
length = (0 == pConv->stack_pos ? 0 : stack[1].count); /* left over from the last pack */
205-
/* data left from last round and enough space in the buffer */
206-
if( (0 != length) && (length <= remaining)) {
207-
/* copy the partial left-over from the previous round */
208-
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, length, pConv->pBaseBuf,
209-
pData, pConv->count );
210-
DO_DEBUG( opal_output( 0, "2. pack dest %p src %p length %lu\n",
211-
(void*)user_memory, (void*)packed_buffer, (unsigned long)length ); );
212-
MEMCPY_CSUM( packed_buffer, user_memory, length, pConv );
213-
packed_buffer += length;
214-
user_memory += (extent - pData->size + length);
215-
remaining -= length;
216-
stack[1].count -= length;
217-
if( 0 == stack[1].count) { /* one completed element */
218-
stack[0].count--;
219-
stack[0].disp += extent;
220-
if( 0 != stack[0].count ) { /* not yet done */
221-
stack[1].count = pData->size;
222-
stack[1].disp = 0;
223-
}
224-
}
225-
}
226-
for( i = 0; pData->size <= remaining; i++ ) {
227-
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, pData->size, pConv->pBaseBuf,
228-
pData, pConv->count );
229-
DO_DEBUG( opal_output( 0, "3. pack dest %p src %p length %lu\n",
230-
(void*)user_memory, (void*)packed_buffer, (unsigned long)pData->size ); );
231-
MEMCPY_CSUM( packed_buffer, user_memory, pData->size, pConv );
232-
packed_buffer += pData->size;
233-
user_memory += extent;
234-
remaining -= pData->size;
235-
}
236-
stack[0].count -= i; /* the filled up and the entire types */
237-
stack[0].disp += (i * extent);
238-
stack[1].disp += remaining;
239-
/* Copy the last bits */
240-
if( 0 != remaining ) {
241-
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining, pConv->pBaseBuf,
242-
pData, pConv->count );
243-
DO_DEBUG( opal_output( 0, "4. pack dest %p src %p length %lu\n",
244-
(void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); );
245-
MEMCPY_CSUM( packed_buffer, user_memory, remaining, pConv );
246-
user_memory += remaining;
247-
stack[1].count -= remaining;
248-
}
187+
for( i = 0; pData->size <= remaining; i++ ) {
188+
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, pData->size, pConv->pBaseBuf,
189+
pData, pConv->count );
190+
DO_DEBUG( opal_output( 0, "pack dest %p src %p length %" PRIsize_t " [%" PRIsize_t "/%" PRIsize_t "\n",
191+
(void*)user_memory, (void*)packed_buffer, pData->size, remaining, iov[idx].iov_len ); );
192+
MEMCPY_CSUM( packed_buffer, user_memory, pData->size, pConv );
193+
packed_buffer += pData->size;
194+
user_memory += extent;
195+
remaining -= pData->size;
196+
}
197+
stack[0].count -= i; /* the entire datatype copied above */
198+
stack[0].disp += (i * extent);
199+
200+
/* Copy the last bits */
201+
if( 0 != remaining ) {
202+
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining, pConv->pBaseBuf,
203+
pData, pConv->count );
204+
DO_DEBUG( opal_output( 0, "4. pack dest %p src %p length %" PRIsize_t "\n",
205+
(void*)user_memory, (void*)packed_buffer, remaining ); );
206+
MEMCPY_CSUM( packed_buffer, user_memory, remaining, pConv );
207+
stack[1].count -= remaining;
208+
stack[1].disp += remaining; /* keep the += in case we are copying less that the datatype size */
249209
if( 0 == stack[1].count ) { /* prepare for the next element */
250210
stack[1].count = pData->size;
251211
stack[1].disp = 0;
252212
}
253213
}
254-
pConv->bConverted += bConverted;
255214
}
256-
*out_size = iov_count;
257-
*max_data = (pConv->bConverted - initial_bytes_converted);
258-
if( pConv->bConverted == pConv->local_size ) {
259-
pConv->flags |= CONVERTOR_COMPLETED;
260-
return 1;
261-
}
262-
return 0;
215+
216+
update_status_and_return:
217+
*out_size = idx;
218+
*max_data = pConv->bConverted - initial_bytes_converted;
219+
if( pConv->bConverted == pConv->local_size ) pConv->flags |= CONVERTOR_COMPLETED;
220+
return !!(pConv->flags & CONVERTOR_COMPLETED); /* done or not */
263221
}
264222

265223
/* The pack/unpack functions need a cleanup. I have to create a proper interface to access

0 commit comments

Comments
 (0)