3
3
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
4
4
* University Research and Technology
5
5
* Corporation. All rights reserved.
6
- * Copyright (c) 2004-2016 The University of Tennessee and The University
6
+ * Copyright (c) 2004-2019 The University of Tennessee and The University
7
7
* of Tennessee Research Foundation. All rights
8
8
* reserved.
9
9
* Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
53
53
#endif /* defined(CHECKSUM) */
54
54
55
55
56
- #define IOVEC_MEM_LIMIT 8192
57
-
58
56
/* the contig versions does not use the stack. They can easily retrieve
59
57
* the status with just the informations from pConvertor->bConverted.
60
58
*/
@@ -68,9 +66,8 @@ opal_pack_homogeneous_contig_function( opal_convertor_t* pConv,
68
66
unsigned char * source_base = NULL ;
69
67
uint32_t iov_count ;
70
68
size_t length = pConv -> local_size - pConv -> bConverted , initial_amount = pConv -> bConverted ;
71
- ptrdiff_t initial_displ = pConv -> use_desc -> desc [pConv -> use_desc -> used ].end_loop .first_elem_disp ;
72
69
73
- source_base = (pConv -> pBaseBuf + initial_displ + pStack [0 ].disp + pStack [1 ].disp );
70
+ source_base = (pConv -> pBaseBuf + pConv -> pDesc -> true_lb + pStack [0 ].disp + pStack [1 ].disp );
74
71
75
72
/* There are some optimizations that can be done if the upper level
76
73
* does not provide a buffer.
@@ -111,155 +108,116 @@ opal_pack_homogeneous_contig_with_gaps_function( opal_convertor_t* pConv,
111
108
uint32_t * out_size ,
112
109
size_t * max_data )
113
110
{
111
+ size_t remaining , length , initial_bytes_converted = pConv -> bConverted ;
114
112
const opal_datatype_t * pData = pConv -> pDesc ;
115
113
dt_stack_t * stack = pConv -> pStack ;
114
+ ptrdiff_t extent = pData -> ub - pData -> lb ;
116
115
unsigned char * user_memory , * packed_buffer ;
117
- uint32_t iov_count , index ;
116
+ uint32_t idx ;
118
117
size_t i ;
119
- size_t bConverted , remaining , length , initial_bytes_converted = pConv -> bConverted ;
120
- ptrdiff_t extent = pData -> ub - pData -> lb ;
121
- ptrdiff_t initial_displ = pConv -> use_desc -> desc [pConv -> use_desc -> used ].end_loop .first_elem_disp ;
122
118
119
+ /* The memory layout is contiguous with gaps in the begining and at the end. The datatype true_lb
120
+ * is the initial displacement, the size the length of the contiguous area and the extent represent
121
+ * how much we should jump between elements.
122
+ */
123
123
assert ( (pData -> flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) && ((ptrdiff_t )pData -> size != extent ) );
124
124
DO_DEBUG ( opal_output ( 0 , "pack_homogeneous_contig( pBaseBuf %p, iov_count %d )\n" ,
125
125
(void * )pConv -> pBaseBuf , * out_size ); );
126
126
if ( stack [1 ].type != opal_datatype_uint1 .id ) {
127
127
stack [1 ].count *= opal_datatype_basicDatatypes [stack [1 ].type ]-> size ;
128
- stack [1 ].type = opal_datatype_uint1 .id ;
128
+ stack [1 ].type = opal_datatype_uint1 .id ;
129
+ }
130
+ /* We can provide directly the pointers in the user buffers (like the convertor_raw) */
131
+ if ( NULL == iov [0 ].iov_base ) {
132
+ user_memory = pConv -> pBaseBuf + pData -> true_lb ;
133
+
134
+ for ( idx = 0 ; (idx < (* out_size )) && stack [0 ].count ; idx ++ ) {
135
+ iov [idx ].iov_base = user_memory + stack [0 ].disp + stack [1 ].disp ;
136
+ iov [idx ].iov_len = stack [1 ].count ;
137
+ COMPUTE_CSUM ( iov [idx ].iov_base , iov [idx ].iov_len , pConv );
138
+
139
+ pConv -> bConverted += stack [1 ].count ;
140
+
141
+ stack [0 ].disp += extent ;
142
+ stack [0 ].count -- ;
143
+ stack [1 ].disp = 0 ;
144
+ stack [1 ].count = pData -> size ; /* we might need this to update the partial
145
+ * length for the first iteration */
146
+ }
147
+ goto update_status_and_return ;
129
148
}
130
149
131
- /* There are some optimizations that can be done if the upper level
132
- * does not provide a buffer.
133
- */
134
- for ( iov_count = 0 ; iov_count < (* out_size ); iov_count ++ ) {
150
+ for ( idx = 0 ; idx < (* out_size ); idx ++ ) {
135
151
/* Limit the amount of packed data to the data left over on this convertor */
136
152
remaining = pConv -> local_size - pConv -> bConverted ;
137
153
if ( 0 == remaining ) break ; /* we're done this time */
138
- if ( remaining > iov [iov_count ].iov_len )
139
- remaining = iov [iov_count ].iov_len ;
140
- packed_buffer = (unsigned char * )iov [iov_count ].iov_base ;
141
- bConverted = remaining ; /* how much will get unpacked this time */
142
- user_memory = pConv -> pBaseBuf + initial_displ + stack [0 ].disp + stack [1 ].disp ;
143
- i = pConv -> count - stack [0 ].count ; /* how many we already packed */
144
- assert (i == (pConv -> bConverted / pData -> size ));
145
-
146
- if ( packed_buffer == NULL ) {
147
- /* special case for small data. We avoid allocating memory if we
148
- * can fill the iovec directly with the address of the remaining
149
- * data.
150
- */
151
- if ( stack -> count < (size_t )((* out_size ) - iov_count ) ) {
152
- stack [1 ].count = pData -> size - (pConv -> bConverted % pData -> size );
153
- for ( index = iov_count ; i < pConv -> count ; i ++ , index ++ ) {
154
- iov [index ].iov_base = (IOVBASE_TYPE * ) user_memory ;
155
- iov [index ].iov_len = stack [1 ].count ;
156
- stack [0 ].disp += extent ;
157
- pConv -> bConverted += stack [1 ].count ;
158
- stack [1 ].disp = 0 ; /* reset it for the next round */
159
- stack [1 ].count = pData -> size ;
160
- user_memory = pConv -> pBaseBuf + initial_displ + stack [0 ].disp ;
161
- COMPUTE_CSUM ( iov [index ].iov_base , iov [index ].iov_len , pConv );
162
- }
163
- * out_size = iov_count + index ;
164
- * max_data = (pConv -> bConverted - initial_bytes_converted );
165
- pConv -> flags |= CONVERTOR_COMPLETED ;
166
- return 1 ; /* we're done */
167
- }
168
- /* now special case for big contiguous data with gaps around */
169
- if ( pData -> size >= IOVEC_MEM_LIMIT ) {
170
- /* as we dont have to copy any data, we can simply fill the iovecs
171
- * with data from the user data description.
172
- */
173
- for ( index = iov_count ; (i < pConv -> count ) && (index < (* out_size ));
174
- i ++ , index ++ ) {
175
- if ( remaining < pData -> size ) {
176
- iov [index ].iov_base = (IOVBASE_TYPE * ) user_memory ;
177
- iov [index ].iov_len = remaining ;
178
- remaining = 0 ;
179
- COMPUTE_CSUM ( iov [index ].iov_base , iov [index ].iov_len , pConv );
180
- break ;
181
- } else {
182
- iov [index ].iov_base = (IOVBASE_TYPE * ) user_memory ;
183
- iov [index ].iov_len = pData -> size ;
184
- user_memory += extent ;
185
- COMPUTE_CSUM ( iov [index ].iov_base , (size_t )iov [index ].iov_len , pConv );
186
- }
187
- remaining -= iov [index ].iov_len ;
188
- pConv -> bConverted += iov [index ].iov_len ;
189
- }
190
- * out_size = index ;
191
- * max_data = (pConv -> bConverted - initial_bytes_converted );
192
- if ( pConv -> bConverted == pConv -> local_size ) {
193
- pConv -> flags |= CONVERTOR_COMPLETED ;
194
- return 1 ;
195
- }
196
- return 0 ;
154
+ if ( remaining > iov [idx ].iov_len )
155
+ remaining = iov [idx ].iov_len ;
156
+ packed_buffer = (unsigned char * )iov [idx ].iov_base ;
157
+ pConv -> bConverted += remaining ;
158
+ user_memory = pConv -> pBaseBuf + pData -> true_lb + stack [0 ].disp + stack [1 ].disp ;
159
+
160
+ DO_DEBUG ( opal_output ( 0 , "pack_homogeneous_contig( user_memory %p, packed_buffer %p length %" PRIsize_t "\n" ,
161
+ (void * )user_memory , (void * )packed_buffer , remaining ); );
162
+
163
+ length = (0 == pConv -> stack_pos ? 0 : stack [1 ].count ); /* left over from the last pack */
164
+ /* data left from last round and enough space in the buffer */
165
+ if ( (pData -> size != length ) && (length <= remaining )) {
166
+ /* copy the partial left-over from the previous round */
167
+ OPAL_DATATYPE_SAFEGUARD_POINTER ( user_memory , length , pConv -> pBaseBuf ,
168
+ pData , pConv -> count );
169
+ DO_DEBUG ( opal_output ( 0 , "pack dest %p src %p length %" PRIsize_t " [prologue]\n" ,
170
+ (void * )user_memory , (void * )packed_buffer , length ); );
171
+ MEMCPY_CSUM ( packed_buffer , user_memory , length , pConv );
172
+ packed_buffer += length ;
173
+ remaining -= length ;
174
+ stack [1 ].count -= length ;
175
+ stack [1 ].disp += length ; /* just in case, we overwrite this below */
176
+ if ( 0 == stack [1 ].count ) { /* one completed element */
177
+ stack [0 ].count -- ;
178
+ stack [0 ].disp += extent ;
179
+ if ( 0 == stack [0 ].count ) /* not yet done */
180
+ break ;
181
+ stack [1 ].count = pData -> size ;
182
+ stack [1 ].disp = 0 ;
197
183
}
184
+ user_memory = pConv -> pBaseBuf + pData -> true_lb + stack [0 ].disp + stack [1 ].disp ;
198
185
}
199
186
200
- {
201
- DO_DEBUG ( opal_output ( 0 , "pack_homogeneous_contig( user_memory %p, packed_buffer %p length %lu\n" ,
202
- (void * )user_memory , (void * )packed_buffer , (unsigned long )remaining ); );
203
-
204
- length = (0 == pConv -> stack_pos ? 0 : stack [1 ].count ); /* left over from the last pack */
205
- /* data left from last round and enough space in the buffer */
206
- if ( (0 != length ) && (length <= remaining )) {
207
- /* copy the partial left-over from the previous round */
208
- OPAL_DATATYPE_SAFEGUARD_POINTER ( user_memory , length , pConv -> pBaseBuf ,
209
- pData , pConv -> count );
210
- DO_DEBUG ( opal_output ( 0 , "2. pack dest %p src %p length %lu\n" ,
211
- (void * )user_memory , (void * )packed_buffer , (unsigned long )length ); );
212
- MEMCPY_CSUM ( packed_buffer , user_memory , length , pConv );
213
- packed_buffer += length ;
214
- user_memory += (extent - pData -> size + length );
215
- remaining -= length ;
216
- stack [1 ].count -= length ;
217
- if ( 0 == stack [1 ].count ) { /* one completed element */
218
- stack [0 ].count -- ;
219
- stack [0 ].disp += extent ;
220
- if ( 0 != stack [0 ].count ) { /* not yet done */
221
- stack [1 ].count = pData -> size ;
222
- stack [1 ].disp = 0 ;
223
- }
224
- }
225
- }
226
- for ( i = 0 ; pData -> size <= remaining ; i ++ ) {
227
- OPAL_DATATYPE_SAFEGUARD_POINTER ( user_memory , pData -> size , pConv -> pBaseBuf ,
228
- pData , pConv -> count );
229
- DO_DEBUG ( opal_output ( 0 , "3. pack dest %p src %p length %lu\n" ,
230
- (void * )user_memory , (void * )packed_buffer , (unsigned long )pData -> size ); );
231
- MEMCPY_CSUM ( packed_buffer , user_memory , pData -> size , pConv );
232
- packed_buffer += pData -> size ;
233
- user_memory += extent ;
234
- remaining -= pData -> size ;
235
- }
236
- stack [0 ].count -= i ; /* the filled up and the entire types */
237
- stack [0 ].disp += (i * extent );
238
- stack [1 ].disp += remaining ;
239
- /* Copy the last bits */
240
- if ( 0 != remaining ) {
241
- OPAL_DATATYPE_SAFEGUARD_POINTER ( user_memory , remaining , pConv -> pBaseBuf ,
242
- pData , pConv -> count );
243
- DO_DEBUG ( opal_output ( 0 , "4. pack dest %p src %p length %lu\n" ,
244
- (void * )user_memory , (void * )packed_buffer , (unsigned long )remaining ); );
245
- MEMCPY_CSUM ( packed_buffer , user_memory , remaining , pConv );
246
- user_memory += remaining ;
247
- stack [1 ].count -= remaining ;
248
- }
187
+ for ( i = 0 ; pData -> size <= remaining ; i ++ ) {
188
+ OPAL_DATATYPE_SAFEGUARD_POINTER ( user_memory , pData -> size , pConv -> pBaseBuf ,
189
+ pData , pConv -> count );
190
+ DO_DEBUG ( opal_output ( 0 , "pack dest %p src %p length %" PRIsize_t " [%" PRIsize_t "/%" PRIsize_t "\n" ,
191
+ (void * )user_memory , (void * )packed_buffer , pData -> size , remaining , iov [idx ].iov_len ); );
192
+ MEMCPY_CSUM ( packed_buffer , user_memory , pData -> size , pConv );
193
+ packed_buffer += pData -> size ;
194
+ user_memory += extent ;
195
+ remaining -= pData -> size ;
196
+ }
197
+ stack [0 ].count -= i ; /* the entire datatype copied above */
198
+ stack [0 ].disp += (i * extent );
199
+
200
+ /* Copy the last bits */
201
+ if ( 0 != remaining ) {
202
+ OPAL_DATATYPE_SAFEGUARD_POINTER ( user_memory , remaining , pConv -> pBaseBuf ,
203
+ pData , pConv -> count );
204
+ DO_DEBUG ( opal_output ( 0 , "4. pack dest %p src %p length %" PRIsize_t "\n" ,
205
+ (void * )user_memory , (void * )packed_buffer , remaining ); );
206
+ MEMCPY_CSUM ( packed_buffer , user_memory , remaining , pConv );
207
+ stack [1 ].count -= remaining ;
208
+ stack [1 ].disp += remaining ; /* keep the += in case we are copying less that the datatype size */
249
209
if ( 0 == stack [1 ].count ) { /* prepare for the next element */
250
210
stack [1 ].count = pData -> size ;
251
211
stack [1 ].disp = 0 ;
252
212
}
253
213
}
254
- pConv -> bConverted += bConverted ;
255
214
}
256
- * out_size = iov_count ;
257
- * max_data = (pConv -> bConverted - initial_bytes_converted );
258
- if ( pConv -> bConverted == pConv -> local_size ) {
259
- pConv -> flags |= CONVERTOR_COMPLETED ;
260
- return 1 ;
261
- }
262
- return 0 ;
215
+
216
+ update_status_and_return :
217
+ * out_size = idx ;
218
+ * max_data = pConv -> bConverted - initial_bytes_converted ;
219
+ if ( pConv -> bConverted == pConv -> local_size ) pConv -> flags |= CONVERTOR_COMPLETED ;
220
+ return !!(pConv -> flags & CONVERTOR_COMPLETED ); /* done or not */
263
221
}
264
222
265
223
/* The pack/unpack functions need a cleanup. I have to create a proper interface to access
0 commit comments