Commit d242a95

reduce the communication volume during MPI_File_set_view
1 parent 13c6b24 commit d242a95

File tree

1 file changed: +78, -167 lines changed

ompi/mca/io/ompio/io_ompio_file_set_view.c

Lines changed: 78 additions & 167 deletions
@@ -154,12 +154,10 @@ int mca_io_ompio_set_view_internal(mca_io_ompio_file_t *fh,
         free(contg_groups);
         return OMPI_ERROR;
     }
-    if( !( (fh->f_comm->c_flags & OMPI_COMM_CART) &&
-           (num_groups == 1 || num_groups == fh->f_size)) ) {
-        mca_io_ompio_finalize_initial_grouping(fh,
-                                               num_groups,
-                                               contg_groups);
-    }
+
+    mca_io_ompio_finalize_initial_grouping(fh,
+                                           num_groups,
+                                           contg_groups);
     for( i = 0; i < fh->f_size; i++){
         free(contg_groups[i].procs_in_contg_group);
     }
@@ -230,7 +228,7 @@ int mca_io_ompio_file_get_view (struct ompi_file_t *fp,
 
 OMPI_MPI_OFFSET_TYPE get_contiguous_chunk_size (mca_io_ompio_file_t *fh)
 {
-    int uniform = 0, global_uniform = 0;
+    int uniform = 0;
     OMPI_MPI_OFFSET_TYPE avg[3] = {0,0,0};
     OMPI_MPI_OFFSET_TYPE global_avg[3] = {0,0,0};
     int i = 0;
@@ -267,6 +265,10 @@ OMPI_MPI_OFFSET_TYPE get_contiguous_chunk_size (mca_io_ompio_file_t *fh)
     global_avg[0] = global_avg[0]/fh->f_size;
     global_avg[1] = global_avg[1]/fh->f_size;
 
+#if 0
+    /* Disabling the feature since we are not using it anyway. Saves us one allreduce operation. */
+    int global_uniform=0;
+
     if ( global_avg[0] == avg[0] &&
          global_avg[1] == avg[1] &&
          0 == avg[2] &&
@@ -292,7 +294,7 @@ OMPI_MPI_OFFSET_TYPE get_contiguous_chunk_size (mca_io_ompio_file_t *fh)
         /* yes, everybody agrees on having a uniform file view */
         fh->f_flags |= OMPIO_UNIFORM_FVIEW;
     }
-
+#endif
     return global_avg[0];
 }
 

@@ -319,84 +321,64 @@ int mca_io_ompio_fview_based_grouping(mca_io_ompio_file_t *fh,
     }
     start_offset_len[2] = fh->f_rank;
 
-    if( OMPIO_ROOT == fh->f_rank){
-        start_offsets_lens = (OMPI_MPI_OFFSET_TYPE* )malloc (3 * fh->f_size * sizeof(OMPI_MPI_OFFSET_TYPE));
-        if (NULL == start_offsets_lens) {
-            opal_output (1, "OUT OF MEMORY\n");
-            return OMPI_ERR_OUT_OF_RESOURCE;
-        }
-        end_offsets = (OMPI_MPI_OFFSET_TYPE* )malloc (fh->f_size * sizeof(OMPI_MPI_OFFSET_TYPE));
-        if (NULL == end_offsets) {
-            opal_output (1, "OUT OF MEMORY\n");
-            free(start_offsets_lens);
-            return OMPI_ERR_OUT_OF_RESOURCE;
-        }
-
+    start_offsets_lens = (OMPI_MPI_OFFSET_TYPE* )malloc (3 * fh->f_size * sizeof(OMPI_MPI_OFFSET_TYPE));
+    if (NULL == start_offsets_lens) {
+        opal_output (1, "OUT OF MEMORY\n");
+        return OMPI_ERR_OUT_OF_RESOURCE;
     }
-    //Gather start offsets across processes in a group on aggregator
-    fh->f_comm->c_coll.coll_gather (start_offset_len,
-                                    3,
-                                    OMPI_OFFSET_DATATYPE,
-                                    start_offsets_lens,
-                                    3,
-                                    OMPI_OFFSET_DATATYPE,
-                                    OMPIO_ROOT,
-                                    fh->f_comm,
-                                    fh->f_comm->c_coll.coll_gather_module);
-
+    end_offsets = (OMPI_MPI_OFFSET_TYPE* )malloc (fh->f_size * sizeof(OMPI_MPI_OFFSET_TYPE));
+    if (NULL == end_offsets) {
+        opal_output (1, "OUT OF MEMORY\n");
+        free(start_offsets_lens);
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+
+    //Allgather start offsets across processes in a group on aggregator
+    fh->f_comm->c_coll.coll_allgather (start_offset_len,
+                                       3,
+                                       OMPI_OFFSET_DATATYPE,
+                                       start_offsets_lens,
+                                       3,
+                                       OMPI_OFFSET_DATATYPE,
+                                       fh->f_comm,
+                                       fh->f_comm->c_coll.coll_allgather_module);
+
     //Calculate contg chunk size and contg subgroups
-    if(OMPIO_ROOT == fh->f_rank){
-        for( k = 0 ; k < fh->f_size; k++){
-            end_offsets[k] = start_offsets_lens[3*k] + start_offsets_lens[3*k+1];
-            contg_groups[k].contg_chunk_size = 0;
+    for( k = 0 ; k < fh->f_size; k++){
+        end_offsets[k] = start_offsets_lens[3*k] + start_offsets_lens[3*k+1];
+        contg_groups[k].contg_chunk_size = 0;
+    }
+    k = 0;
+    while( k < fh->f_size){
+        if( k == 0){
+            contg_groups[p].contg_chunk_size += start_offsets_lens[3*k+1];
+            contg_groups[p].procs_in_contg_group[g] = start_offsets_lens[3*k + 2];
+            g++;
+            contg_groups[p].procs_per_contg_group = g;
+            k++;
         }
-        k = 0;
-        while( k < fh->f_size){
-            if( k == 0){
-                contg_groups[p].contg_chunk_size += start_offsets_lens[3*k+1];
-                contg_groups[p].procs_in_contg_group[g] = start_offsets_lens[3*k + 2];
-                g++;
-                contg_groups[p].procs_per_contg_group = g;
-                k++;
-            }
-            else if( start_offsets_lens[3*k] == end_offsets[k - 1] ){
-                contg_groups[p].contg_chunk_size += start_offsets_lens[3*k+1];
-                contg_groups[p].procs_in_contg_group[g] = start_offsets_lens[3*k + 2];
-                g++;
-                contg_groups[p].procs_per_contg_group = g;
-                k++;
-            }
-            else{
-                p++;
-                g = 0;
-                contg_groups[p].contg_chunk_size += start_offsets_lens[3*k+1];
-                contg_groups[p].procs_in_contg_group[g] = start_offsets_lens[3*k + 2];
-                g++;
-                contg_groups[p].procs_per_contg_group = g;
-                k++;
-            }
+        else if( start_offsets_lens[3*k] == end_offsets[k - 1] ){
+            contg_groups[p].contg_chunk_size += start_offsets_lens[3*k+1];
+            contg_groups[p].procs_in_contg_group[g] = start_offsets_lens[3*k + 2];
+            g++;
+            contg_groups[p].procs_per_contg_group = g;
+            k++;
+        }
+        else{
+            p++;
+            g = 0;
+            contg_groups[p].contg_chunk_size += start_offsets_lens[3*k+1];
+            contg_groups[p].procs_in_contg_group[g] = start_offsets_lens[3*k + 2];
+            g++;
+            contg_groups[p].procs_per_contg_group = g;
+            k++;
         }
-
-        *num_groups = p+1;
-        if (NULL != start_offsets_lens) {
-            free (start_offsets_lens);
-            start_offsets_lens = NULL;
-        }
-        if (NULL != end_offsets) {
-            free (end_offsets);
-            end_offsets = NULL;
-        }
     }
-
-    //bcast num_groups to all procs
-    fh->f_comm->c_coll.coll_bcast (num_groups,
-                                   1,
-                                   MPI_INT,
-                                   OMPIO_ROOT,
-                                   fh->f_comm,
-                                   fh->f_comm->c_coll.coll_bcast_module);
-
-
+
+    *num_groups = p+1;
+    free (start_offsets_lens);
+    free (end_offsets);
+
     return OMPI_SUCCESS;
 }
 
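In mca_io_ompio_fview_based_grouping, the root-only coll_gather of each rank's (start offset, length, rank) triple, together with the later coll_bcast of num_groups, is replaced by a single coll_allgather: every rank now holds the complete offset table and derives the contiguous groups locally. A minimal sketch of the pattern in plain MPI (function and variable names are illustrative, not the OMPIO-internal c_coll interface):

#include <mpi.h>
#include <stdlib.h>

/* Every rank contributes its (start offset, length, rank) triple and every
 * rank receives the full table, so the grouping can be computed locally and
 * no separate broadcast of the derived result is needed afterwards. */
int gather_offset_table(MPI_Comm comm, long long my_triple[3],
                        long long **table_out)
{
    int nprocs;
    MPI_Comm_size(comm, &nprocs);

    long long *table = malloc(3 * (size_t)nprocs * sizeof(long long));
    if (NULL == table) {
        return -1;                /* out of memory */
    }

    /* One allgather instead of a gather to the root plus a later bcast. */
    MPI_Allgather(my_triple, 3, MPI_LONG_LONG,
                  table,     3, MPI_LONG_LONG, comm);

    *table_out = table;
    return 0;
}

The trade-off is that every rank now allocates the full table and runs the grouping loop redundantly, in exchange for one collective round instead of a gather followed by a broadcast.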

@@ -407,105 +389,34 @@ int mca_io_ompio_finalize_initial_grouping(mca_io_ompio_file_t *fh,
 
     int z = 0;
     int y = 0;
-    int r = 0;
-
-    MPI_Request *sendreq = NULL , *req = NULL;
-
-
-    req = (MPI_Request *)malloc (2* sizeof(MPI_Request));
-    if (NULL == req) {
-        return OMPI_ERR_OUT_OF_RESOURCE;
-    }
 
     fh->f_init_num_aggrs = num_groups;
     fh->f_init_aggr_list = (int*)malloc (fh->f_init_num_aggrs * sizeof(int));
     if (NULL == fh->f_init_aggr_list) {
         opal_output (1, "OUT OF MEMORY\n");
-        free(req);
         return OMPI_ERR_OUT_OF_RESOURCE;
     }
 
-    if(OMPIO_ROOT == fh->f_rank){
-        sendreq = (MPI_Request *)malloc ( 2 *fh->f_size * sizeof(MPI_Request));
-        if (NULL == sendreq) {
-            free(req);
-            return OMPI_ERR_OUT_OF_RESOURCE;
-        }
-
-        for( z = 0 ;z < num_groups; z++){
-            for( y = 0; y < contg_groups[z].procs_per_contg_group; y++){
-                MCA_PML_CALL(isend(&contg_groups[z].procs_per_contg_group,
-                                   1,
-                                   MPI_INT,
-                                   contg_groups[z].procs_in_contg_group[y],
-                                   OMPIO_PROCS_PER_GROUP_TAG,
-                                   MCA_PML_BASE_SEND_STANDARD,
-                                   fh->f_comm,
-                                   &sendreq[r++]));
-
-                //send initial grouping distribution to all processes in the group
-                MCA_PML_CALL(isend(contg_groups[z].procs_in_contg_group,
-                                   contg_groups[z].procs_per_contg_group,
-                                   MPI_INT,
-                                   contg_groups[z].procs_in_contg_group[y],
-                                   OMPIO_PROCS_IN_GROUP_TAG,
-                                   MCA_PML_BASE_SEND_STANDARD,
-                                   fh->f_comm,
-                                   &sendreq[r++]));
-            }
-        }
-    }
-
-    //All processes receive initial procs per group from OMPIO_ROOT
-    MCA_PML_CALL(irecv(&fh->f_init_procs_per_group,
-                       1,
-                       MPI_INT,
-                       OMPIO_ROOT,
-                       OMPIO_PROCS_PER_GROUP_TAG,
-                       fh->f_comm,
-                       &req[0]));
-
-    ompi_request_wait (&req[0], MPI_STATUS_IGNORE);
-    fh->f_init_procs_in_group = (int*)malloc (fh->f_init_procs_per_group * sizeof(int));
-    if (NULL == fh->f_init_procs_in_group) {
-        opal_output (1, "OUT OF MEMORY\n");
-        free(req);
-        if (NULL != sendreq) {
-            free(sendreq);
+    for( z = 0 ;z < num_groups; z++){
+        for( y = 0; y < contg_groups[z].procs_per_contg_group; y++){
+            if ( fh->f_rank == contg_groups[z].procs_in_contg_group[y] ) {
+                fh->f_init_procs_per_group = contg_groups[z].procs_per_contg_group;
+                fh->f_init_procs_in_group = (int*)malloc (fh->f_init_procs_per_group * sizeof(int));
+                if (NULL == fh->f_init_procs_in_group) {
+                    opal_output (1, "OUT OF MEMORY\n");
+                    return OMPI_ERR_OUT_OF_RESOURCE;
+                }
+                memcpy ( fh->f_init_procs_in_group, contg_groups[z].procs_in_contg_group,
+                         contg_groups[z].procs_per_contg_group * sizeof (int));
+
+            }
         }
-        return OMPI_ERR_OUT_OF_RESOURCE;
     }
-    //All processes receive initial process distribution from OMPIO_ROOT
-    MCA_PML_CALL(irecv(fh->f_init_procs_in_group,
-                       fh->f_init_procs_per_group,
-                       MPI_INT,
-                       OMPIO_ROOT,
-                       OMPIO_PROCS_IN_GROUP_TAG,
-                       fh->f_comm,
-                       &req[1]));
-
-    ompi_request_wait (&req[1], MPI_STATUS_IGNORE);
-    free (req);
-    if(OMPIO_ROOT == fh->f_rank){
-        ompi_request_wait_all (r, sendreq, MPI_STATUSES_IGNORE);
-        free (sendreq);
-    }
-
 
-    /*set initial aggregator list */
-    //OMPIO_ROOT broadcasts aggr list
-    if(OMPIO_ROOT == fh->f_rank){
-        for( z = 0 ;z < num_groups; z++){
-            fh->f_init_aggr_list[z] = contg_groups[z].procs_in_contg_group[0];
-        }
+    for( z = 0 ;z < num_groups; z++){
+        fh->f_init_aggr_list[z] = contg_groups[z].procs_in_contg_group[0];
     }
 
-    fh->f_comm->c_coll.coll_bcast (fh->f_init_aggr_list,
-                                   num_groups,
-                                   MPI_INT,
-                                   OMPIO_ROOT,
-                                   fh->f_comm,
-                                   fh->f_comm->c_coll.coll_bcast_module);
 
     return OMPI_SUCCESS;
 }
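Since the grouping result is now replicated on every rank, mca_io_ompio_finalize_initial_grouping no longer needs the isend/irecv distribution from OMPIO_ROOT or the final coll_bcast of the aggregator list: each process reads its own group membership and the aggregators directly out of contg_groups. A small sketch of that local-lookup pattern (types and names are illustrative, not the OMPIO structures):

/* With the group table replicated on every rank, each process scans it for
 * its own rank instead of receiving the result from a root process. */
typedef struct {
    int  nmembers;   /* number of processes in this contiguous group */
    int *members;    /* ranks belonging to this group                */
} group_t;

/* Returns the index of the group containing myrank, or -1 if not found. */
static int find_my_group(const group_t *groups, int ngroups, int myrank)
{
    for (int z = 0; z < ngroups; z++) {
        for (int y = 0; y < groups[z].nmembers; y++) {
            if (myrank == groups[z].members[y]) {
                return z;        /* no point-to-point message needed */
            }
        }
    }
    return -1;
}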
