Skip to content

Commit a7e3de6

Browse files
committed
coll-portals4: No more messages passed to Portals4 bigger than the limit given by PtlNIInit
1 parent 175e6aa commit a7e3de6

File tree

5 files changed

+191
-66
lines changed

5 files changed

+191
-66
lines changed

ompi/mca/coll/portals4/coll_portals4.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ struct mca_coll_portals4_component_t {
6565
opal_free_list_t requests; /* request free list for the i collectives */
6666

6767
ptl_ni_limits_t ni_limits;
68+
ptl_size_t portals_max_msg_size;
6869

6970
int use_binomial_gather_algorithm;
7071

ompi/mca/coll/portals4/coll_portals4_bcast.c

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -89,12 +89,20 @@ static int prepare_bcast_data (struct ompi_communicator_t *comm,
8989
}
9090

9191
/* Number of segments */
92-
request->u.bcast.segment_nb = (request->u.bcast.tmpsize > COLL_PORTALS4_MAX_BW) ?
93-
(((request->u.bcast.tmpsize + COLL_PORTALS4_MAX_BW -1) / COLL_PORTALS4_MAX_BW) < COLL_PORTALS4_MAX_SEGMENT ?
94-
((request->u.bcast.tmpsize + COLL_PORTALS4_MAX_BW -1) / COLL_PORTALS4_MAX_BW) :
95-
COLL_PORTALS4_MAX_SEGMENT) :
92+
{
93+
size_t max_msg_size = (COLL_PORTALS4_MAX_BW > mca_coll_portals4_component.ni_limits.max_msg_size) ?
94+
mca_coll_portals4_component.ni_limits.max_msg_size :
95+
COLL_PORTALS4_MAX_BW;
96+
97+
//TODO : Either make the Portals size limits and COLL_PORTALS4_MAX_SEGMENT compatible, or remove COLL_PORTALS4_MAX_SEGMENT
98+
request->u.bcast.segment_nb = (request->u.bcast.tmpsize > max_msg_size) ?
99+
(((request->u.bcast.tmpsize + max_msg_size -1) / max_msg_size) < COLL_PORTALS4_MAX_SEGMENT ?
100+
((request->u.bcast.tmpsize + max_msg_size -1) / max_msg_size) : COLL_PORTALS4_MAX_SEGMENT) :
96101
1;
97102

103+
OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
104+
"seg_number=%d , seg_size_max=%lu", request->u.bcast.segment_nb, max_msg_size));
105+
}
98106
if (request->u.bcast.segment_nb > COLL_PORTALS4_BCAST_ALGO_THRESHOLD) {
99107
request->u.bcast.algo = OMPI_COLL_PORTALS4_BCAST_PIPELINE_ALGO;
100108
}
@@ -361,6 +369,8 @@ bcast_kary_tree_top(void *buff, int count,
361369
/* Divide buffer into segments */
362370
if (seg <= nb_long) length = seg_size + 1;
363371
else length = seg_size;
372+
opal_output_verbose(10, ompi_coll_base_framework.framework_output,
373+
"bcast with k-ary tree : segment of size %ld", length);
364374

365375
/* compute the triggering threshold to send data to the children */
366376
trig_thr = segment_nb + seg - 1; /* To be sure the set of PtlTriggeredPut of DATA will be executed in order */
@@ -687,6 +697,8 @@ bcast_pipeline_top(void *buff, int count,
687697
/* Divide buffer into segments */
688698
if (seg <= nb_long) length = seg_size + 1;
689699
else length = seg_size;
700+
opal_output_verbose(10, ompi_coll_base_framework.framework_output,
701+
"bcast with pipeline : segment of size %ld \n", length);
690702

691703
/* compute the triggering threshold to send data to the children */
692704
trig_thr = segment_nb + seg - 1; /* To be sure the PtlTriggeredPut will be executed in order */

ompi/mca/coll/portals4/coll_portals4_component.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,16 @@ portals4_register(void)
211211
MCA_BASE_VAR_SCOPE_READONLY,
212212
&mca_coll_portals4_component.use_binomial_gather_algorithm);
213213

214+
mca_coll_portals4_component.portals_max_msg_size = PTL_SIZE_MAX;
215+
(void) mca_base_component_var_register(&mca_coll_portals4_component.super.collm_version,
216+
"max_msg_size",
217+
"Max size supported by portals4 (above that, a message is cut into messages less than that size)",
218+
MCA_BASE_VAR_TYPE_UNSIGNED_LONG,
219+
NULL, 0, 0,
220+
OPAL_INFO_LVL_9,
221+
MCA_BASE_VAR_SCOPE_READONLY,
222+
&mca_coll_portals4_component.portals_max_msg_size);
223+
214224
return OMPI_SUCCESS;
215225
}
216226

@@ -369,7 +379,13 @@ portals4_init_query(bool enable_progress_threads,
369379
__FILE__, __LINE__, ret);
370380
return OMPI_ERROR;
371381
}
382+
opal_output_verbose(10, ompi_coll_base_framework.framework_output,
383+
"ni_limits.max_atomic_size=%ld", mca_coll_portals4_component.ni_limits.max_atomic_size);
372384

385+
if (mca_coll_portals4_component.portals_max_msg_size < mca_coll_portals4_component.ni_limits.max_msg_size)
386+
mca_coll_portals4_component.ni_limits.max_msg_size = mca_coll_portals4_component.portals_max_msg_size;
387+
opal_output_verbose(10, ompi_coll_base_framework.framework_output,
388+
"ni_limits.max_msg_size=%lu", mca_coll_portals4_component.ni_limits.max_msg_size);
373389

374390
ret = PtlGetId(mca_coll_portals4_component.ni_h, &mca_coll_portals4_component.id);
375391
if (PTL_OK != ret) {

ompi/mca/coll/portals4/coll_portals4_gather.c

Lines changed: 98 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "coll_portals4.h"
2222
#include "coll_portals4_request.h"
2323

24+
#include <string.h> // included for ffs in get_tree_numdescendants_of
2425

2526
#undef RTR_USES_TRIGGERED_PUT
2627

@@ -55,6 +56,22 @@
5556
* |
5657
* 15
5758
*/
59+
60+
/*
 * Return the number of descendants attributed to virtual rank `vrank`
 * in a communicator of ompi_comm_size(comm) processes.
 *
 * comm   communicator whose size bounds the result
 * vrank  virtual rank (0 is treated as the root of the tree)
 *
 * Returns size - 1 for vrank 0; otherwise a candidate subtree size,
 * clamped so that it does not run past the last rank, minus one for
 * the node itself.
 *
 * NOTE(review): POSIX declares ffs() in <strings.h>, not <string.h>;
 * confirm the include that accompanies this function.
 */
static int32_t get_tree_numdescendants_of(struct ompi_communicator_t* comm,
61+
int vrank)
62+
{
63+
int max;
64+
int size = ompi_comm_size(comm);
65+
66+
/* The root's descendants are all the other ranks. */
if (0 == vrank) {
67+
return size - 1;
68+
} else {
69+
/*
 * ffs() yields the 1-based index of the least significant set bit
 * (0 when no bit is set), so max is 1 for vrank 1 and 2 for vrank 2.
 * NOTE(review): for vrank 4 this gives ffs(3) == 1, i.e. max == 2,
 * whereas the lowest set bit of vrank (4) may have been intended as
 * the subtree size -- verify against the tree built by
 * ompi_coll_portals4_build_in_order_bmtree.
 */
max = 1 << ffs(vrank - 1);
70+
/* Clamp to the ranks that exist, then drop the node itself. */
return ((vrank + max <= size ) ? max : size - vrank) -1;
71+
}
72+
73+
}
74+
5875
static ompi_coll_portals4_tree_t*
5976
ompi_coll_portals4_build_in_order_bmtree( struct ompi_communicator_t* comm,
6077
int root )
@@ -506,8 +523,10 @@ ompi_coll_portals4_gather_intra_binomial_top(const void *sbuf, int scount, struc
506523
int32_t expected_ops =0;
507524
int32_t expected_acks=0;
508525

526+
ptl_size_t number_of_fragment_gathered = 0;
527+
ptl_size_t number_of_fragment_send = 1;
509528

510-
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
529+
OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
511530
"coll:portals4:gather_intra_binomial_top enter rank %d", request->u.gather.my_rank));
512531

513532
request->type = OMPI_COLL_PORTALS4_TYPE_GATHER;
@@ -579,6 +598,23 @@ ompi_coll_portals4_gather_intra_binomial_top(const void *sbuf, int scount, struc
579598
ret = setup_sync_handles(comm, request, portals4_module);
580599
if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; }
581600

601+
OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
602+
"%s:%d: packed_size=%lu, fragment_size=%lu",
603+
__FILE__, __LINE__, request->u.gather.packed_size, mca_coll_portals4_component.ni_limits.max_msg_size));
604+
605+
for (int i =0; i < bmtree->tree_nextsize; i++) {
606+
int child_vrank = VRANK(bmtree->tree_next[i], request->u.gather.root_rank, request->u.gather.size);
607+
int sub_tree_size = get_tree_numdescendants_of(comm, child_vrank) + 1;
608+
ptl_size_t local_number_of_fragment = ((sub_tree_size * request->u.gather.packed_size) + mca_coll_portals4_component.ni_limits.max_msg_size -1) / mca_coll_portals4_component.ni_limits.max_msg_size;
609+
610+
OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
611+
"%s:%d: %d is child of %d(%d) with %d descendants (nb_frag += %lu)",
612+
__FILE__, __LINE__, bmtree->tree_next[i], vrank, request->u.gather.root_rank , sub_tree_size, local_number_of_fragment));
613+
number_of_fragment_gathered += local_number_of_fragment;
614+
}
615+
616+
number_of_fragment_send = (request->u.gather.gather_bytes + mca_coll_portals4_component.ni_limits.max_msg_size -1) / mca_coll_portals4_component.ni_limits.max_msg_size;
617+
582618
/***********************************************/
583619
/* Chain the RTR and Recv-ACK to the Gather CT */
584620
/***********************************************/
@@ -603,7 +639,7 @@ ompi_coll_portals4_gather_intra_binomial_top(const void *sbuf, int scount, struc
603639
if (vrank == 0) {
604640
/* root, so do nothing */
605641

606-
expected_ops=bmtree->tree_nextsize; /* gather put from each child */
642+
expected_ops=number_of_fragment_gathered ; /* gather put from each child */
607643
expected_acks=0;
608644

609645
} else {
@@ -617,22 +653,32 @@ ompi_coll_portals4_gather_intra_binomial_top(const void *sbuf, int scount, struc
617653
__FILE__, __LINE__, vrank,
618654
remote_offset, vrank, vparent, request->u.gather.packed_size);
619655

620-
expected_ops=bmtree->tree_nextsize + 1; /* gather put from each child + a chained RTR */
656+
expected_ops=number_of_fragment_gathered + 1; /* gather puts from each child + a chained RTR */
621657
expected_acks=1; /* Recv-ACK from parent */
622658

623-
ret = PtlTriggeredPut(request->u.gather.gather_mdh,
624-
request->u.gather.gather_offset,
625-
request->u.gather.gather_bytes,
659+
ptl_size_t size_sent = 0;
660+
ptl_size_t size_left = request->u.gather.gather_bytes;
661+
662+
for (ptl_size_t i = 0 ; i < number_of_fragment_send; i++) {
663+
ptl_size_t frag_size = (size_left > mca_coll_portals4_component.ni_limits.max_msg_size) ?
664+
mca_coll_portals4_component.ni_limits.max_msg_size:
665+
size_left;
666+
ret = PtlTriggeredPut(request->u.gather.gather_mdh,
667+
request->u.gather.gather_offset + size_sent,
668+
frag_size,
626669
PTL_NO_ACK_REQ,
627670
ompi_coll_portals4_get_peer(comm, parent),
628671
mca_coll_portals4_component.pt_idx,
629672
request->u.gather.gather_match_bits,
630-
remote_offset,
673+
remote_offset + size_sent,
631674
NULL,
632675
0,
633676
request->u.gather.gather_cth,
634677
expected_ops);
635-
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
678+
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
679+
size_left -= frag_size;
680+
size_sent += frag_size;
681+
}
636682
}
637683

638684
/************************************/
@@ -734,7 +780,7 @@ ompi_coll_portals4_gather_intra_binomial_top(const void *sbuf, int scount, struc
734780

735781
ompi_coll_portals4_destroy_tree(&(portals4_module->cached_in_order_bmtree));
736782

737-
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
783+
OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
738784
"coll:portals4:gather_intra_binomial_top exit rank %d", request->u.gather.my_rank));
739785

740786
return OMPI_SUCCESS;
@@ -773,8 +819,9 @@ ompi_coll_portals4_gather_intra_linear_top(const void *sbuf, int scount, struct
773819
int32_t expected_ops =0;
774820
int32_t expected_acks=0;
775821

822+
ptl_size_t number_of_fragment = 1;
776823

777-
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
824+
OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
778825
"coll:portals4:gather_intra_linear_top enter rank %d", request->u.gather.my_rank));
779826

780827
request->type = OMPI_COLL_PORTALS4_TYPE_GATHER;
@@ -843,6 +890,13 @@ ompi_coll_portals4_gather_intra_linear_top(const void *sbuf, int scount, struct
843890
ret = setup_sync_handles(comm, request, portals4_module);
844891
if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; }
845892

893+
number_of_fragment = (request->u.gather.packed_size > mca_coll_portals4_component.ni_limits.max_msg_size) ?
894+
(request->u.gather.packed_size + mca_coll_portals4_component.ni_limits.max_msg_size - 1) / mca_coll_portals4_component.ni_limits.max_msg_size :
895+
1;
896+
opal_output_verbose(90, ompi_coll_base_framework.framework_output,
897+
"%s:%d:rank %d:number_of_fragment = %lu",
898+
__FILE__, __LINE__, request->u.gather.my_rank, number_of_fragment);
899+
846900
/***********************************************/
847901
/* Chain the RTR and Recv-ACK to the Gather CT */
848902
/***********************************************/
@@ -867,11 +921,13 @@ ompi_coll_portals4_gather_intra_linear_top(const void *sbuf, int scount, struct
867921
if (i_am_root) {
868922
/* root, so do nothing */
869923

870-
expected_ops=request->u.gather.size-1; /* gather put from all other ranks */
924+
expected_ops=(request->u.gather.size-1) * number_of_fragment; /* gather put from all other ranks */
871925
expected_acks=0;
872926

873927
} else {
874928
ptl_size_t remote_offset=request->u.gather.my_rank * request->u.gather.packed_size;
929+
ptl_size_t split_offset = 0;
930+
ptl_size_t size_left = request->u.gather.gather_bytes;
875931

876932
opal_output_verbose(30, ompi_coll_base_framework.framework_output,
877933
"%s:%d:rank(%d): remote_offset(%lu)=rank(%d) * packed_size(%ld)",
@@ -881,19 +937,34 @@ ompi_coll_portals4_gather_intra_linear_top(const void *sbuf, int scount, struct
881937
expected_ops=1; /* chained RTR */
882938
expected_acks=1; /* Recv-ACK from root */
883939

884-
ret = PtlTriggeredPut(request->u.gather.gather_mdh,
885-
request->u.gather.gather_offset,
886-
request->u.gather.gather_bytes,
940+
for (ptl_size_t j=0; j<number_of_fragment; j++) {
941+
942+
ptl_size_t frag_size = (size_left > mca_coll_portals4_component.ni_limits.max_msg_size) ?
943+
mca_coll_portals4_component.ni_limits.max_msg_size :
944+
size_left;
945+
946+
opal_output_verbose(10, ompi_coll_base_framework.framework_output,
947+
"%s:%d:rank(%d): frag(%lu),offset_frag (%lu) frag_size(%lu)",
948+
__FILE__, __LINE__, request->u.gather.my_rank,
949+
j, split_offset, frag_size);
950+
951+
ret = PtlTriggeredPut(request->u.gather.gather_mdh,
952+
request->u.gather.gather_offset + split_offset,
953+
frag_size,
887954
PTL_NO_ACK_REQ,
888955
ompi_coll_portals4_get_peer(comm, request->u.gather.root_rank),
889956
mca_coll_portals4_component.pt_idx,
890957
request->u.gather.gather_match_bits,
891-
remote_offset,
958+
remote_offset + split_offset,
892959
NULL,
893960
0,
894961
request->u.gather.gather_cth,
895962
expected_ops);
896-
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
963+
if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
964+
965+
size_left -= frag_size;
966+
split_offset += frag_size;
967+
}
897968
}
898969

899970
/*****************************************/
@@ -997,7 +1068,7 @@ ompi_coll_portals4_gather_intra_linear_top(const void *sbuf, int scount, struct
9971068
"completed CTWait(expected_ops=%d)\n", expected_ops);
9981069
}
9991070

1000-
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
1071+
OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
10011072
"coll:portals4:gather_intra_linear_top exit rank %d", request->u.gather.my_rank));
10021073

10031074
return OMPI_SUCCESS;
@@ -1020,7 +1091,7 @@ ompi_coll_portals4_gather_intra_binomial_bottom(struct ompi_communicator_t *comm
10201091
int ret, line;
10211092
int i;
10221093

1023-
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
1094+
OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
10241095
"coll:portals4:gather_intra_binomial_bottom enter rank %d", request->u.gather.my_rank));
10251096

10261097
ret = cleanup_gather_handles(request);
@@ -1065,7 +1136,7 @@ ompi_coll_portals4_gather_intra_binomial_bottom(struct ompi_communicator_t *comm
10651136
ompi_request_complete(&request->super, true);
10661137
OPAL_THREAD_UNLOCK(&ompi_request_lock);
10671138

1068-
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
1139+
OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
10691140
"coll:portals4:gather_intra_binomial_bottom exit rank %d", request->u.gather.my_rank));
10701141

10711142
return OMPI_SUCCESS;
@@ -1090,7 +1161,7 @@ ompi_coll_portals4_gather_intra_linear_bottom(struct ompi_communicator_t *comm,
10901161
int ret, line;
10911162
int i;
10921163

1093-
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
1164+
OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
10941165
"coll:portals4:gather_intra_linear_bottom enter rank %d", request->u.gather.my_rank));
10951166

10961167
ret = cleanup_gather_handles(request);
@@ -1128,7 +1199,7 @@ ompi_coll_portals4_gather_intra_linear_bottom(struct ompi_communicator_t *comm,
11281199
ompi_request_complete(&request->super, true);
11291200
OPAL_THREAD_UNLOCK(&ompi_request_lock);
11301201

1131-
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
1202+
OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
11321203
"coll:portals4:gather_intra_linear_bottom exit rank %d", request->u.gather.my_rank));
11331204

11341205
return OMPI_SUCCESS;
@@ -1157,7 +1228,7 @@ ompi_coll_portals4_gather_intra(const void *sbuf, int scount, struct ompi_dataty
11571228

11581229
ompi_coll_portals4_request_t *request;
11591230

1160-
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
1231+
OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
11611232
"coll:portals4:gather_intra enter rank %d", ompi_comm_rank(comm)));
11621233

11631234
/*
@@ -1204,7 +1275,7 @@ ompi_coll_portals4_gather_intra(const void *sbuf, int scount, struct ompi_dataty
12041275
*/
12051276
OMPI_COLL_PORTALS4_REQUEST_RETURN(request);
12061277

1207-
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
1278+
OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
12081279
"coll:portals4:gather_intra exit rank %d", request->u.gather.my_rank));
12091280

12101281
return OMPI_SUCCESS;
@@ -1230,7 +1301,7 @@ ompi_coll_portals4_igather_intra(const void *sbuf, int scount, struct ompi_datat
12301301

12311302
ompi_coll_portals4_request_t *request;
12321303

1233-
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
1304+
OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
12341305
"coll:portals4:igather_intra enter rank %d", ompi_comm_rank(comm)));
12351306

12361307
/*
@@ -1267,7 +1338,7 @@ ompi_coll_portals4_igather_intra(const void *sbuf, int scount, struct ompi_datat
12671338
if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; }
12681339
}
12691340

1270-
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
1341+
OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
12711342
"coll:portals4:igather_intra exit rank %d", request->u.gather.my_rank));
12721343

12731344
return OMPI_SUCCESS;
@@ -1286,7 +1357,7 @@ ompi_coll_portals4_igather_intra_fini(ompi_coll_portals4_request_t *request)
12861357
{
12871358
int ret, line;
12881359

1289-
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
1360+
OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
12901361
"coll:portals4:igather_intra_fini enter rank %d", request->u.gather.my_rank));
12911362

12921363
/*
@@ -1300,7 +1371,7 @@ ompi_coll_portals4_igather_intra_fini(ompi_coll_portals4_request_t *request)
13001371
if (MPI_SUCCESS != ret) { line = __LINE__; goto err_hdlr; }
13011372
}
13021373

1303-
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
1374+
OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
13041375
"coll:portals4:igather_intra_fini exit rank %d", request->u.gather.my_rank));
13051376

13061377
return OMPI_SUCCESS;

0 commit comments

Comments
 (0)