Skip to content

Commit 3be6052

Browse files
authored
Merge pull request #1896 from PDeveze/Patchs-on-coll-portals4
Patchs on coll portals4
2 parents 5ced037 + a7e3de6 commit 3be6052

File tree

8 files changed

+324
-103
lines changed

8 files changed

+324
-103
lines changed

ompi/mca/coll/portals4/coll_portals4.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ struct mca_coll_portals4_component_t {
6565
opal_free_list_t requests; /* request free list for the i collectives */
6666

6767
ptl_ni_limits_t ni_limits;
68+
ptl_size_t portals_max_msg_size;
6869

6970
int use_binomial_gather_algorithm;
7071

@@ -314,7 +315,7 @@ is_reduce_optimizable(struct ompi_datatype_t *dtype, size_t length, struct ompi_
314315
}
315316

316317
*ptl_dtype = ompi_coll_portals4_atomic_datatype[dtype->id];
317-
if (*ptl_dtype == COLL_PORTALS4_NO_DTYPE){
318+
if (*ptl_dtype == COLL_PORTALS4_NO_DTYPE) {
318319
opal_output_verbose(50, ompi_coll_base_framework.framework_output,
319320
"datatype %d not supported\n",
320321
dtype->id);

ompi/mca/coll/portals4/coll_portals4_allreduce.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,7 @@ allreduce_kary_tree_top(const void *sendbuf, void *recvbuf, int count,
265265
ompi_coll_portals4_get_peer(comm, child[i]),
266266
mca_coll_portals4_component.pt_idx,
267267
match_bits_rtr, 0, NULL, 0)) != PTL_OK)
268-
return opal_stderr("Put RTR failed", __FILE__, __LINE__, ret);
268+
return opal_stderr("Put RTR failed %d", __FILE__, __LINE__, ret);
269269
}
270270
}
271271
}
@@ -408,7 +408,7 @@ int ompi_coll_portals4_iallreduce_intra(const void* sendbuf, void* recvbuf, int
408408
allreduce_kary_tree_top(sendbuf, recvbuf, count,
409409
dtype, op, comm, request, portals4_module);
410410

411-
puts("iallreduce");
411+
opal_output_verbose(10, ompi_coll_base_framework.framework_output, "iallreduce");
412412
return (OMPI_SUCCESS);
413413
}
414414

ompi/mca/coll/portals4/coll_portals4_barrier.c

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -147,9 +147,31 @@ barrier_hypercube_top(struct ompi_communicator_t *comm,
147147
}
148148

149149
if (is_sync) {
150-
/* Send a put to self when we've received all our messages... */
151-
ret = PtlCTWait(request->u.barrier.rtr_ct_h, num_msgs, &event);
150+
/* Each process has a pending PtlTriggeredPut. To be sure this request will be triggered, we must
151+
call PtlTriggeredCTInc twice. Otherwise, we could free the CT too early and the Put wouldn't be triggered */
152+
153+
ptl_ct_event_t ct_inc;
154+
155+
ct_inc.success = 1;
156+
ct_inc.failure = 0;
157+
158+
if ((ret = PtlTriggeredCTInc(request->u.barrier.rtr_ct_h, ct_inc,
159+
request->u.barrier.rtr_ct_h, num_msgs)) != 0) {
160+
return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret);
161+
}
162+
163+
if ((ret = PtlTriggeredCTInc(request->u.barrier.rtr_ct_h, ct_inc,
164+
request->u.barrier.rtr_ct_h, num_msgs + 1)) != 0) {
165+
return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret);
166+
}
152167

168+
ret = PtlCTWait(request->u.barrier.rtr_ct_h, num_msgs + 2, &event);
169+
if (PTL_OK != ret) {
170+
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
171+
"%s:%d: PtlCTWait failed: %d\n",
172+
__FILE__, __LINE__, ret);
173+
return OMPI_ERROR;
174+
}
153175
}
154176
else {
155177
/* Send a put to self when we've received all our messages... */

ompi/mca/coll/portals4/coll_portals4_bcast.c

Lines changed: 117 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -89,12 +89,20 @@ static int prepare_bcast_data (struct ompi_communicator_t *comm,
8989
}
9090

9191
/* Number of segments */
92-
request->u.bcast.segment_nb = (request->u.bcast.tmpsize > COLL_PORTALS4_MAX_BW) ?
93-
(((request->u.bcast.tmpsize + COLL_PORTALS4_MAX_BW -1) / COLL_PORTALS4_MAX_BW) < COLL_PORTALS4_MAX_SEGMENT ?
94-
((request->u.bcast.tmpsize + COLL_PORTALS4_MAX_BW -1) / COLL_PORTALS4_MAX_BW) :
95-
COLL_PORTALS4_MAX_SEGMENT) :
92+
{
93+
size_t max_msg_size = (COLL_PORTALS4_MAX_BW > mca_coll_portals4_component.ni_limits.max_msg_size) ?
94+
mca_coll_portals4_component.ni_limits.max_msg_size :
95+
COLL_PORTALS4_MAX_BW;
96+
97+
//TODO : Either make compatible Portals size limits and COLL_PORTALS4_MAX_SEGMENT or remove COLL_PORTALS4_MAX_SEGMENT
98+
request->u.bcast.segment_nb = (request->u.bcast.tmpsize > max_msg_size) ?
99+
(((request->u.bcast.tmpsize + max_msg_size -1) / max_msg_size) < COLL_PORTALS4_MAX_SEGMENT ?
100+
((request->u.bcast.tmpsize + max_msg_size -1) / max_msg_size) : COLL_PORTALS4_MAX_SEGMENT) :
96101
1;
97102

103+
OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
104+
"seg_number=%d , seg_size_max=%lu", request->u.bcast.segment_nb, max_msg_size));
105+
}
98106
if (request->u.bcast.segment_nb > COLL_PORTALS4_BCAST_ALGO_THRESHOLD) {
99107
request->u.bcast.algo = OMPI_COLL_PORTALS4_BCAST_PIPELINE_ALGO;
100108
}
@@ -137,9 +145,9 @@ bcast_kary_tree_top(void *buff, int count,
137145
mca_coll_portals4_module_t *portals4_module)
138146
{
139147
bool is_sync = request->is_sync;
140-
int ret, seg;
141-
unsigned int i;
142-
int segment_nb = request->u.bcast.segment_nb;
148+
int ret;
149+
unsigned int i, seg, seg_size, nb_long;
150+
unsigned int segment_nb = request->u.bcast.segment_nb;
143151
unsigned int child_nb;
144152
int size = ompi_comm_size(comm);
145153
int rank = ompi_comm_rank(comm);
@@ -201,15 +209,22 @@ bcast_kary_tree_top(void *buff, int count,
201209
COLL_PORTALS4_SET_BITS(match_bits, ompi_comm_get_cid(comm), 0, 0,
202210
COLL_PORTALS4_BCAST, 0, internal_count);
203211

212+
/* The data will be cut in segment_nb segments.
213+
* nb_long segments will have a size of (seg_size + 1)
214+
* and (segment_nb - nb_long) segments will have a size of seg_size
215+
*/
216+
seg_size = request->u.bcast.tmpsize / segment_nb;
217+
nb_long = request->u.bcast.tmpsize % segment_nb;
218+
opal_output_verbose(10, ompi_coll_base_framework.framework_output, "seg_size=%d nb_long=%d segment_nb=%d", seg_size, nb_long, segment_nb);
219+
204220
if (rank != root) {
205221
for (seg = 1, offset = 0, length = 0 ;
206222
seg <= segment_nb ;
207223
seg++, offset += length) {
208224

209225
/* Divide buffer into segments */
210-
length = (seg < segment_nb) ?
211-
(request->u.bcast.tmpsize + segment_nb - 1) / segment_nb :
212-
request->u.bcast.tmpsize - ((request->u.bcast.tmpsize + segment_nb - 1) / segment_nb) * (segment_nb - 1);
226+
if (seg <= nb_long) length = seg_size + 1;
227+
else length = seg_size;
213228

214229
/*
215230
** Prepare Data ME
@@ -352,13 +367,14 @@ bcast_kary_tree_top(void *buff, int count,
352367
seg++, offset += length) {
353368

354369
/* Divide buffer into segments */
355-
length = (seg < segment_nb) ?
356-
(request->u.bcast.tmpsize + segment_nb - 1) / segment_nb :
357-
request->u.bcast.tmpsize - ((request->u.bcast.tmpsize + segment_nb - 1) / segment_nb) * (segment_nb - 1);
370+
if (seg <= nb_long) length = seg_size + 1;
371+
else length = seg_size;
372+
opal_output_verbose(10, ompi_coll_base_framework.framework_output,
373+
"bcast with k-ary tree : segment of size %ld", length);
358374

359375
/* compute the triggering threshold to send data to the children */
360-
trig_thr = (rank == root) ? (segment_nb) :
361-
(segment_nb + seg);
376+
trig_thr = segment_nb + seg - 1; /* To be sure the set of PtlTriggeredPut of DATA will be executed in order */
377+
if (rank != root) trig_thr ++;
362378

363379
/*
364380
** Send Data to children
@@ -381,6 +397,17 @@ bcast_kary_tree_top(void *buff, int count,
381397
}
382398
}
383399

400+
if (rank == root) {
401+
trig_thr = segment_nb;
402+
ct_inc.success = segment_nb;
403+
ct_inc.failure = 0;
404+
405+
if ((ret = PtlTriggeredCTInc(request->u.bcast.trig_ct_h, ct_inc,
406+
request->u.bcast.trig_ct_h, trig_thr)) != 0) {
407+
return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret);
408+
}
409+
}
410+
384411
ack_thr = child_nb;
385412

386413
if (is_sync) {
@@ -409,9 +436,28 @@ bcast_kary_tree_top(void *buff, int count,
409436
*/
410437

411438
if (rank != root) {
412-
ack_thr = segment_nb;
439+
trig_thr = segment_nb;
413440
if (is_sync) {
414-
if ((ret = PtlCTWait(request->u.bcast.trig_ct_h, ack_thr, &ct)) != 0) {
441+
/* Each leaf has a pending PtlTriggeredPut (to send the final ACK). We must call PtlTriggeredCTInc twice.
442+
Otherwise, we could pass the PtlCTWait and then free the CT too early and the Put wouldn't be triggered.
443+
444+
This is necessary because portals4 does not insure the order in the triggered operations associated
445+
with the same threshold. In the case where PtlCTWait is not called (else case), this is not necessary. */
446+
447+
ct_inc.success = 1;
448+
ct_inc.failure = 0;
449+
450+
if ((ret = PtlTriggeredCTInc(request->u.bcast.trig_ct_h, ct_inc,
451+
request->u.bcast.trig_ct_h, trig_thr)) != 0) {
452+
return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret);
453+
}
454+
455+
if ((ret = PtlTriggeredCTInc(request->u.bcast.trig_ct_h, ct_inc,
456+
request->u.bcast.trig_ct_h, trig_thr + 1)) != 0) {
457+
return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret);
458+
}
459+
460+
if ((ret = PtlCTWait(request->u.bcast.trig_ct_h, trig_thr + 2, &ct)) != 0) {
415461
opal_stderr("PtlCTWait failed", __FILE__, __LINE__, ret);
416462
}
417463
}
@@ -421,7 +467,7 @@ bcast_kary_tree_top(void *buff, int count,
421467
mca_coll_portals4_component.finish_pt_idx,
422468
0, 0, NULL, (uintptr_t) request,
423469
request->u.bcast.trig_ct_h,
424-
ack_thr)) != 0) {
470+
trig_thr)) != 0) {
425471
return opal_stderr("PtlTriggeredPut failed", __FILE__, __LINE__, ret);
426472
}
427473

@@ -440,8 +486,9 @@ bcast_pipeline_top(void *buff, int count,
440486
mca_coll_portals4_module_t *portals4_module)
441487
{
442488
bool is_sync = request->is_sync;
443-
int ret, seg;
444-
int segment_nb = request->u.bcast.segment_nb;
489+
int ret;
490+
unsigned int seg, seg_size, nb_long;
491+
unsigned int segment_nb = request->u.bcast.segment_nb;
445492
int size = ompi_comm_size(comm);
446493
int rank = ompi_comm_rank(comm);
447494
ptl_rank_t parent, child;
@@ -492,16 +539,22 @@ bcast_pipeline_top(void *buff, int count,
492539

493540
COLL_PORTALS4_SET_BITS(match_bits, ompi_comm_get_cid(comm), 0, 0,
494541
COLL_PORTALS4_BCAST, 0, internal_count);
542+
/* The data will be cut in segment_nb segments.
543+
* nb_long segments will have a size of (seg_size + 1)
544+
* and (segment_nb - nb_long) segments will have a size of seg_size
545+
*/
546+
seg_size = request->u.bcast.tmpsize / segment_nb;
547+
nb_long = request->u.bcast.tmpsize % segment_nb;
548+
opal_output_verbose(10, ompi_coll_base_framework.framework_output, "seg_size=%d nb_long=%d", seg_size, nb_long);
495549

496550
if (rank != root) {
497551
for (seg = 1, offset = 0, length = 0 ;
498552
seg <= segment_nb ;
499553
seg++, offset += length) {
500554

501555
/* Divide buffer into segments */
502-
length = (seg < segment_nb) ?
503-
(request->u.bcast.tmpsize + segment_nb - 1) / segment_nb :
504-
request->u.bcast.tmpsize - ((request->u.bcast.tmpsize + segment_nb - 1) / segment_nb) * (segment_nb - 1);
556+
if (seg <= nb_long) length = seg_size + 1;
557+
else length = seg_size;
505558

506559
/*
507560
** Prepare Data ME
@@ -642,13 +695,14 @@ bcast_pipeline_top(void *buff, int count,
642695
seg++, offset += length) {
643696

644697
/* Divide buffer into segments */
645-
length = (seg < segment_nb) ?
646-
(request->u.bcast.tmpsize + segment_nb - 1) / segment_nb :
647-
request->u.bcast.tmpsize - ((request->u.bcast.tmpsize + segment_nb - 1) / segment_nb) * (segment_nb - 1);
698+
if (seg <= nb_long) length = seg_size + 1;
699+
else length = seg_size;
700+
opal_output_verbose(10, ompi_coll_base_framework.framework_output,
701+
"bcast with pipeline : segment of size %ld \n", length);
648702

649703
/* compute the triggering threshold to send data to the children */
650-
trig_thr = (rank == root) ? (segment_nb) :
651-
(segment_nb + seg);
704+
trig_thr = segment_nb + seg - 1; /* To be sure the PtlTriggeredPut will be executed in order */
705+
if (rank != root) trig_thr ++;
652706

653707
/*
654708
** Send Data to children
@@ -668,6 +722,16 @@ bcast_pipeline_top(void *buff, int count,
668722
}
669723
}
670724
}
725+
if (rank == root) {
726+
trig_thr = segment_nb;
727+
ct_inc.success = segment_nb;
728+
ct_inc.failure = 0;
729+
730+
if ((ret = PtlTriggeredCTInc(request->u.bcast.trig_ct_h, ct_inc,
731+
request->u.bcast.trig_ct_h, trig_thr)) != 0) {
732+
return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret);
733+
}
734+
}
671735

672736
if (is_sync) {
673737
if ((ret = PtlCTWait(request->u.bcast.ack_ct_h, 1, &ct)) != 0) {
@@ -696,8 +760,29 @@ bcast_pipeline_top(void *buff, int count,
696760
*/
697761

698762
if (rank != root) {
763+
trig_thr = segment_nb;
764+
699765
if (is_sync) {
700-
if ((ret = PtlCTWait(request->u.bcast.trig_ct_h, segment_nb, &ct)) != 0) {
766+
/* Each leaf has a pending PtlTriggeredPut (to send the final ACK). We must call PtlTriggeredCTInc twice.
767+
Otherwise, we could pass the PtlCTWait and then free the CT too early and the Put wouldn't be triggered.
768+
769+
This is necessary because portals4 does not insure the order in the triggered operations associated
770+
with the same threshold. In the case where PtlCTWait is not called (else case), this is not necessary. */
771+
772+
ct_inc.success = 1;
773+
ct_inc.failure = 0;
774+
775+
if ((ret = PtlTriggeredCTInc(request->u.bcast.trig_ct_h, ct_inc,
776+
request->u.bcast.trig_ct_h, trig_thr)) != 0) {
777+
return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret);
778+
}
779+
780+
if ((ret = PtlTriggeredCTInc(request->u.bcast.trig_ct_h, ct_inc,
781+
request->u.bcast.trig_ct_h, trig_thr + 1)) != 0) {
782+
return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret);
783+
}
784+
785+
if ((ret = PtlCTWait(request->u.bcast.trig_ct_h, trig_thr + 2, &ct)) != 0) {
701786
opal_stderr("PtlCTWait failed", __FILE__, __LINE__, ret);
702787
}
703788
}
@@ -707,7 +792,7 @@ bcast_pipeline_top(void *buff, int count,
707792
mca_coll_portals4_component.finish_pt_idx,
708793
0, 0, NULL, (uintptr_t) request,
709794
request->u.bcast.trig_ct_h,
710-
segment_nb)) != 0) {
795+
trig_thr)) != 0) {
711796
return opal_stderr("PtlTriggeredPut failed", __FILE__, __LINE__, ret);
712797
}
713798
}
@@ -831,7 +916,7 @@ ompi_coll_portals4_ibcast_intra(void *buff, int count,
831916
return OMPI_ERROR;
832917
}
833918

834-
puts("ibcast");
919+
opal_output_verbose(10, ompi_coll_base_framework.framework_output, "ibcast_intra");
835920
return (OMPI_SUCCESS);
836921
}
837922

@@ -860,5 +945,6 @@ ompi_coll_portals4_ibcast_intra_fini(ompi_coll_portals4_request_t *request)
860945
ompi_request_complete(&request->super, true);
861946
OPAL_THREAD_UNLOCK(&ompi_request_lock);
862947

948+
opal_output_verbose(10, ompi_coll_base_framework.framework_output, "ibcast_intra_fini");
863949
return (OMPI_SUCCESS);
864950
}

ompi/mca/coll/portals4/coll_portals4_component.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,16 @@ portals4_register(void)
211211
MCA_BASE_VAR_SCOPE_READONLY,
212212
&mca_coll_portals4_component.use_binomial_gather_algorithm);
213213

214+
mca_coll_portals4_component.portals_max_msg_size = PTL_SIZE_MAX;
215+
(void) mca_base_component_var_register(&mca_coll_portals4_component.super.collm_version,
216+
"max_msg_size",
217+
"Max size supported by portals4 (above that, a message is cut into messages less than that size)",
218+
MCA_BASE_VAR_TYPE_UNSIGNED_LONG,
219+
NULL, 0, 0,
220+
OPAL_INFO_LVL_9,
221+
MCA_BASE_VAR_SCOPE_READONLY,
222+
&mca_coll_portals4_component.portals_max_msg_size);
223+
214224
return OMPI_SUCCESS;
215225
}
216226

@@ -369,7 +379,13 @@ portals4_init_query(bool enable_progress_threads,
369379
__FILE__, __LINE__, ret);
370380
return OMPI_ERROR;
371381
}
382+
opal_output_verbose(10, ompi_coll_base_framework.framework_output,
383+
"ni_limits.max_atomic_size=%ld", mca_coll_portals4_component.ni_limits.max_atomic_size);
372384

385+
if (mca_coll_portals4_component.portals_max_msg_size < mca_coll_portals4_component.ni_limits.max_msg_size)
386+
mca_coll_portals4_component.ni_limits.max_msg_size = mca_coll_portals4_component.portals_max_msg_size;
387+
opal_output_verbose(10, ompi_coll_base_framework.framework_output,
388+
"ni_limits.max_msg_size=%lu", mca_coll_portals4_component.ni_limits.max_msg_size);
373389

374390
ret = PtlGetId(mca_coll_portals4_component.ni_h, &mca_coll_portals4_component.id);
375391
if (PTL_OK != ret) {

0 commit comments

Comments
 (0)