Skip to content

Commit 175e6aa

Browse files
committed
coll-portals4: Before calling PtlCTWait, call PtlTriggeredCTInc twice to be sure all pending PtlTriggeredPut operations are triggered
1 parent df59d6c commit 175e6aa

File tree

2 files changed

+90
-12
lines changed

2 files changed

+90
-12
lines changed

ompi/mca/coll/portals4/coll_portals4_barrier.c

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -147,9 +147,31 @@ barrier_hypercube_top(struct ompi_communicator_t *comm,
147147
}
148148

149149
if (is_sync) {
150-
/* Send a put to self when we've received all our messages... */
151-
ret = PtlCTWait(request->u.barrier.rtr_ct_h, num_msgs, &event);
150+
/* Each process has a pending PtlTriggeredPut. To be sure this request will be triggered, we must
151+
call PtlTriggeredCTInc twice. Otherwise, we could free the CT too early and the Put wouldn't be triggered */
152+
153+
ptl_ct_event_t ct_inc;
154+
155+
ct_inc.success = 1;
156+
ct_inc.failure = 0;
157+
158+
if ((ret = PtlTriggeredCTInc(request->u.barrier.rtr_ct_h, ct_inc,
159+
request->u.barrier.rtr_ct_h, num_msgs)) != 0) {
160+
return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret);
161+
}
162+
163+
if ((ret = PtlTriggeredCTInc(request->u.barrier.rtr_ct_h, ct_inc,
164+
request->u.barrier.rtr_ct_h, num_msgs + 1)) != 0) {
165+
return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret);
166+
}
152167

168+
ret = PtlCTWait(request->u.barrier.rtr_ct_h, num_msgs + 2, &event);
169+
if (PTL_OK != ret) {
170+
opal_output_verbose(1, ompi_coll_base_framework.framework_output,
171+
"%s:%d: PtlCTWait failed: %d\n",
172+
__FILE__, __LINE__, ret);
173+
return OMPI_ERROR;
174+
}
153175
}
154176
else {
155177
/* Send a put to self when we've received all our messages... */

ompi/mca/coll/portals4/coll_portals4_bcast.c

Lines changed: 66 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -361,12 +361,10 @@ bcast_kary_tree_top(void *buff, int count,
361361
/* Divide buffer into segments */
362362
if (seg <= nb_long) length = seg_size + 1;
363363
else length = seg_size;
364-
opal_output_verbose(10, ompi_coll_base_framework.framework_output,
365-
"bcast with k-ary tree : segment of size %ld", length);
366364

367365
/* compute the triggering threshold to send data to the children */
368-
trig_thr = (rank == root) ? (segment_nb) :
369-
(segment_nb + seg);
366+
trig_thr = segment_nb + seg - 1; /* To be sure the set of PtlTriggeredPut of DATA will be executed in order */
367+
if (rank != root) trig_thr ++;
370368

371369
/*
372370
** Send Data to children
@@ -389,6 +387,17 @@ bcast_kary_tree_top(void *buff, int count,
389387
}
390388
}
391389

390+
if (rank == root) {
391+
trig_thr = segment_nb;
392+
ct_inc.success = segment_nb;
393+
ct_inc.failure = 0;
394+
395+
if ((ret = PtlTriggeredCTInc(request->u.bcast.trig_ct_h, ct_inc,
396+
request->u.bcast.trig_ct_h, trig_thr)) != 0) {
397+
return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret);
398+
}
399+
}
400+
392401
ack_thr = child_nb;
393402

394403
if (is_sync) {
@@ -419,7 +428,26 @@ bcast_kary_tree_top(void *buff, int count,
419428
if (rank != root) {
420429
trig_thr = segment_nb;
421430
if (is_sync) {
422-
if ((ret = PtlCTWait(request->u.bcast.trig_ct_h, trig_thr, &ct)) != 0) {
431+
/* Each leaf has a pending PtlTriggeredPut (to send the final ACK). We must call PtlTriggeredCTInc twice.
432+
Otherwise, we could pass the PtlCTWait and then free the CT too early and the Put wouldn't be triggered.
433+
434+
This is necessary because portals4 does not ensure the order of the triggered operations associated
435+
with the same threshold. In the case where PtlCTWait is not called (else case), this is not necessary. */
436+
437+
ct_inc.success = 1;
438+
ct_inc.failure = 0;
439+
440+
if ((ret = PtlTriggeredCTInc(request->u.bcast.trig_ct_h, ct_inc,
441+
request->u.bcast.trig_ct_h, trig_thr)) != 0) {
442+
return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret);
443+
}
444+
445+
if ((ret = PtlTriggeredCTInc(request->u.bcast.trig_ct_h, ct_inc,
446+
request->u.bcast.trig_ct_h, trig_thr + 1)) != 0) {
447+
return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret);
448+
}
449+
450+
if ((ret = PtlCTWait(request->u.bcast.trig_ct_h, trig_thr + 2, &ct)) != 0) {
423451
opal_stderr("PtlCTWait failed", __FILE__, __LINE__, ret);
424452
}
425453
}
@@ -659,12 +687,10 @@ bcast_pipeline_top(void *buff, int count,
659687
/* Divide buffer into segments */
660688
if (seg <= nb_long) length = seg_size + 1;
661689
else length = seg_size;
662-
opal_output_verbose(10, ompi_coll_base_framework.framework_output,
663-
"bcast with pipeline : segment of size %ld \n", length);
664690

665691
/* compute the triggering threshold to send data to the children */
666-
trig_thr = (rank == root) ? (segment_nb) :
667-
(segment_nb + seg);
692+
trig_thr = segment_nb + seg - 1; /* To be sure the PtlTriggeredPut will be executed in order */
693+
if (rank != root) trig_thr ++;
668694

669695
/*
670696
** Send Data to children
@@ -684,6 +710,16 @@ bcast_pipeline_top(void *buff, int count,
684710
}
685711
}
686712
}
713+
if (rank == root) {
714+
trig_thr = segment_nb;
715+
ct_inc.success = segment_nb;
716+
ct_inc.failure = 0;
717+
718+
if ((ret = PtlTriggeredCTInc(request->u.bcast.trig_ct_h, ct_inc,
719+
request->u.bcast.trig_ct_h, trig_thr)) != 0) {
720+
return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret);
721+
}
722+
}
687723

688724
if (is_sync) {
689725
if ((ret = PtlCTWait(request->u.bcast.ack_ct_h, 1, &ct)) != 0) {
@@ -713,8 +749,28 @@ bcast_pipeline_top(void *buff, int count,
713749

714750
if (rank != root) {
715751
trig_thr = segment_nb;
752+
716753
if (is_sync) {
717-
if ((ret = PtlCTWait(request->u.bcast.trig_ct_h, trig_thr, &ct)) != 0) {
754+
/* Each leaf has a pending PtlTriggeredPut (to send the final ACK). We must call PtlTriggeredCTInc twice.
755+
Otherwise, we could pass the PtlCTWait and then free the CT too early and the Put wouldn't be triggered.
756+
757+
This is necessary because portals4 does not ensure the order of the triggered operations associated
758+
with the same threshold. In the case where PtlCTWait is not called (else case), this is not necessary. */
759+
760+
ct_inc.success = 1;
761+
ct_inc.failure = 0;
762+
763+
if ((ret = PtlTriggeredCTInc(request->u.bcast.trig_ct_h, ct_inc,
764+
request->u.bcast.trig_ct_h, trig_thr)) != 0) {
765+
return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret);
766+
}
767+
768+
if ((ret = PtlTriggeredCTInc(request->u.bcast.trig_ct_h, ct_inc,
769+
request->u.bcast.trig_ct_h, trig_thr + 1)) != 0) {
770+
return opal_stderr("PtlTriggeredCTInc failed", __FILE__, __LINE__, ret);
771+
}
772+
773+
if ((ret = PtlCTWait(request->u.bcast.trig_ct_h, trig_thr + 2, &ct)) != 0) {
718774
opal_stderr("PtlCTWait failed", __FILE__, __LINE__, ret);
719775
}
720776
}

0 commit comments

Comments
 (0)