@@ -70,6 +70,7 @@ using dpctl::utils::keep_args_alive;
70
70
71
71
std::vector<sycl::event> _populate_packed_shapes_strides_for_indexing (
72
72
sycl::queue exec_q,
73
+ std::vector<sycl::event> &host_task_events,
73
74
py::ssize_t *device_orthog_shapes_strides,
74
75
py::ssize_t *device_axes_shapes_strides,
75
76
const py::ssize_t *inp_shape,
@@ -210,20 +211,21 @@ std::vector<sycl::event> _populate_packed_shapes_strides_for_indexing(
210
211
exec_q.copy <py::ssize_t >(packed_host_shapes_strides_shp->data (),
211
212
device_orthog_shapes_strides,
212
213
packed_host_shapes_strides_shp->size ());
213
- exec_q.submit ([&](sycl::handler &cgh) {
214
- cgh.depends_on (device_orthog_shapes_strides_copy_ev);
215
- cgh.host_task ([packed_host_shapes_strides_shp] {});
216
- });
217
214
218
215
sycl::event device_axes_shapes_strides_copy_ev =
219
216
exec_q.copy <py::ssize_t >(
220
217
packed_host_axes_shapes_strides_shp->data (),
221
218
device_axes_shapes_strides,
222
219
packed_host_axes_shapes_strides_shp->size ());
223
- exec_q.submit ([&](sycl::handler &cgh) {
224
- cgh.depends_on (device_axes_shapes_strides_copy_ev);
225
- cgh.host_task ([packed_host_axes_shapes_strides_shp]() {});
226
- });
220
+
221
+ sycl::event clean_up_host_task_ev =
222
+ exec_q.submit ([&](sycl::handler &cgh) {
223
+ cgh.depends_on (device_axes_shapes_strides_copy_ev);
224
+ cgh.depends_on (device_orthog_shapes_strides_copy_ev);
225
+ cgh.host_task ([packed_host_axes_shapes_strides_shp,
226
+ packed_host_shapes_strides_shp]() {});
227
+ });
228
+ host_task_events.push_back (clean_up_host_task_ev);
227
229
228
230
std::vector<sycl::event> v = {device_orthog_shapes_strides_copy_ev,
229
231
device_axes_shapes_strides_copy_ev};
@@ -268,10 +270,13 @@ std::vector<sycl::event> _populate_packed_shapes_strides_for_indexing(
268
270
packed_host_axes_shapes_strides_shp->data (),
269
271
device_axes_shapes_strides,
270
272
packed_host_axes_shapes_strides_shp->size ());
271
- exec_q.submit ([&](sycl::handler &cgh) {
272
- cgh.depends_on (device_axes_shapes_strides_copy_ev);
273
- cgh.host_task ([packed_host_axes_shapes_strides_shp]() {});
274
- });
273
+
274
+ sycl::event clean_up_host_task_ev =
275
+ exec_q.submit ([&](sycl::handler &cgh) {
276
+ cgh.depends_on (device_axes_shapes_strides_copy_ev);
277
+ cgh.host_task ([packed_host_axes_shapes_strides_shp]() {});
278
+ });
279
+ host_task_events.push_back (clean_up_host_task_ev);
275
280
276
281
std::vector<sycl::event> v = {device_orthog_shapes_strides_fill_ev,
277
282
device_axes_shapes_strides_copy_ev};
@@ -590,28 +595,33 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src,
590
595
std::copy (ind_offsets.begin (), ind_offsets.end (),
591
596
host_ind_offsets_shp->begin ());
592
597
598
+ std::vector<sycl::event> host_task_events (5 );
599
+
593
600
sycl::event packed_ind_ptrs_copy_ev = exec_q.copy <char *>(
594
601
host_ind_ptrs_shp->data (), packed_ind_ptrs, host_ind_ptrs_shp->size ());
595
- exec_q.submit ([&](sycl::handler &cgh) {
602
+ sycl::event ind_ptrs_host_task = exec_q.submit ([&](sycl::handler &cgh) {
596
603
cgh.depends_on (packed_ind_ptrs_copy_ev);
597
604
cgh.host_task ([host_ind_ptrs_shp]() {});
598
605
});
606
+ host_task_events.push_back (ind_ptrs_host_task);
599
607
600
608
sycl::event packed_ind_shapes_strides_copy_ev = exec_q.copy <py::ssize_t >(
601
609
host_ind_shapes_strides_shp->data (), packed_ind_shapes_strides,
602
610
host_ind_shapes_strides_shp->size ());
603
- exec_q.submit ([&](sycl::handler &cgh) {
611
+ sycl::event ind_sh_st_host_task = exec_q.submit ([&](sycl::handler &cgh) {
604
612
cgh.depends_on (packed_ind_shapes_strides_copy_ev);
605
613
cgh.host_task ([host_ind_shapes_strides_shp]() {});
606
614
});
615
+ host_task_events.push_back (ind_sh_st_host_task);
607
616
608
617
sycl::event packed_ind_offsets_copy_ev = exec_q.copy <py::ssize_t >(
609
618
host_ind_offsets_shp->data (), packed_ind_offsets,
610
619
host_ind_offsets_shp->size ());
611
- exec_q.submit ([&](sycl::handler &cgh) {
620
+ sycl::event ind_offsets_host_task = exec_q.submit ([&](sycl::handler &cgh) {
612
621
cgh.depends_on (packed_ind_offsets_copy_ev);
613
622
cgh.host_task ([host_ind_offsets_shp]() {});
614
623
});
624
+ host_task_events.push_back (ind_offsets_host_task);
615
625
616
626
std::vector<sycl::event> ind_pack_depends{packed_ind_ptrs_copy_ev,
617
627
packed_ind_shapes_strides_copy_ev,
@@ -650,10 +660,10 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src,
650
660
651
661
std::vector<sycl::event> src_dst_pack_deps =
652
662
_populate_packed_shapes_strides_for_indexing (
653
- exec_q, packed_shapes_strides, packed_axes_shapes_strides ,
654
- src_shape, src_strides, is_src_c_contig, is_src_f_contig, dst_shape ,
655
- dst_strides, is_dst_c_contig, is_dst_f_contig, axis_start, k ,
656
- ind_nd, src_nd, dst_nd);
663
+ exec_q, host_task_events, packed_shapes_strides ,
664
+ packed_axes_shapes_strides, src_shape, src_strides, is_src_c_contig,
665
+ is_src_f_contig, dst_shape, dst_strides, is_dst_c_contig ,
666
+ is_dst_f_contig, axis_start, k, ind_nd, src_nd, dst_nd);
657
667
658
668
std::vector<sycl::event> all_deps (depends.size () + ind_pack_depends.size () +
659
669
src_dst_pack_deps.size ());
@@ -690,9 +700,10 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src,
690
700
sycl::free (packed_ind_offsets, ctx);
691
701
});
692
702
});
703
+ host_task_events.push_back (take_generic_ev);
693
704
694
705
sycl::event host_task_ev =
695
- keep_args_alive (exec_q, {src, py_ind, dst}, {take_generic_ev} );
706
+ keep_args_alive (exec_q, {src, py_ind, dst}, host_task_events );
696
707
697
708
return std::make_pair (host_task_ev, take_generic_ev);
698
709
}
@@ -977,28 +988,33 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst,
977
988
std::copy (ind_offsets.begin (), ind_offsets.end (),
978
989
host_ind_offsets_shp->begin ());
979
990
991
+ std::vector<sycl::event> host_task_events (5 );
992
+
980
993
sycl::event device_ind_ptrs_copy_ev = exec_q.copy <char *>(
981
994
host_ind_ptrs_shp->data (), packed_ind_ptrs, host_ind_ptrs_shp->size ());
982
- exec_q.submit ([&](sycl::handler &cgh) {
995
+ sycl::event ind_ptrs_host_task = exec_q.submit ([&](sycl::handler &cgh) {
983
996
cgh.depends_on (device_ind_ptrs_copy_ev);
984
997
cgh.host_task ([host_ind_ptrs_shp]() {});
985
998
});
999
+ host_task_events.push_back (ind_ptrs_host_task);
986
1000
987
1001
sycl::event device_ind_shapes_strides_copy_ev = exec_q.copy <py::ssize_t >(
988
1002
host_ind_shapes_strides_shp->data (), packed_ind_shapes_strides,
989
1003
host_ind_shapes_strides_shp->size ());
990
- exec_q.submit ([&](sycl::handler &cgh) {
1004
+ sycl::event ind_sh_st_host_task = exec_q.submit ([&](sycl::handler &cgh) {
991
1005
cgh.depends_on (device_ind_shapes_strides_copy_ev);
992
1006
cgh.host_task ([host_ind_shapes_strides_shp]() {});
993
1007
});
1008
+ host_task_events.push_back (ind_sh_st_host_task);
994
1009
995
1010
sycl::event device_ind_offsets_copy_ev = exec_q.copy <py::ssize_t >(
996
1011
host_ind_offsets_shp->data (), packed_ind_offsets,
997
1012
host_ind_offsets_shp->size ());
998
- exec_q.submit ([&](sycl::handler &cgh) {
1013
+ sycl::event ind_offsets_host_task = exec_q.submit ([&](sycl::handler &cgh) {
999
1014
cgh.depends_on (device_ind_offsets_copy_ev);
1000
1015
cgh.host_task ([host_ind_offsets_shp]() {});
1001
1016
});
1017
+ host_task_events.push_back (ind_offsets_host_task);
1002
1018
1003
1019
std::vector<sycl::event> ind_pack_depends{device_ind_ptrs_copy_ev,
1004
1020
device_ind_shapes_strides_copy_ev,
@@ -1037,10 +1053,10 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst,
1037
1053
1038
1054
std::vector<sycl::event> copy_shapes_strides_deps =
1039
1055
_populate_packed_shapes_strides_for_indexing (
1040
- exec_q, packed_shapes_strides, packed_axes_shapes_strides ,
1041
- dst_shape, dst_strides, is_dst_c_contig, is_dst_f_contig, val_shape ,
1042
- val_strides, is_val_c_contig, is_val_f_contig, axis_start, k ,
1043
- ind_nd, dst_nd, val_nd);
1056
+ exec_q, host_task_events, packed_shapes_strides ,
1057
+ packed_axes_shapes_strides, dst_shape, dst_strides, is_dst_c_contig,
1058
+ is_dst_f_contig, val_shape, val_strides, is_val_c_contig ,
1059
+ is_val_f_contig, axis_start, k, ind_nd, dst_nd, val_nd);
1044
1060
1045
1061
std::vector<sycl::event> all_deps (depends.size () +
1046
1062
copy_shapes_strides_deps.size () +
@@ -1078,9 +1094,10 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst,
1078
1094
sycl::free (packed_ind_offsets, ctx);
1079
1095
});
1080
1096
});
1097
+ host_task_events.push_back (put_generic_ev);
1081
1098
1082
1099
return std::make_pair (
1083
- keep_args_alive (exec_q, {dst, py_ind, val}, {put_generic_ev} ),
1100
+ keep_args_alive (exec_q, {dst, py_ind, val}, host_task_events ),
1084
1101
put_generic_ev);
1085
1102
}
1086
1103
0 commit comments