@@ -143,6 +143,8 @@ std::vector<sycl::event> _populate_packed_shapes_strides_for_indexing(
                   packed_host_axes_shapes_strides_shp->begin() + k);
     }
     else {
+        // FIXME: this pointer was not allocated in this function
+        // the caller should be freeing it
         sycl::free(device_orthog_shapes_strides, exec_q);
         throw std::runtime_error("Invalid array encountered");
     }
@@ -190,6 +192,8 @@ std::vector<sycl::event> _populate_packed_shapes_strides_for_indexing(
                   packed_host_axes_shapes_strides_shp->begin() + 2 * k);
     }
    else {
+        // FIXME: this pointer was not allocated in this function
+        // the caller should be freeing it
         sycl::free(device_orthog_shapes_strides, exec_q);
         throw std::runtime_error("Invalid array encountered");
     }
@@ -255,6 +259,8 @@ std::vector<sycl::event> _populate_packed_shapes_strides_for_indexing(
                   packed_host_axes_shapes_strides_shp->begin() + 2);
     }
     else {
+        // FIXME: memory was not allocated in this function
+        // it should be freed by the caller
         sycl::free(device_orthog_shapes_strides, exec_q);
         throw std::runtime_error("Invalid array encountered");
     }
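
The FIXME comments above flag an ownership inversion: device_orthog_shapes_strides is allocated by the caller, yet this helper frees it on its error path. A minimal sketch of how such allocations could instead be owned by an RAII wrapper, so no function ever frees memory it did not allocate. This is illustrative only; usm_unique_ptr and make_usm_device are hypothetical names, not part of dpctl, and the sketch assumes standard SYCL 2020 USM APIs.

#include <cstddef>
#include <memory>
#include <stdexcept>
#include <sycl/sycl.hpp>

// Deleter that releases a USM allocation against its owning queue.
struct USMDeleter
{
    sycl::queue q;
    void operator()(void *ptr) const { sycl::free(ptr, q); }
};

template <typename T> using usm_unique_ptr = std::unique_ptr<T, USMDeleter>;

// Allocate device USM and wrap it so that stack unwinding frees it.
template <typename T>
usm_unique_ptr<T> make_usm_device(std::size_t count, sycl::queue &q)
{
    T *ptr = sycl::malloc_device<T>(count, q);
    if (ptr == nullptr) {
        throw std::runtime_error("USM device allocation failed");
    }
    return usm_unique_ptr<T>(ptr, USMDeleter{q});
}

With the caller holding such a wrapper, the error branch here could simply throw and let unwinding release the buffer, removing the need to free a pointer this function does not own.
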
@@ -292,23 +298,24 @@ std::vector<dpctl::tensor::usm_ndarray> parse_py_ind(const sycl::queue &q,
     std::vector<dpctl::tensor::usm_ndarray> res;
     res.reserve(ind_count);
 
-    bool acquired = false;
+    bool nd_is_known = false;
     int nd = -1;
     for (size_t i = 0; i < ind_count; ++i) {
-        auto el_i = py_ind[py::cast(i)];
-        auto arr_i = py::cast<dpctl::tensor::usm_ndarray>(el_i);
+        py::object el_i = py_ind[py::cast(i)];
+        dpctl::tensor::usm_ndarray arr_i =
+            py::cast<dpctl::tensor::usm_ndarray>(el_i);
 
         if (!dpctl::utils::queues_are_compatible(q, {arr_i})) {
             throw py::value_error("Index allocation queue is not compatible "
                                   "with execution queue");
         }
-        if (acquired) {
+        if (nd_is_known) {
             if (nd != arr_i.get_ndim()) {
                 throw py::value_error(
                     "Indices must have the same number of dimensions.");
             }
         }
         else {
-            acquired = true;
+            nd_is_known = true;
             nd = arr_i.get_ndim();
         }
         res.push_back(arr_i);
@@ -558,6 +565,7 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src,
         sycl::malloc_device<py::ssize_t>((k + 1) * ind_sh_elems, exec_q);
 
     if (packed_ind_shapes_strides == nullptr) {
+        sycl::free(packed_ind_ptrs, exec_q);
         throw std::runtime_error(
             "Unable to allocate packed_ind_shapes_strides device memory");
     }
@@ -566,6 +574,8 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src,
         sycl::malloc_device<py::ssize_t>(k, exec_q);
 
     if (packed_ind_offsets == nullptr) {
+        sycl::free(packed_ind_ptrs, exec_q);
+        sycl::free(packed_ind_shapes_strides, exec_q);
         throw std::runtime_error(
             "Unable to allocate packed_ind_offsets device memory");
     }
@@ -595,33 +605,29 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src,
     std::copy(ind_offsets.begin(), ind_offsets.end(),
               host_ind_offsets_shp->begin());
 
-    std::vector<sycl::event> host_task_events(5);
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(5);
 
     sycl::event packed_ind_ptrs_copy_ev = exec_q.copy<char *>(
         host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size());
-    sycl::event ind_ptrs_host_task = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(packed_ind_ptrs_copy_ev);
-        cgh.host_task([host_ind_ptrs_shp]() {});
-    });
-    host_task_events.push_back(ind_ptrs_host_task);
 
     sycl::event packed_ind_shapes_strides_copy_ev = exec_q.copy<py::ssize_t>(
         host_ind_shapes_strides_shp->data(), packed_ind_shapes_strides,
         host_ind_shapes_strides_shp->size());
-    sycl::event ind_sh_st_host_task = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(packed_ind_shapes_strides_copy_ev);
-        cgh.host_task([host_ind_shapes_strides_shp]() {});
-    });
-    host_task_events.push_back(ind_sh_st_host_task);
 
     sycl::event packed_ind_offsets_copy_ev = exec_q.copy<py::ssize_t>(
         host_ind_offsets_shp->data(), packed_ind_offsets,
         host_ind_offsets_shp->size());
-    sycl::event ind_offsets_host_task = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(packed_ind_offsets_copy_ev);
-        cgh.host_task([host_ind_offsets_shp]() {});
-    });
-    host_task_events.push_back(ind_offsets_host_task);
+
+    sycl::event shared_ptr_cleanup_host_task =
+        exec_q.submit([&](sycl::handler &cgh) {
+            cgh.depends_on({packed_ind_offsets_copy_ev,
+                            packed_ind_shapes_strides_copy_ev,
+                            packed_ind_ptrs_copy_ev});
+            cgh.host_task([host_ind_offsets_shp, host_ind_shapes_strides_shp,
+                           host_ind_ptrs_shp]() {});
+        });
+    host_task_events.push_back(shared_ptr_cleanup_host_task);
 
     std::vector<sycl::event> ind_pack_depends{packed_ind_ptrs_copy_ev,
                                               packed_ind_shapes_strides_copy_ev,
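
The hunk above replaces three single-purpose host tasks with one host task that depends on all three copy events and captures all three shared_ptr-owned host buffers, extending their lifetime until the device copies complete. A self-contained sketch of this lifetime-extension pattern, assuming only standard SYCL 2020 APIs (queue::copy, handler::host_task); buffer names and sizes are illustrative:

#include <memory>
#include <stdexcept>
#include <vector>
#include <sycl/sycl.hpp>

int main()
{
    sycl::queue q;

    auto host_buf = std::make_shared<std::vector<int>>(1024, 42);
    int *dev_buf = sycl::malloc_device<int>(host_buf->size(), q);
    if (dev_buf == nullptr) {
        throw std::runtime_error("Unable to allocate device memory");
    }

    // Asynchronous copy reading from the shared_ptr-owned host buffer
    sycl::event copy_ev =
        q.copy<int>(host_buf->data(), dev_buf, host_buf->size());

    // Capturing host_buf by value bumps its refcount: the host vector
    // cannot be destroyed before the copy completes, even if host_buf
    // goes out of scope on the calling thread first.
    sycl::event cleanup_ev = q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(copy_ev);
        cgh.host_task([host_buf]() {});
    });

    cleanup_ev.wait();
    sycl::free(dev_buf, q);
    return 0;
}

One submission instead of three reduces scheduling overhead while preserving the same guarantee: every host buffer outlives every copy that reads from it.
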
@@ -643,6 +649,10 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src,
         sycl::malloc_device<py::ssize_t>(3 * sh_elems, exec_q);
 
     if (packed_shapes_strides == nullptr) {
+        sycl::event::wait(host_task_events);
+        sycl::free(packed_ind_ptrs, exec_q);
+        sycl::free(packed_ind_shapes_strides, exec_q);
+        sycl::free(packed_ind_offsets, exec_q);
         throw std::runtime_error(
             "Unable to allocate packed_shapes_strides device memory");
     }
@@ -654,6 +664,11 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src,
         sycl::malloc_device<py::ssize_t>((2 * k) + ind_sh_elems, exec_q);
 
     if (packed_axes_shapes_strides == nullptr) {
+        sycl::event::wait(host_task_events);
+        sycl::free(packed_ind_ptrs, exec_q);
+        sycl::free(packed_ind_shapes_strides, exec_q);
+        sycl::free(packed_ind_offsets, exec_q);
+        sycl::free(packed_shapes_strides, exec_q);
         throw std::runtime_error(
             "Unable to allocate packed_axes_shapes_strides device memory");
     }
@@ -665,8 +680,9 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src,
         is_src_f_contig, dst_shape, dst_strides, is_dst_c_contig,
         is_dst_f_contig, axis_start, k, ind_nd, src_nd, dst_nd);
 
-    std::vector<sycl::event> all_deps(depends.size() + ind_pack_depends.size() +
-                                      src_dst_pack_deps.size());
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + ind_pack_depends.size() +
+                     src_dst_pack_deps.size());
     all_deps.insert(std::end(all_deps), std::begin(ind_pack_depends),
                     std::end(ind_pack_depends));
     all_deps.insert(std::end(all_deps), std::begin(src_dst_pack_deps),
@@ -676,6 +692,12 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src,
     auto fn = take_dispatch_table[mode][src_type_id][ind_type_id];
 
     if (fn == nullptr) {
+        sycl::event::wait(host_task_events);
+        sycl::free(packed_ind_ptrs, exec_q);
+        sycl::free(packed_ind_shapes_strides, exec_q);
+        sycl::free(packed_ind_offsets, exec_q);
+        sycl::free(packed_shapes_strides, exec_q);
+        sycl::free(packed_axes_shapes_strides, exec_q);
         throw std::runtime_error("Indices must be integer type, got " +
                                  std::to_string(ind_type_id));
     }
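
Each allocation-failure and dispatch-failure path now drains the outstanding host tasks and frees every temporary allocated so far before throwing. If these blocks keep growing, they could be collapsed into a shared helper. A hypothetical sketch; cleanup_and_throw is not part of this patch, only the sycl::event::wait and sycl::free calls it wraps come from the surrounding code:

#include <initializer_list>
#include <stdexcept>
#include <string>
#include <vector>
#include <sycl/sycl.hpp>

// Wait for pending host tasks, free whichever packed temporaries have
// been allocated so far, then raise the error.
[[noreturn]] void cleanup_and_throw(sycl::queue &exec_q,
                                    std::vector<sycl::event> &host_task_events,
                                    std::initializer_list<void *> allocations,
                                    const std::string &what)
{
    // Host tasks still reference the shared_ptr-owned host buffers;
    // let them drain before releasing anything.
    sycl::event::wait(host_task_events);
    for (void *ptr : allocations) {
        if (ptr != nullptr) {
            sycl::free(ptr, exec_q);
        }
    }
    throw std::runtime_error(what);
}

The branch above would then reduce to a single call, e.g. cleanup_and_throw(exec_q, host_task_events, {packed_ind_ptrs, packed_ind_shapes_strides, packed_ind_offsets, packed_shapes_strides, packed_axes_shapes_strides}, "Indices must be integer type, got " + std::to_string(ind_type_id)).
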
@@ -687,7 +709,7 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src,
         src_offset, dst_offset, packed_ind_offsets, all_deps);
 
     // free packed temporaries
-    exec_q.submit([&](sycl::handler &cgh) {
+    sycl::event temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
         cgh.depends_on(take_generic_ev);
         auto ctx = exec_q.get_context();
         cgh.host_task([packed_shapes_strides, packed_axes_shapes_strides,
@@ -700,12 +722,16 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src,
             sycl::free(packed_ind_offsets, ctx);
         });
     });
-    host_task_events.push_back(take_generic_ev);
 
-    sycl::event host_task_ev =
-        keep_args_alive(exec_q, {src, py_ind, dst}, host_task_events);
+    sycl::event::wait(host_task_events);
+    sycl::event::wait({take_generic_ev, temporaries_cleanup_ev});
+
+    /*
+    sycl::event host_task_ev = keep_args_alive(exec_q, {src, py_ind, dst},
+                                               {temporaries_cleanup_ev});
+    */
 
-    return std::make_pair(host_task_ev, take_generic_ev);
+    return std::make_pair(sycl::event(), temporaries_cleanup_ev);
 }
 
 std::pair<sycl::event, sycl::event>
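
With keep_args_alive commented out, usm_ndarray_take now blocks until all work and cleanup have finished, then returns a default-constructed sycl::event as the first element of the pair. Under SYCL 2020 semantics a default-constructed event has no associated command and is already complete, so callers waiting on it return immediately. A small illustration of that behavior, assuming standard SYCL 2020 event info queries:

#include <iostream>
#include <sycl/sycl.hpp>

int main()
{
    sycl::event empty_ev; // no associated command

    // Completes immediately; there is nothing to wait for.
    empty_ev.wait();

    auto status =
        empty_ev.get_info<sycl::info::event::command_execution_status>();
    std::cout << "complete: "
              << (status == sycl::info::event_command_status::complete)
              << std::endl;
    return 0;
}
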
@@ -951,6 +977,7 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst,
         sycl::malloc_device<py::ssize_t>((k + 1) * ind_sh_elems, exec_q);
 
     if (packed_ind_shapes_strides == nullptr) {
+        sycl::free(packed_ind_ptrs, exec_q);
         throw std::runtime_error(
             "Unable to allocate packed_ind_shapes_strides device memory");
     }
@@ -959,6 +986,8 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst,
         sycl::malloc_device<py::ssize_t>(k, exec_q);
 
     if (packed_ind_offsets == nullptr) {
+        sycl::free(packed_ind_ptrs, exec_q);
+        sycl::free(packed_ind_shapes_strides, exec_q);
         throw std::runtime_error(
             "Unable to allocate packed_ind_offsets device memory");
     }
@@ -988,7 +1017,8 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst,
     std::copy(ind_offsets.begin(), ind_offsets.end(),
               host_ind_offsets_shp->begin());
 
-    std::vector<sycl::event> host_task_events(5);
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(7);
 
     sycl::event device_ind_ptrs_copy_ev = exec_q.copy<char *>(
         host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size());
@@ -1036,6 +1066,10 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst,
         sycl::malloc_device<py::ssize_t>(3 * sh_elems, exec_q);
 
     if (packed_shapes_strides == nullptr) {
+        sycl::event::wait(host_task_events);
+        sycl::free(packed_ind_ptrs, exec_q);
+        sycl::free(packed_ind_shapes_strides, exec_q);
+        sycl::free(packed_ind_offsets, exec_q);
         throw std::runtime_error(
             "Unable to allocate packed_shapes_strides device memory");
     }
@@ -1047,6 +1081,11 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst,
         sycl::malloc_device<py::ssize_t>((2 * k) + ind_sh_elems, exec_q);
 
     if (packed_axes_shapes_strides == nullptr) {
+        sycl::event::wait(host_task_events);
+        sycl::free(packed_shapes_strides, exec_q);
+        sycl::free(packed_ind_ptrs, exec_q);
+        sycl::free(packed_ind_shapes_strides, exec_q);
+        sycl::free(packed_ind_offsets, exec_q);
         throw std::runtime_error(
             "Unable to allocate packed_axes_shapes_strides device memory");
     }
@@ -1070,6 +1109,13 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst,
     auto fn = put_dispatch_table[mode][dst_type_id][ind_type_id];
 
     if (fn == nullptr) {
+        sycl::event::wait(host_task_events);
+        sycl::free(packed_shapes_strides, exec_q);
+        sycl::free(packed_axes_shapes_strides, exec_q);
+        sycl::free(packed_ind_shapes_strides, exec_q);
+        sycl::free(packed_ind_ptrs, exec_q);
+        sycl::free(packed_ind_offsets, exec_q);
+
         throw std::runtime_error("Indices must be integer type, got " +
                                  std::to_string(ind_type_id));
     }
@@ -1081,9 +1127,10 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst,
         dst_offset, val_offset, packed_ind_offsets, all_deps);
 
     // free packed temporaries
-    auto ctx = exec_q.get_context();
-    exec_q.submit([&](sycl::handler &cgh) {
+
+    sycl::event temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
         cgh.depends_on(put_generic_ev);
+        auto ctx = exec_q.get_context();
         cgh.host_task([packed_shapes_strides, packed_axes_shapes_strides,
                        packed_ind_shapes_strides, packed_ind_ptrs,
                        packed_ind_offsets, ctx]() {
@@ -1094,11 +1141,17 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst,
             sycl::free(packed_ind_offsets, ctx);
         });
     });
-    host_task_events.push_back(put_generic_ev);
 
-    return std::make_pair(
-        keep_args_alive(exec_q, {dst, py_ind, val}, host_task_events),
-        put_generic_ev);
+    sycl::event::wait(host_task_events);
+    sycl::event::wait({put_generic_ev, temporaries_cleanup_ev});
+
+    /*
+    sycl::event py_obj_cleanup_ev =
+        keep_args_alive(exec_q, {dst, py_ind, val},
+                        {put_generic_ev, temporaries_cleanup_ev});
+    */
+
+    return std::make_pair(sycl::event(), temporaries_cleanup_ev);
 }
 
 void init_advanced_indexing_dispatch_tables(void)