Skip to content

Commit b69a415

Browse files
Make both _take and _put effectively synchronous
They still return a pair of events, but those are always in a compelte state.
1 parent f84239f commit b69a415

File tree

1 file changed

+88
-35
lines changed

1 file changed

+88
-35
lines changed

dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp

Lines changed: 88 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,8 @@ std::vector<sycl::event> _populate_packed_shapes_strides_for_indexing(
143143
packed_host_axes_shapes_strides_shp->begin() + k);
144144
}
145145
else {
146+
// FIXME: this pointer was not allocated in this function
147+
// the caller should be freeing it
146148
sycl::free(device_orthog_shapes_strides, exec_q);
147149
throw std::runtime_error("Invalid array encountered");
148150
}
@@ -190,6 +192,8 @@ std::vector<sycl::event> _populate_packed_shapes_strides_for_indexing(
190192
packed_host_axes_shapes_strides_shp->begin() + 2 * k);
191193
}
192194
else {
195+
// FIXME: this pointer was not allocated in this function
196+
// the caller should be freeing it
193197
sycl::free(device_orthog_shapes_strides, exec_q);
194198
throw std::runtime_error("Invalid array encountered");
195199
}
@@ -255,6 +259,8 @@ std::vector<sycl::event> _populate_packed_shapes_strides_for_indexing(
255259
packed_host_axes_shapes_strides_shp->begin() + 2);
256260
}
257261
else {
262+
// FIXME: memory was not allocated in this function
263+
// it should be freed by the caller
258264
sycl::free(device_orthog_shapes_strides, exec_q);
259265
throw std::runtime_error("Invalid array encountered");
260266
}
@@ -292,23 +298,24 @@ std::vector<dpctl::tensor::usm_ndarray> parse_py_ind(const sycl::queue &q,
292298
std::vector<dpctl::tensor::usm_ndarray> res;
293299
res.reserve(ind_count);
294300

295-
bool acquired = false;
301+
bool nd_is_known = false;
296302
int nd = -1;
297303
for (size_t i = 0; i < ind_count; ++i) {
298-
auto el_i = py_ind[py::cast(i)];
299-
auto arr_i = py::cast<dpctl::tensor::usm_ndarray>(el_i);
304+
py::object el_i = py_ind[py::cast(i)];
305+
dpctl::tensor::usm_ndarray arr_i =
306+
py::cast<dpctl::tensor::usm_ndarray>(el_i);
300307
if (!dpctl::utils::queues_are_compatible(q, {arr_i})) {
301308
throw py::value_error("Index allocation queue is not compatible "
302309
"with execution queue");
303310
}
304-
if (acquired) {
311+
if (nd_is_known) {
305312
if (nd != arr_i.get_ndim()) {
306313
throw py::value_error(
307314
"Indices must have the same number of dimensions.");
308315
}
309316
}
310317
else {
311-
acquired = true;
318+
nd_is_known = true;
312319
nd = arr_i.get_ndim();
313320
}
314321
res.push_back(arr_i);
@@ -558,6 +565,7 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src,
558565
sycl::malloc_device<py::ssize_t>((k + 1) * ind_sh_elems, exec_q);
559566

560567
if (packed_ind_shapes_strides == nullptr) {
568+
sycl::free(packed_ind_ptrs, exec_q);
561569
throw std::runtime_error(
562570
"Unable to allocate packed_ind_shapes_strides device memory");
563571
}
@@ -566,6 +574,8 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src,
566574
sycl::malloc_device<py::ssize_t>(k, exec_q);
567575

568576
if (packed_ind_offsets == nullptr) {
577+
sycl::free(packed_ind_ptrs, exec_q);
578+
sycl::free(packed_ind_shapes_strides, exec_q);
569579
throw std::runtime_error(
570580
"Unable to allocate packed_ind_offsets device memory");
571581
}
@@ -595,33 +605,29 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src,
595605
std::copy(ind_offsets.begin(), ind_offsets.end(),
596606
host_ind_offsets_shp->begin());
597607

598-
std::vector<sycl::event> host_task_events(5);
608+
std::vector<sycl::event> host_task_events;
609+
host_task_events.reserve(5);
599610

600611
sycl::event packed_ind_ptrs_copy_ev = exec_q.copy<char *>(
601612
host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size());
602-
sycl::event ind_ptrs_host_task = exec_q.submit([&](sycl::handler &cgh) {
603-
cgh.depends_on(packed_ind_ptrs_copy_ev);
604-
cgh.host_task([host_ind_ptrs_shp]() {});
605-
});
606-
host_task_events.push_back(ind_ptrs_host_task);
607613

608614
sycl::event packed_ind_shapes_strides_copy_ev = exec_q.copy<py::ssize_t>(
609615
host_ind_shapes_strides_shp->data(), packed_ind_shapes_strides,
610616
host_ind_shapes_strides_shp->size());
611-
sycl::event ind_sh_st_host_task = exec_q.submit([&](sycl::handler &cgh) {
612-
cgh.depends_on(packed_ind_shapes_strides_copy_ev);
613-
cgh.host_task([host_ind_shapes_strides_shp]() {});
614-
});
615-
host_task_events.push_back(ind_sh_st_host_task);
616617

617618
sycl::event packed_ind_offsets_copy_ev = exec_q.copy<py::ssize_t>(
618619
host_ind_offsets_shp->data(), packed_ind_offsets,
619620
host_ind_offsets_shp->size());
620-
sycl::event ind_offsets_host_task = exec_q.submit([&](sycl::handler &cgh) {
621-
cgh.depends_on(packed_ind_offsets_copy_ev);
622-
cgh.host_task([host_ind_offsets_shp]() {});
623-
});
624-
host_task_events.push_back(ind_offsets_host_task);
621+
622+
sycl::event shared_ptr_cleanup_host_task =
623+
exec_q.submit([&](sycl::handler &cgh) {
624+
cgh.depends_on({packed_ind_offsets_copy_ev,
625+
packed_ind_shapes_strides_copy_ev,
626+
packed_ind_ptrs_copy_ev});
627+
cgh.host_task([host_ind_offsets_shp, host_ind_shapes_strides_shp,
628+
host_ind_ptrs_shp]() {});
629+
});
630+
host_task_events.push_back(shared_ptr_cleanup_host_task);
625631

626632
std::vector<sycl::event> ind_pack_depends{packed_ind_ptrs_copy_ev,
627633
packed_ind_shapes_strides_copy_ev,
@@ -643,6 +649,10 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src,
643649
sycl::malloc_device<py::ssize_t>(3 * sh_elems, exec_q);
644650

645651
if (packed_shapes_strides == nullptr) {
652+
sycl::event::wait(host_task_events);
653+
sycl::free(packed_ind_ptrs, exec_q);
654+
sycl::free(packed_ind_shapes_strides, exec_q);
655+
sycl::free(packed_ind_offsets, exec_q);
646656
throw std::runtime_error(
647657
"Unable to allocate packed_shapes_strides device memory");
648658
}
@@ -654,6 +664,11 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src,
654664
sycl::malloc_device<py::ssize_t>((2 * k) + ind_sh_elems, exec_q);
655665

656666
if (packed_axes_shapes_strides == nullptr) {
667+
sycl::event::wait(host_task_events);
668+
sycl::free(packed_ind_ptrs, exec_q);
669+
sycl::free(packed_ind_shapes_strides, exec_q);
670+
sycl::free(packed_ind_offsets, exec_q);
671+
sycl::free(packed_shapes_strides, exec_q);
657672
throw std::runtime_error(
658673
"Unable to allocate packed_axes_shapes_strides device memory");
659674
}
@@ -665,8 +680,9 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src,
665680
is_src_f_contig, dst_shape, dst_strides, is_dst_c_contig,
666681
is_dst_f_contig, axis_start, k, ind_nd, src_nd, dst_nd);
667682

668-
std::vector<sycl::event> all_deps(depends.size() + ind_pack_depends.size() +
669-
src_dst_pack_deps.size());
683+
std::vector<sycl::event> all_deps;
684+
all_deps.reserve(depends.size() + ind_pack_depends.size() +
685+
src_dst_pack_deps.size());
670686
all_deps.insert(std::end(all_deps), std::begin(ind_pack_depends),
671687
std::end(ind_pack_depends));
672688
all_deps.insert(std::end(all_deps), std::begin(src_dst_pack_deps),
@@ -676,6 +692,12 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src,
676692
auto fn = take_dispatch_table[mode][src_type_id][ind_type_id];
677693

678694
if (fn == nullptr) {
695+
sycl::event::wait(host_task_events);
696+
sycl::free(packed_ind_ptrs, exec_q);
697+
sycl::free(packed_ind_shapes_strides, exec_q);
698+
sycl::free(packed_ind_offsets, exec_q);
699+
sycl::free(packed_shapes_strides, exec_q);
700+
sycl::free(packed_axes_shapes_strides, exec_q);
679701
throw std::runtime_error("Indices must be integer type, got " +
680702
std::to_string(ind_type_id));
681703
}
@@ -687,7 +709,7 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src,
687709
src_offset, dst_offset, packed_ind_offsets, all_deps);
688710

689711
// free packed temporaries
690-
exec_q.submit([&](sycl::handler &cgh) {
712+
sycl::event temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
691713
cgh.depends_on(take_generic_ev);
692714
auto ctx = exec_q.get_context();
693715
cgh.host_task([packed_shapes_strides, packed_axes_shapes_strides,
@@ -700,12 +722,16 @@ usm_ndarray_take(dpctl::tensor::usm_ndarray src,
700722
sycl::free(packed_ind_offsets, ctx);
701723
});
702724
});
703-
host_task_events.push_back(take_generic_ev);
704725

705-
sycl::event host_task_ev =
706-
keep_args_alive(exec_q, {src, py_ind, dst}, host_task_events);
726+
sycl::event::wait(host_task_events);
727+
sycl::event::wait({take_generic_ev, temporaries_cleanup_ev});
728+
729+
/*
730+
sycl::event host_task_ev = keep_args_alive(exec_q, {src, py_ind, dst},
731+
{temporaries_cleanup_ev});
732+
*/
707733

708-
return std::make_pair(host_task_ev, take_generic_ev);
734+
return std::make_pair(sycl::event(), temporaries_cleanup_ev);
709735
}
710736

711737
std::pair<sycl::event, sycl::event>
@@ -951,6 +977,7 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst,
951977
sycl::malloc_device<py::ssize_t>((k + 1) * ind_sh_elems, exec_q);
952978

953979
if (packed_ind_shapes_strides == nullptr) {
980+
sycl::free(packed_ind_ptrs, exec_q);
954981
throw std::runtime_error(
955982
"Unable to allocate packed_ind_shapes_strides device memory");
956983
}
@@ -959,6 +986,8 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst,
959986
sycl::malloc_device<py::ssize_t>(k, exec_q);
960987

961988
if (packed_ind_offsets == nullptr) {
989+
sycl::free(packed_ind_ptrs, exec_q);
990+
sycl::free(packed_ind_shapes_strides, exec_q);
962991
throw std::runtime_error(
963992
"Unable to allocate packed_ind_offsets device memory");
964993
}
@@ -988,7 +1017,8 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst,
9881017
std::copy(ind_offsets.begin(), ind_offsets.end(),
9891018
host_ind_offsets_shp->begin());
9901019

991-
std::vector<sycl::event> host_task_events(5);
1020+
std::vector<sycl::event> host_task_events;
1021+
host_task_events.reserve(7);
9921022

9931023
sycl::event device_ind_ptrs_copy_ev = exec_q.copy<char *>(
9941024
host_ind_ptrs_shp->data(), packed_ind_ptrs, host_ind_ptrs_shp->size());
@@ -1036,6 +1066,10 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst,
10361066
sycl::malloc_device<py::ssize_t>(3 * sh_elems, exec_q);
10371067

10381068
if (packed_shapes_strides == nullptr) {
1069+
sycl::event::wait(host_task_events);
1070+
sycl::free(packed_ind_ptrs, exec_q);
1071+
sycl::free(packed_ind_shapes_strides, exec_q);
1072+
sycl::free(packed_ind_offsets, exec_q);
10391073
throw std::runtime_error(
10401074
"Unable to allocate packed_shapes_strides device memory");
10411075
}
@@ -1047,6 +1081,11 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst,
10471081
sycl::malloc_device<py::ssize_t>((2 * k) + ind_sh_elems, exec_q);
10481082

10491083
if (packed_axes_shapes_strides == nullptr) {
1084+
sycl::event::wait(host_task_events);
1085+
sycl::free(packed_shapes_strides, exec_q);
1086+
sycl::free(packed_ind_ptrs, exec_q);
1087+
sycl::free(packed_ind_shapes_strides, exec_q);
1088+
sycl::free(packed_ind_offsets, exec_q);
10501089
throw std::runtime_error(
10511090
"Unable to allocate packed_axes_shapes_strides device memory");
10521091
}
@@ -1070,6 +1109,13 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst,
10701109
auto fn = put_dispatch_table[mode][dst_type_id][ind_type_id];
10711110

10721111
if (fn == nullptr) {
1112+
sycl::event::wait(host_task_events);
1113+
sycl::free(packed_shapes_strides, exec_q);
1114+
sycl::free(packed_axes_shapes_strides, exec_q);
1115+
sycl::free(packed_ind_shapes_strides, exec_q);
1116+
sycl::free(packed_ind_ptrs, exec_q);
1117+
sycl::free(packed_ind_offsets, exec_q);
1118+
10731119
throw std::runtime_error("Indices must be integer type, got " +
10741120
std::to_string(ind_type_id));
10751121
}
@@ -1081,9 +1127,10 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst,
10811127
dst_offset, val_offset, packed_ind_offsets, all_deps);
10821128

10831129
// free packed temporaries
1084-
auto ctx = exec_q.get_context();
1085-
exec_q.submit([&](sycl::handler &cgh) {
1130+
1131+
sycl::event temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
10861132
cgh.depends_on(put_generic_ev);
1133+
auto ctx = exec_q.get_context();
10871134
cgh.host_task([packed_shapes_strides, packed_axes_shapes_strides,
10881135
packed_ind_shapes_strides, packed_ind_ptrs,
10891136
packed_ind_offsets, ctx]() {
@@ -1094,11 +1141,17 @@ usm_ndarray_put(dpctl::tensor::usm_ndarray dst,
10941141
sycl::free(packed_ind_offsets, ctx);
10951142
});
10961143
});
1097-
host_task_events.push_back(put_generic_ev);
10981144

1099-
return std::make_pair(
1100-
keep_args_alive(exec_q, {dst, py_ind, val}, host_task_events),
1101-
put_generic_ev);
1145+
sycl::event::wait(host_task_events);
1146+
sycl::event::wait({put_generic_ev, temporaries_cleanup_ev});
1147+
1148+
/*
1149+
sycl::event py_obj_cleanup_ev =
1150+
keep_args_alive(exec_q, {dst, py_ind, val},
1151+
{put_generic_ev, temporaries_cleanup_ev});
1152+
*/
1153+
1154+
return std::make_pair(sycl::event(), temporaries_cleanup_ev);
11021155
}
11031156

11041157
void init_advanced_indexing_dispatch_tables(void)

0 commit comments

Comments
 (0)