Skip to content

Commit 46d8a8d

Browse files
Optimized transfer of shape/strides to kernel in copy from ndarray
Replaced the three queue.copy calls that copied the host metadata (shape, src_strides, and dst_strides) into the USM allocation used by the copy_and_cast kernel with a single queue.copy call: the three arrays are now packed into one vector on the host, and that packed host vector is copied to the USM allocation in one operation.
1 parent 22cdb5a commit 46d8a8d

File tree

1 file changed

+11
-36
lines changed

1 file changed

+11
-36
lines changed

dpctl/tensor/libtensor/source/tensor_py.cpp

Lines changed: 11 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1331,11 +1331,6 @@ void copy_numpy_ndarray_into_usm_ndarray(
13311331
// Create shared pointers with shape and src/dst strides, copy into device
13321332
// memory
13331333
using shT = std::vector<py::ssize_t>;
1334-
std::shared_ptr<shT> shp_shape = std::make_shared<shT>(simplified_shape);
1335-
std::shared_ptr<shT> shp_src_strides =
1336-
std::make_shared<shT>(simplified_src_strides);
1337-
std::shared_ptr<shT> shp_dst_strides =
1338-
std::make_shared<shT>(simplified_dst_strides);
13391334

13401335
// Get implementation function pointer
13411336
auto copy_and_cast_from_host_blocking_fn =
@@ -1351,42 +1346,22 @@ void copy_numpy_ndarray_into_usm_ndarray(
13511346
throw std::runtime_error("Unabled to allocate device memory");
13521347
}
13531348

1354-
sycl::event copy_shape_ev =
1355-
exec_q.copy<py::ssize_t>(shp_shape->data(), shape_strides, nd);
1356-
1357-
exec_q.submit([&](sycl::handler &cgh) {
1358-
cgh.depends_on(copy_shape_ev);
1359-
cgh.host_task([shp_shape]() {
1360-
// increment shared pointer ref-count to keep it alive
1361-
// till copy operation completes;
1362-
});
1363-
});
1364-
1365-
sycl::event copy_src_strides_ev = exec_q.copy<py::ssize_t>(
1366-
shp_src_strides->data(), shape_strides + nd, nd);
1367-
exec_q.submit([&](sycl::handler &cgh) {
1368-
cgh.depends_on(copy_src_strides_ev);
1369-
cgh.host_task([shp_src_strides]() {
1370-
// increment shared pointer ref-count to keep it alive
1371-
// till copy operation completes;
1372-
});
1373-
});
1349+
std::shared_ptr<shT> host_shape_strides_shp = std::make_shared<shT>(3 * nd);
1350+
std::copy(simplified_shape.begin(), simplified_shape.end(),
1351+
host_shape_strides_shp->begin());
1352+
std::copy(simplified_src_strides.begin(), simplified_src_strides.end(),
1353+
host_shape_strides_shp->begin() + nd);
1354+
std::copy(simplified_dst_strides.begin(), simplified_dst_strides.end(),
1355+
host_shape_strides_shp->begin() + 2 * nd);
13741356

1375-
sycl::event copy_dst_strides_ev = exec_q.copy<py::ssize_t>(
1376-
shp_dst_strides->data(), shape_strides + 2 * nd, nd);
1377-
exec_q.submit([&](sycl::handler &cgh) {
1378-
cgh.depends_on(copy_dst_strides_ev);
1379-
cgh.host_task([shp_dst_strides]() {
1380-
// increment shared pointer ref-count to keep it alive
1381-
// till copy operation completes;
1382-
});
1383-
});
1357+
sycl::event copy_packed_ev =
1358+
exec_q.copy<py::ssize_t>(host_shape_strides_shp->data(), shape_strides,
1359+
host_shape_strides_shp->size());
13841360

13851361
copy_and_cast_from_host_blocking_fn(
13861362
exec_q, src_nelems, nd, shape_strides, src_data, src_offset,
13871363
npy_src_min_nelem_offset, npy_src_max_nelem_offset, dst_data,
1388-
dst_offset, depends,
1389-
{copy_shape_ev, copy_src_strides_ev, copy_dst_strides_ev});
1364+
dst_offset, depends, {copy_packed_ev});
13901365

13911366
sycl::free(shape_strides, exec_q);
13921367

0 commit comments

Comments
 (0)