
Commit 3a4d8ab

oleksandr-pavlyk authored and ndgrigorian committed
Use unique_ptr as temporary owner of USM allocation
The unique_ptr owns the USM allocation until it is passed over to the host task, at which point the unique_ptr's ownership is released. Also reduced allocation sizes where too much was being allocated.

Introduce smart_malloc_device, etc. smart_malloc_device<T>(count, q) makes a USM allocation and returns a unique_ptr<T, USMDeleter> which owns the allocation. The function throws std::runtime_error if the USM allocation is not successful.

Introduce async_smart_free. This function is intended to replace the use of host_task submissions for managing deallocation of USM temporaries. Usage is as follows:

```
// returns unique_ptr
auto alloc_owner = smart_malloc_device<T>(count, q);

// get raw pointer for use in kernels
T *data = alloc_owner.get();

[..SNIP..]

// submit host_task that releases the unique_ptr
// after the host task has been successfully submitted
// and ownership of the USM allocation is transferred to
// the said host task
sycl::event ht_ev = async_smart_free(q, dependent_events, alloc_owner);

[...SNIP...]
```
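For context, below is a minimal self-contained sketch of the intended pattern. It assumes the utilities are reachable via the dpctl header "utils/sycl_alloc_utils.hpp" and a SYCL-enabled compiler; the fill_and_reduce function, its parameters, and the include path are illustrative only and not part of this commit.

```cpp
#include <cstddef>

#include <sycl/sycl.hpp>

// assumption: utilities from this commit are available via this header
#include "utils/sycl_alloc_utils.hpp"

namespace alloc_utils = dpctl::tensor::alloc_utils;

// out must be a USM pointer writable from the device
sycl::event fill_and_reduce(sycl::queue &q, float *out, std::size_t n)
{
    // unique_ptr<float, USMDeleter> owns the temporary USM-device allocation;
    // smart_malloc_device throws std::runtime_error if the allocation fails
    auto tmp_owner = alloc_utils::smart_malloc_device<float>(n, q);
    float *tmp = tmp_owner.get();

    // kernel writes into the temporary
    sycl::event fill_ev = q.fill<float>(tmp, 1.0f, n);

    // kernel consumes the temporary
    sycl::event sum_ev = q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(fill_ev);
        cgh.single_task([=]() {
            float acc = 0.0f;
            for (std::size_t i = 0; i < n; ++i) {
                acc += tmp[i];
            }
            out[0] = acc;
        });
    });

    // ownership of the temporary is handed to a host_task that frees it
    // once sum_ev completes; tmp_owner.release() is called internally
    sycl::event cleanup_ev =
        alloc_utils::async_smart_free(q, {sum_ev}, tmp_owner);

    return cleanup_ev;
}
```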
1 parent 823c201 commit 3a4d8ab

File tree: 3 files changed, +137 −92 lines


dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp

Lines changed: 21 additions & 45 deletions
```diff
@@ -1588,11 +1588,11 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q,
     using CountT = std::uint32_t;
 
     // memory for storing count and offset values
-    CountT *count_ptr =
-        sycl::malloc_device<CountT>(n_iters * n_counts, exec_q);
-    if (nullptr == count_ptr) {
-        throw std::runtime_error("Could not allocate USM-device memory");
-    }
+    auto count_owner =
+        dpctl::tensor::alloc_utils::smart_malloc_device<CountT>(
+            n_iters * n_counts, exec_q);
+
+    CountT *count_ptr = count_owner.get();
 
     constexpr std::uint32_t zero_radix_iter{0};
 
@@ -1605,25 +1605,17 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q,
             n_counts, count_ptr, proj_op,
            is_ascending, depends);
 
-        sort_ev = exec_q.submit([=](sycl::handler &cgh) {
-            cgh.depends_on(sort_ev);
-            const sycl::context &ctx = exec_q.get_context();
-
-            using dpctl::tensor::alloc_utils::sycl_free_noexcept;
-            cgh.host_task(
-                [ctx, count_ptr]() { sycl_free_noexcept(count_ptr, ctx); });
-        });
+        sort_ev = dpctl::tensor::alloc_utils::async_smart_free(
+            exec_q, {sort_ev}, count_owner);
 
         return sort_ev;
     }
 
-    ValueT *tmp_arr =
-        sycl::malloc_device<ValueT>(n_iters * n_to_sort, exec_q);
-    if (nullptr == tmp_arr) {
-        using dpctl::tensor::alloc_utils::sycl_free_noexcept;
-        sycl_free_noexcept(count_ptr, exec_q);
-        throw std::runtime_error("Could not allocate USM-device memory");
-    }
+    auto tmp_arr_owner =
+        dpctl::tensor::alloc_utils::smart_malloc_device<ValueT>(
+            n_iters * n_to_sort, exec_q);
+
+    ValueT *tmp_arr = tmp_arr_owner.get();
 
     // iterations per each bucket
     assert("Number of iterations must be even" && radix_iters % 2 == 0);
@@ -1657,17 +1649,8 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q,
            }
        }
 
-        sort_ev = exec_q.submit([=](sycl::handler &cgh) {
-            cgh.depends_on(sort_ev);
-
-            const sycl::context &ctx = exec_q.get_context();
-
-            using dpctl::tensor::alloc_utils::sycl_free_noexcept;
-            cgh.host_task([ctx, count_ptr, tmp_arr]() {
-                sycl_free_noexcept(tmp_arr, ctx);
-                sycl_free_noexcept(count_ptr, ctx);
-            });
-        });
+        sort_ev = dpctl::tensor::alloc_utils::async_smart_free(
+            exec_q, {sort_ev}, tmp_arr_owner, count_owner);
     }
 
     return sort_ev;
@@ -1769,13 +1752,12 @@ radix_argsort_axis1_contig_impl(sycl::queue &exec_q,
        reinterpret_cast<IndexTy *>(res_cp) + iter_res_offset + sort_res_offset;
 
     const std::size_t total_nelems = iter_nelems * sort_nelems;
-    const std::size_t padded_total_nelems = ((total_nelems + 63) / 64) * 64;
-    IndexTy *workspace = sycl::malloc_device<IndexTy>(
-        padded_total_nelems + total_nelems, exec_q);
+    auto workspace_owner =
+        dpctl::tensor::alloc_utils::smart_malloc_device<IndexTy>(total_nelems,
+                                                                 exec_q);
 
-    if (nullptr == workspace) {
-        throw std::runtime_error("Could not allocate workspace on device");
-    }
+    // get raw USM pointer
+    IndexTy *workspace = workspace_owner.get();
 
     using IdentityProjT = radix_sort_details::IdentityProj;
     using IndexedProjT =
@@ -1820,14 +1802,8 @@ radix_argsort_axis1_contig_impl(sycl::queue &exec_q,
         });
     });
 
-    sycl::event cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(map_back_ev);
-
-        const sycl::context &ctx = exec_q.get_context();
-
-        using dpctl::tensor::alloc_utils::sycl_free_noexcept;
-        cgh.host_task([ctx, workspace] { sycl_free_noexcept(workspace, ctx); });
-    });
+    sycl::event cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {map_back_ev}, workspace_owner);
 
     return cleanup_ev;
 }
```

dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp

Lines changed: 24 additions & 46 deletions
```diff
@@ -30,9 +30,10 @@
 #include <iterator>
 #include <limits>
 #include <stdexcept>
-#include <sycl/sycl.hpp>
 #include <vector>
 
+#include <sycl/sycl.hpp>
+
 #include "kernels/dpctl_tensor_types.hpp"
 #include "kernels/sorting/merge_sort.hpp"
 #include "kernels/sorting/radix_sort.hpp"
@@ -90,11 +91,11 @@ topk_full_merge_sort_impl(sycl::queue &exec_q,
                           const CompT &comp,
                           const std::vector<sycl::event> &depends)
 {
-    IndexTy *index_data =
-        sycl::malloc_device<IndexTy>(iter_nelems * axis_nelems, exec_q);
-    if (index_data == nullptr) {
-        throw std::runtime_error("Unable to allocate device_memory");
-    }
+    auto index_data_owner =
+        dpctl::tensor::alloc_utils::smart_malloc_device<IndexTy>(
+            iter_nelems * axis_nelems, exec_q);
+    // extract USM pointer
+    IndexTy *index_data = index_data_owner.get();
 
     using IotaKernelName = topk_populate_index_data_krn<argTy, IndexTy, CompT>;
 
@@ -153,14 +154,8 @@ topk_full_merge_sort_impl(sycl::queue &exec_q,
     });
 
     sycl::event cleanup_host_task_event =
-        exec_q.submit([&](sycl::handler &cgh) {
-            cgh.depends_on(write_out_ev);
-            const sycl::context &ctx = exec_q.get_context();
-
-            using dpctl::tensor::alloc_utils::sycl_free_noexcept;
-            cgh.host_task(
-                [ctx, index_data] { sycl_free_noexcept(index_data, ctx); });
-        });
+        dpctl::tensor::alloc_utils::async_smart_free(exec_q, {write_out_ev},
+                                                     index_data_owner);
 
     return cleanup_host_task_event;
 };
@@ -283,11 +278,11 @@ sycl::event topk_merge_impl(
             index_comp, depends);
     }
 
-    IndexTy *index_data =
-        sycl::malloc_device<IndexTy>(iter_nelems * alloc_len, exec_q);
-    if (index_data == nullptr) {
-        throw std::runtime_error("Unable to allocate device_memory");
-    }
+    auto index_data_owner =
+        dpctl::tensor::alloc_utils::smart_malloc_device<IndexTy>(
+            iter_nelems * alloc_len, exec_q);
+    // get raw USM pointer
+    IndexTy *index_data = index_data_owner.get();
 
     // no need to populate index data: SLM will be populated with default
     // values
@@ -427,14 +422,8 @@ sycl::event topk_merge_impl(
     });
 
     sycl::event cleanup_host_task_event =
-        exec_q.submit([&](sycl::handler &cgh) {
-            cgh.depends_on(write_topk_ev);
-            const sycl::context &ctx = exec_q.get_context();
-
-            using dpctl::tensor::alloc_utils::sycl_free_noexcept;
-            cgh.host_task(
-                [ctx, index_data] { sycl_free_noexcept(index_data, ctx); });
-        });
+        dpctl::tensor::alloc_utils::async_smart_free(
+            exec_q, {write_topk_ev}, index_data_owner);
 
     return cleanup_host_task_event;
 }
@@ -474,15 +463,13 @@ sycl::event topk_radix_impl(sycl::queue &exec_q,
 
     const std::size_t total_nelems = iter_nelems * axis_nelems;
     const std::size_t padded_total_nelems = ((total_nelems + 63) / 64) * 64;
-    IndexTy *workspace = sycl::malloc_device<IndexTy>(
-        padded_total_nelems + total_nelems, exec_q);
+    auto workspace_owner =
+        dpctl::tensor::alloc_utils::smart_malloc_device<IndexTy>(
+            padded_total_nelems + total_nelems, exec_q);
 
-    IndexTy *tmp_tp = sycl::malloc_device<IndexTy>(total_nelems, exec_q);
-
-    if (nullptr == workspace || nullptr == tmp_tp) {
-        throw std::runtime_error(
-            "Not enough device memory for radix sort topk");
-    }
+    // get raw USM pointer
+    IndexTy *workspace = workspace_owner.get();
+    IndexTy *tmp_tp = workspace + padded_total_nelems;
 
     using IdentityProjT = radix_sort_details::IdentityProj;
     using IndexedProjT =
@@ -536,17 +523,8 @@ sycl::event topk_radix_impl(sycl::queue &exec_q,
         });
     });
 
-    sycl::event cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(write_topk_ev);
-
-        const sycl::context &ctx = exec_q.get_context();
-
-        using dpctl::tensor::alloc_utils::sycl_free_noexcept;
-        cgh.host_task([ctx, workspace, tmp_tp] {
-            sycl_free_noexcept(workspace, ctx);
-            sycl_free_noexcept(tmp_tp, ctx);
-        });
-    });
+    sycl::event cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {write_topk_ev}, workspace_owner);
 
     return cleanup_ev;
 }
```
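The topk_radix_impl change also folds what used to be two device allocations into one: tmp_tp now lives in the tail of the same workspace allocation, starting at the 64-element-padded offset, so a single owner and a single deferred free cover both regions. A standalone sketch of that sub-allocation arithmetic, using plain SYCL USM rather than the dpctl helpers (sort_ws and the sizes are illustrative only):

```cpp
#include <cstddef>
#include <cstdint>
#include <stdexcept>

#include <sycl/sycl.hpp>

int main()
{
    sycl::queue q;

    const std::size_t total_nelems = 1000;
    // round the first region up to a multiple of 64 elements so the
    // second region starts at an aligned element offset
    const std::size_t padded_total_nelems = ((total_nelems + 63) / 64) * 64;

    // one allocation large enough for both regions
    std::uint64_t *workspace = sycl::malloc_device<std::uint64_t>(
        padded_total_nelems + total_nelems, q);
    if (!workspace) {
        throw std::runtime_error("Could not allocate workspace on device");
    }

    // first region: [0, total_nelems), padded up to padded_total_nelems
    std::uint64_t *sort_ws = workspace;
    // second region: total_nelems elements starting at the padded offset
    std::uint64_t *tmp_tp = workspace + padded_total_nelems;

    // ... kernels using sort_ws and tmp_tp would go here ...
    (void)sort_ws;
    (void)tmp_tp;

    // a single free releases both regions
    sycl::free(workspace, q);
    return 0;
}
```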

dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp

Lines changed: 92 additions & 1 deletion
```diff
@@ -28,6 +28,9 @@
 
 #include <exception>
 #include <iostream>
+#include <memory>
+#include <stdexcept>
+#include <vector>
 
 #include "sycl/sycl.hpp"
 
@@ -73,11 +76,99 @@ void sycl_free_noexcept(T *ptr, const sycl::context &ctx) noexcept
     }
 }
 
-template <typename T> void sycl_free_noexcept(T *ptr, sycl::queue &q) noexcept
+template <typename T>
+void sycl_free_noexcept(T *ptr, const sycl::queue &q) noexcept
 {
     sycl_free_noexcept(ptr, q.get_context());
 }
 
+class USMDeleter
+{
+private:
+    sycl::context ctx_;
+
+public:
+    USMDeleter(const sycl::queue &q) : ctx_(q.get_context()) {}
+    USMDeleter(const sycl::context &ctx) : ctx_(ctx) {}
+
+    template <typename T> void operator()(T *ptr) const
+    {
+        sycl_free_noexcept(ptr, ctx_);
+    }
+};
+
+template <typename T>
+std::unique_ptr<T, USMDeleter>
+smart_malloc(std::size_t count,
+             const sycl::queue &q,
+             sycl::usm::alloc kind,
+             const sycl::property_list &propList = {})
+{
+    T *ptr = sycl::malloc<T>(count, q, kind, propList);
+    if (nullptr == ptr) {
+        throw std::runtime_error("Unable to allocate device_memory");
+    }
+
+    auto usm_deleter = USMDeleter(q);
+    return std::unique_ptr<T, USMDeleter>(ptr, usm_deleter);
+}
+
+template <typename T>
+std::unique_ptr<T, USMDeleter>
+smart_malloc_device(std::size_t count,
+                    const sycl::queue &q,
+                    const sycl::property_list &propList = {})
+{
+    return smart_malloc<T>(count, q, sycl::usm::alloc::device, propList);
+}
+
+template <typename T>
+std::unique_ptr<T, USMDeleter>
+smart_malloc_shared(std::size_t count,
+                    const sycl::queue &q,
+                    const sycl::property_list &propList = {})
+{
+    return smart_malloc<T>(count, q, sycl::usm::alloc::shared, propList);
+}
+
+template <typename T>
+std::unique_ptr<T, USMDeleter>
+smart_malloc_host(std::size_t count,
+                  const sycl::queue &q,
+                  const sycl::property_list &propList = {})
+{
+    return smart_malloc<T>(count, q, sycl::usm::alloc::host, propList);
+}
+
+template <typename... Args>
+sycl::event async_smart_free(sycl::queue &exec_q,
+                             const std::vector<sycl::event> &depends,
+                             Args &&...args)
+{
+    constexpr std::size_t n = sizeof...(Args);
+
+    std::vector<void *> ptrs;
+    ptrs.reserve(n);
+    (ptrs.push_back(reinterpret_cast<void *>(args.get())), ...);
+
+    std::vector<USMDeleter> dels;
+    dels.reserve(n);
+    (dels.push_back(args.get_deleter()), ...);
+
+    sycl::event ht_e = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        cgh.host_task([ptrs, dels]() {
+            for (size_t i = 0; i < ptrs.size(); ++i) {
+                dels[i](ptrs[i]);
+            }
+        });
+    });
+    (args.release(), ...);
+
+    return ht_e;
+}
+
 } // end of namespace alloc_utils
 } // end of namespace tensor
 } // end of namespace dpctl
```
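The variadic async_smart_free relies on C++17 fold expressions to collect the raw pointers and the deleters from any number of unique_ptr arguments, and it only calls release() after the host_task submission succeeds, so a failed submit leaves the owners intact and nothing leaks. Below is a standalone sketch of that fold pattern, using plain heap-allocated unique_ptrs and std::function instead of SYCL; make_deferred_free and its names are illustrative only, not part of this commit.

```cpp
#include <cstddef>
#include <functional>
#include <iostream>
#include <memory>
#include <type_traits>
#include <vector>

// Collect .get() and .get_deleter() from each unique_ptr, hand them to a
// deferred cleanup closure, then .release() the owners (mirroring the order
// used by async_smart_free).
template <typename... Args>
std::function<void()> make_deferred_free(Args &&...args)
{
    std::vector<void *> ptrs;
    ptrs.reserve(sizeof...(Args));
    (ptrs.push_back(static_cast<void *>(args.get())), ...);

    std::vector<std::function<void(void *)>> dels;
    dels.reserve(sizeof...(Args));
    // wrap each deleter so it can later be applied to a type-erased pointer
    (dels.push_back([del = args.get_deleter()](void *p) {
         using Elem = typename std::remove_reference_t<Args>::element_type;
         del(static_cast<Elem *>(p));
     }),
     ...);

    // release ownership only after the cleanup closure holds the pointers
    (args.release(), ...);

    return [ptrs, dels]() {
        for (std::size_t i = 0; i < ptrs.size(); ++i) {
            dels[i](ptrs[i]);
        }
    };
}

int main()
{
    auto a = std::make_unique<int>(1);
    auto b = std::make_unique<double>(2.0);

    auto cleanup = make_deferred_free(a, b);
    std::cout << std::boolalpha << (a == nullptr) << ' ' << (b == nullptr)
              << '\n'; // prints: true true
    cleanup();         // frees both allocations
    return 0;
}
```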
