Skip to content

Commit 535e471

Browse files
Remove make_owner, instroduce smart_malloc_device, etc.
The smart_malloc_device<T>(count, q) makes USM allocation and returns a unique_ptr<T, USMDeleter> which owns the allocation. The function throws an exception (std::runtime_error) if USM allocation is not successful. The usage is as follows: ``` auto alloc_owner = smart_malloc_device<T>(count, q); T *data = alloc_owner.get(); [..SNIP..] <submit host task to deallocate memory> // release ownership of USM memory from smart pointer // since it is now managed by the host task which was // successfully submitted. alloc_owner.release(); [...SNIP...] ```
1 parent 51ead2b commit 535e471

File tree

3 files changed

+63
-43
lines changed

3 files changed

+63
-43
lines changed

dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp

Lines changed: 13 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@
3030
#include <array>
3131
#include <cstdint>
3232
#include <limits>
33-
#include <memory>
3433
#include <stdexcept>
3534
#include <type_traits>
3635
#include <utility>
@@ -1589,15 +1588,12 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q,
15891588
using CountT = std::uint32_t;
15901589

15911590
// memory for storing count and offset values
1592-
CountT *count_ptr =
1593-
sycl::malloc_device<CountT>(n_iters * n_counts, exec_q);
1591+
auto count_owner =
1592+
dpctl::tensor::alloc_utils::smart_malloc_device<CountT>(
1593+
n_iters * n_counts, exec_q);
15941594

1595-
if (nullptr == count_ptr) {
1596-
throw std::runtime_error("Could not allocate USM-device memory");
1597-
}
1595+
CountT *count_ptr = count_owner.get();
15981596

1599-
auto count_owner =
1600-
dpctl::tensor::alloc_utils::make_owner(count_ptr, exec_q);
16011597
constexpr std::uint32_t zero_radix_iter{0};
16021598

16031599
if constexpr (std::is_same_v<KeyT, bool>) {
@@ -1621,14 +1617,11 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q,
16211617
return sort_ev;
16221618
}
16231619

1624-
ValueT *tmp_arr =
1625-
sycl::malloc_device<ValueT>(n_iters * n_to_sort, exec_q);
1626-
if (nullptr == tmp_arr) {
1627-
throw std::runtime_error("Could not allocate USM-device memory");
1628-
}
1629-
16301620
auto tmp_arr_owner =
1631-
dpctl::tensor::alloc_utils::make_owner(tmp_arr, exec_q);
1621+
dpctl::tensor::alloc_utils::smart_malloc_device<ValueT>(
1622+
n_iters * n_to_sort, exec_q);
1623+
1624+
ValueT *tmp_arr = tmp_arr_owner.get();
16321625

16331626
// iterations per each bucket
16341627
assert("Number of iterations must be even" && radix_iters % 2 == 0);
@@ -1776,13 +1769,12 @@ radix_argsort_axis1_contig_impl(sycl::queue &exec_q,
17761769
reinterpret_cast<IndexTy *>(res_cp) + iter_res_offset + sort_res_offset;
17771770

17781771
const std::size_t total_nelems = iter_nelems * sort_nelems;
1779-
IndexTy *workspace = sycl::malloc_device<IndexTy>(total_nelems, exec_q);
1780-
1781-
if (nullptr == workspace) {
1782-
throw std::runtime_error("Could not allocate workspace on device");
1783-
}
17841772
auto workspace_owner =
1785-
dpctl::tensor::alloc_utils::make_owner(workspace, exec_q);
1773+
dpctl::tensor::alloc_utils::smart_malloc_device<IndexTy>(total_nelems,
1774+
exec_q);
1775+
1776+
// get raw USM pointer
1777+
IndexTy *workspace = workspace_owner.get();
17861778

17871779
using IdentityProjT = radix_sort_details::IdentityProj;
17881780
using IndexedProjT =

dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp

Lines changed: 12 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@
2929
#include <cstdint>
3030
#include <iterator>
3131
#include <limits>
32-
#include <memory>
3332
#include <stdexcept>
3433
#include <vector>
3534

@@ -92,13 +91,11 @@ topk_full_merge_sort_impl(sycl::queue &exec_q,
9291
const CompT &comp,
9392
const std::vector<sycl::event> &depends)
9493
{
95-
IndexTy *index_data =
96-
sycl::malloc_device<IndexTy>(iter_nelems * axis_nelems, exec_q);
97-
if (index_data == nullptr) {
98-
throw std::runtime_error("Unable to allocate device_memory");
99-
}
10094
auto index_data_owner =
101-
dpctl::tensor::alloc_utils::make_owner(index_data, exec_q);
95+
dpctl::tensor::alloc_utils::smart_malloc_device<IndexTy>(
96+
iter_nelems * axis_nelems, exec_q);
97+
// extract USM pointer
98+
IndexTy *index_data = index_data_owner.get();
10299

103100
using IotaKernelName = topk_populate_index_data_krn<argTy, IndexTy, CompT>;
104101

@@ -288,13 +285,11 @@ sycl::event topk_merge_impl(
288285
index_comp, depends);
289286
}
290287

291-
IndexTy *index_data =
292-
sycl::malloc_device<IndexTy>(iter_nelems * alloc_len, exec_q);
293-
if (index_data == nullptr) {
294-
throw std::runtime_error("Unable to allocate device_memory");
295-
}
296288
auto index_data_owner =
297-
dpctl::tensor::alloc_utils::make_owner(index_data, exec_q);
289+
dpctl::tensor::alloc_utils::smart_malloc_device<IndexTy>(
290+
iter_nelems * alloc_len, exec_q);
291+
// get raw USM pointer
292+
IndexTy *index_data = index_data_owner.get();
298293

299294
// no need to populate index data: SLM will be populated with default
300295
// values
@@ -482,16 +477,12 @@ sycl::event topk_radix_impl(sycl::queue &exec_q,
482477

483478
const std::size_t total_nelems = iter_nelems * axis_nelems;
484479
const std::size_t padded_total_nelems = ((total_nelems + 63) / 64) * 64;
485-
IndexTy *workspace = sycl::malloc_device<IndexTy>(
486-
padded_total_nelems + total_nelems, exec_q);
487-
488-
if (nullptr == workspace) {
489-
throw std::runtime_error(
490-
"Not enough device memory for radix sort topk");
491-
}
492480
auto workspace_owner =
493-
dpctl::tensor::alloc_utils::make_owner(workspace, exec_q);
481+
dpctl::tensor::alloc_utils::smart_malloc_device<IndexTy>(
482+
padded_total_nelems + total_nelems, exec_q);
494483

484+
// get raw USM pointer
485+
IndexTy *workspace = workspace_owner.get();
495486
IndexTy *tmp_tp = workspace + padded_total_nelems;
496487

497488
using IdentityProjT = radix_sort_details::IdentityProj;

dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include <exception>
3030
#include <iostream>
3131
#include <memory>
32+
#include <stdexcept>
3233

3334
#include "sycl/sycl.hpp"
3435

@@ -95,12 +96,48 @@ class USMDeleter
9596
};
9697

9798
template <typename T>
98-
std::unique_ptr<T, USMDeleter> make_owner(T *ptr, const sycl::queue &q)
99+
std::unique_ptr<T, USMDeleter>
100+
smart_malloc(std::size_t count,
101+
const sycl::queue &q,
102+
sycl::usm::alloc kind,
103+
const sycl::property_list &propList = {})
99104
{
105+
T *ptr = sycl::malloc<T>(count, q, kind, propList);
106+
if (nullptr == ptr) {
107+
throw std::runtime_error("Unable to allocate device_memory");
108+
}
109+
100110
auto usm_deleter = USMDeleter(q);
101111
return std::unique_ptr<T, USMDeleter>(ptr, usm_deleter);
102112
}
103113

114+
template <typename T>
115+
std::unique_ptr<T, USMDeleter>
116+
smart_malloc_device(std::size_t count,
117+
const sycl::queue &q,
118+
const sycl::property_list &propList = {})
119+
{
120+
return smart_malloc<T>(count, q, sycl::usm::alloc::device, propList);
121+
}
122+
123+
template <typename T>
124+
std::unique_ptr<T, USMDeleter>
125+
smart_malloc_shared(std::size_t count,
126+
const sycl::queue &q,
127+
const sycl::property_list &propList = {})
128+
{
129+
return smart_malloc<T>(count, q, sycl::usm::alloc::shared, propList);
130+
}
131+
132+
template <typename T>
133+
std::unique_ptr<T, USMDeleter>
134+
smart_malloc_jost(std::size_t count,
135+
const sycl::queue &q,
136+
const sycl::property_list &propList = {})
137+
{
138+
return smart_malloc<T>(count, q, sycl::usm::alloc::host, propList);
139+
}
140+
104141
} // end of namespace alloc_utils
105142
} // end of namespace tensor
106143
} // end of namespace dpctl

0 commit comments

Comments
 (0)