diff --git a/CMakeLists.txt b/CMakeLists.txt index 378043b8f2..475b6f96e5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -114,6 +114,9 @@ install(DIRECTORY FILES_MATCHING REGEX "\\.h(pp)?$" ) +# find Python before enabling pybind11 +find_package(Python REQUIRED COMPONENTS Development.Module) + # Define CMAKE_INSTALL_xxx: LIBDIR, INCLUDEDIR include(GNUInstallDirs) diff --git a/dpctl/CMakeLists.txt b/dpctl/CMakeLists.txt index 1de0bbf77d..e2914f6394 100644 --- a/dpctl/CMakeLists.txt +++ b/dpctl/CMakeLists.txt @@ -1,4 +1,4 @@ -find_package(Python REQUIRED COMPONENTS Development.Module NumPy) +find_package(Python REQUIRED COMPONENTS NumPy) # -t is to only Cythonize sources with timestamps newer than existing CXX files (if present) # -w is to set working directory (and correctly set __pyx_f[] array of filenames) diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index b5a66b88f6..cbf4638613 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -171,6 +171,7 @@ set(_accumulator_sources ) set(_tensor_accumulation_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_accumulation.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp ${_accumulator_sources} ) diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py index 3af5ebbe19..4976b728b0 100644 --- a/dpctl/tensor/_copy_utils.py +++ b/dpctl/tensor/_copy_utils.py @@ -26,6 +26,7 @@ import dpctl.utils from dpctl.tensor._data_types import _get_dtype from dpctl.tensor._device import normalize_queue_device +from dpctl.tensor._tensor_accumulation_impl import mask_positions from dpctl.tensor._type_utils import _dtype_supported_by_device_impl from ._numpy_helper import normalize_axis_index @@ -792,7 +793,7 @@ def _extract_impl(ary, ary_mask, axis=0): exec_q = cumsum.sycl_queue _manager = dpctl.utils.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events - mask_count = ti.mask_positions( + mask_count = mask_positions( ary_mask, cumsum, sycl_queue=exec_q, depends=dep_evs ) dst_shape = ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :] @@ -828,9 +829,7 @@ def _nonzero_impl(ary): ) _manager = dpctl.utils.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events - mask_count = ti.mask_positions( - ary, cumsum, sycl_queue=exec_q, depends=dep_evs - ) + mask_count = mask_positions(ary, cumsum, sycl_queue=exec_q, depends=dep_evs) indexes_dt = ti.default_device_index_type(exec_q.sycl_device) indexes = dpt.empty( (ary.ndim, mask_count), @@ -1050,7 +1049,7 @@ def _place_impl(ary, ary_mask, vals, axis=0): exec_q = cumsum.sycl_queue _manager = dpctl.utils.SequentialOrderManager[exec_q] dep_ev = _manager.submitted_events - mask_count = ti.mask_positions( + mask_count = mask_positions( ary_mask, cumsum, sycl_queue=exec_q, depends=dep_ev ) expected_vals_shape = ( diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py index 4f04a6094c..a75c137f0b 100644 --- a/dpctl/tensor/_indexing_functions.py +++ b/dpctl/tensor/_indexing_functions.py @@ -20,6 +20,7 @@ import dpctl.tensor as dpt import dpctl.tensor._tensor_impl as ti import dpctl.utils +from dpctl.tensor._tensor_accumulation_impl import mask_positions from ._copy_utils import ( _extract_impl, @@ -413,9 +414,7 @@ def place(arr, mask, vals): cumsum = dpt.empty(mask.size, dtype="i8", sycl_queue=exec_q) _manager = dpctl.utils.SequentialOrderManager[exec_q] deps_ev = _manager.submitted_events - nz_count = ti.mask_positions( - mask, cumsum, sycl_queue=exec_q, depends=deps_ev - ) + nz_count = mask_positions(mask, cumsum, sycl_queue=exec_q, depends=deps_ev) if nz_count == 0: return if vals.size == 0: diff --git a/dpctl/tensor/_manipulation_functions.py b/dpctl/tensor/_manipulation_functions.py index 65b027e2ba..87c217fdc9 100644 --- a/dpctl/tensor/_manipulation_functions.py +++ b/dpctl/tensor/_manipulation_functions.py @@ -24,6 +24,7 @@ import dpctl.tensor as dpt import dpctl.tensor._tensor_impl as ti import dpctl.utils as dputils +from dpctl.tensor._tensor_accumulation_impl import _cumsum_1d from ._copy_utils import _broadcast_strides from ._numpy_helper import normalize_axis_index, normalize_axis_tuple @@ -908,7 +909,7 @@ def repeat(x, repeats, /, *, axis=None): sycl_queue=exec_q, ) # _cumsum_1d synchronizes so `depends` ends here safely - res_axis_size = ti._cumsum_1d( + res_axis_size = _cumsum_1d( rep_buf, cumsum, sycl_queue=exec_q, depends=[copy_ev] ) if axis is not None: @@ -940,7 +941,7 @@ def repeat(x, repeats, /, *, axis=None): usm_type=usm_type, sycl_queue=exec_q, ) - res_axis_size = ti._cumsum_1d( + res_axis_size = _cumsum_1d( repeats, cumsum, sycl_queue=exec_q, depends=dep_evs ) if axis is not None: diff --git a/dpctl/tensor/_set_functions.py b/dpctl/tensor/_set_functions.py index 1d4c28b924..b5341e820b 100644 --- a/dpctl/tensor/_set_functions.py +++ b/dpctl/tensor/_set_functions.py @@ -26,6 +26,7 @@ _get_shape, _validate_dtype, ) +from ._tensor_accumulation_impl import mask_positions from ._tensor_elementwise_impl import _not_equal, _subtract from ._tensor_impl import ( _copy_usm_ndarray_into_usm_ndarray, @@ -34,7 +35,6 @@ _linspace_step, _take, default_device_index_type, - mask_positions, ) from ._tensor_sorting_impl import ( _argsort_ascending, diff --git a/dpctl/tensor/libtensor/source/tensor_accumulation.cpp b/dpctl/tensor/libtensor/source/tensor_accumulation.cpp index f1b6eedbd6..b3af48f9ce 100644 --- a/dpctl/tensor/libtensor/source/tensor_accumulation.cpp +++ b/dpctl/tensor/libtensor/source/tensor_accumulation.cpp @@ -24,12 +24,29 @@ //===----------------------------------------------------------------------===// #include +#include +#include "accumulators.hpp" #include "accumulators/accumulators_common.hpp" namespace py = pybind11; +namespace py_int = dpctl::tensor::py_internal; + +using py_int::py_cumsum_1d; +using py_int::py_mask_positions; + PYBIND11_MODULE(_tensor_accumulation_impl, m) { + py_int::populate_mask_positions_dispatch_vectors(); + py_int::populate_cumsum_1d_dispatch_vectors(); + dpctl::tensor::py_internal::init_accumulator_functions(m); + + m.def("mask_positions", &py_mask_positions, "", py::arg("mask"), + py::arg("cumsum"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + m.def("_cumsum_1d", &py_cumsum_1d, "", py::arg("src"), py::arg("cumsum"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); } diff --git a/dpctl/tensor/libtensor/source/tensor_ctors.cpp b/dpctl/tensor/libtensor/source/tensor_ctors.cpp index 2dccc4e359..879da209c5 100644 --- a/dpctl/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl/tensor/libtensor/source/tensor_ctors.cpp @@ -105,12 +105,12 @@ using dpctl::tensor::py_internal::usm_ndarray_put; using dpctl::tensor::py_internal::usm_ndarray_take; using dpctl::tensor::py_internal::py_extract; -using dpctl::tensor::py_internal::py_mask_positions; +// using dpctl::tensor::py_internal::py_mask_positions; using dpctl::tensor::py_internal::py_nonzero; using dpctl::tensor::py_internal::py_place; /* ================= Repeat ====================*/ -using dpctl::tensor::py_internal::py_cumsum_1d; +// using dpctl::tensor::py_internal::py_cumsum_1d; using dpctl::tensor::py_internal::py_repeat_by_scalar; using dpctl::tensor::py_internal::py_repeat_by_sequence; @@ -158,9 +158,6 @@ void init_dispatch_vectors(void) populate_masked_extract_dispatch_vectors(); populate_masked_place_dispatch_vectors(); - populate_mask_positions_dispatch_vectors(); - - populate_cumsum_1d_dispatch_vectors(); init_repeat_dispatch_vectors(); init_clip_dispatch_vectors(); @@ -402,13 +399,6 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), py::arg("depends") = py::list()); - m.def("mask_positions", &py_mask_positions, "", py::arg("mask"), - py::arg("cumsum"), py::arg("sycl_queue"), - py::arg("depends") = py::list()); - - m.def("_cumsum_1d", &py_cumsum_1d, "", py::arg("src"), py::arg("cumsum"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - m.def("_extract", &py_extract, "", py::arg("src"), py::arg("cumsum"), py::arg("axis_start"), py::arg("axis_end"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());