From a2640ab8ef254a9f6e8cdffc6b4aeb33f90e8ba0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?SYCLo=C3=AFd?= <213797226+sycloid@users.noreply.github.com> Date: Thu, 25 Sep 2025 11:49:37 -0500 Subject: [PATCH] In order to resolve gh-2156 move definition of the mask_positions and _cumsum_1d functions to _tensor_accumulations_impl Changed Python scripts accordingly, as well as CMake scripts to add implementation cpp file to the list of source files for the _tensor_accumulations_impl MODULE library. Also moved find_package(Python) to find Module.Development component before pybind11 is being activated to resolve CMake warning. Incidentally, this change also results in reduced binary size and improved compilation tiles, since accumulation kernels are not being generated in duplicates (once for _tensor_ctor module, and once for _tensor_accumulation_impl module). --- CMakeLists.txt | 3 +++ dpctl/CMakeLists.txt | 2 +- dpctl/tensor/CMakeLists.txt | 1 + dpctl/tensor/_copy_utils.py | 9 ++++----- dpctl/tensor/_indexing_functions.py | 5 ++--- dpctl/tensor/_manipulation_functions.py | 5 +++-- dpctl/tensor/_set_functions.py | 2 +- .../libtensor/source/tensor_accumulation.cpp | 17 +++++++++++++++++ dpctl/tensor/libtensor/source/tensor_ctors.cpp | 14 ++------------ 9 files changed, 34 insertions(+), 24 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 378043b8f2..475b6f96e5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -114,6 +114,9 @@ install(DIRECTORY FILES_MATCHING REGEX "\\.h(pp)?$" ) +# find Python before enabling pybind11 +find_package(Python REQUIRED COMPONENTS Development.Module) + # Define CMAKE_INSTALL_xxx: LIBDIR, INCLUDEDIR include(GNUInstallDirs) diff --git a/dpctl/CMakeLists.txt b/dpctl/CMakeLists.txt index 1de0bbf77d..e2914f6394 100644 --- a/dpctl/CMakeLists.txt +++ b/dpctl/CMakeLists.txt @@ -1,4 +1,4 @@ -find_package(Python REQUIRED COMPONENTS Development.Module NumPy) +find_package(Python REQUIRED COMPONENTS NumPy) # -t is to only Cythonize sources with timestamps newer than existing CXX files (if present) # -w is to set working directory (and correctly set __pyx_f[] array of filenames) diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index b5a66b88f6..cbf4638613 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -171,6 +171,7 @@ set(_accumulator_sources ) set(_tensor_accumulation_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_accumulation.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp ${_accumulator_sources} ) diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py index 3af5ebbe19..4976b728b0 100644 --- a/dpctl/tensor/_copy_utils.py +++ b/dpctl/tensor/_copy_utils.py @@ -26,6 +26,7 @@ import dpctl.utils from dpctl.tensor._data_types import _get_dtype from dpctl.tensor._device import normalize_queue_device +from dpctl.tensor._tensor_accumulation_impl import mask_positions from dpctl.tensor._type_utils import _dtype_supported_by_device_impl from ._numpy_helper import normalize_axis_index @@ -792,7 +793,7 @@ def _extract_impl(ary, ary_mask, axis=0): exec_q = cumsum.sycl_queue _manager = dpctl.utils.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events - mask_count = ti.mask_positions( + mask_count = mask_positions( ary_mask, cumsum, sycl_queue=exec_q, depends=dep_evs ) dst_shape = ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :] @@ -828,9 +829,7 @@ def _nonzero_impl(ary): ) _manager = dpctl.utils.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events - mask_count = ti.mask_positions( - ary, cumsum, sycl_queue=exec_q, depends=dep_evs - ) + mask_count = mask_positions(ary, cumsum, sycl_queue=exec_q, depends=dep_evs) indexes_dt = ti.default_device_index_type(exec_q.sycl_device) indexes = dpt.empty( (ary.ndim, mask_count), @@ -1050,7 +1049,7 @@ def _place_impl(ary, ary_mask, vals, axis=0): exec_q = cumsum.sycl_queue _manager = dpctl.utils.SequentialOrderManager[exec_q] dep_ev = _manager.submitted_events - mask_count = ti.mask_positions( + mask_count = mask_positions( ary_mask, cumsum, sycl_queue=exec_q, depends=dep_ev ) expected_vals_shape = ( diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py index 4f04a6094c..a75c137f0b 100644 --- a/dpctl/tensor/_indexing_functions.py +++ b/dpctl/tensor/_indexing_functions.py @@ -20,6 +20,7 @@ import dpctl.tensor as dpt import dpctl.tensor._tensor_impl as ti import dpctl.utils +from dpctl.tensor._tensor_accumulation_impl import mask_positions from ._copy_utils import ( _extract_impl, @@ -413,9 +414,7 @@ def place(arr, mask, vals): cumsum = dpt.empty(mask.size, dtype="i8", sycl_queue=exec_q) _manager = dpctl.utils.SequentialOrderManager[exec_q] deps_ev = _manager.submitted_events - nz_count = ti.mask_positions( - mask, cumsum, sycl_queue=exec_q, depends=deps_ev - ) + nz_count = mask_positions(mask, cumsum, sycl_queue=exec_q, depends=deps_ev) if nz_count == 0: return if vals.size == 0: diff --git a/dpctl/tensor/_manipulation_functions.py b/dpctl/tensor/_manipulation_functions.py index 65b027e2ba..87c217fdc9 100644 --- a/dpctl/tensor/_manipulation_functions.py +++ b/dpctl/tensor/_manipulation_functions.py @@ -24,6 +24,7 @@ import dpctl.tensor as dpt import dpctl.tensor._tensor_impl as ti import dpctl.utils as dputils +from dpctl.tensor._tensor_accumulation_impl import _cumsum_1d from ._copy_utils import _broadcast_strides from ._numpy_helper import normalize_axis_index, normalize_axis_tuple @@ -908,7 +909,7 @@ def repeat(x, repeats, /, *, axis=None): sycl_queue=exec_q, ) # _cumsum_1d synchronizes so `depends` ends here safely - res_axis_size = ti._cumsum_1d( + res_axis_size = _cumsum_1d( rep_buf, cumsum, sycl_queue=exec_q, depends=[copy_ev] ) if axis is not None: @@ -940,7 +941,7 @@ def repeat(x, repeats, /, *, axis=None): usm_type=usm_type, sycl_queue=exec_q, ) - res_axis_size = ti._cumsum_1d( + res_axis_size = _cumsum_1d( repeats, cumsum, sycl_queue=exec_q, depends=dep_evs ) if axis is not None: diff --git a/dpctl/tensor/_set_functions.py b/dpctl/tensor/_set_functions.py index 1d4c28b924..b5341e820b 100644 --- a/dpctl/tensor/_set_functions.py +++ b/dpctl/tensor/_set_functions.py @@ -26,6 +26,7 @@ _get_shape, _validate_dtype, ) +from ._tensor_accumulation_impl import mask_positions from ._tensor_elementwise_impl import _not_equal, _subtract from ._tensor_impl import ( _copy_usm_ndarray_into_usm_ndarray, @@ -34,7 +35,6 @@ _linspace_step, _take, default_device_index_type, - mask_positions, ) from ._tensor_sorting_impl import ( _argsort_ascending, diff --git a/dpctl/tensor/libtensor/source/tensor_accumulation.cpp b/dpctl/tensor/libtensor/source/tensor_accumulation.cpp index f1b6eedbd6..b3af48f9ce 100644 --- a/dpctl/tensor/libtensor/source/tensor_accumulation.cpp +++ b/dpctl/tensor/libtensor/source/tensor_accumulation.cpp @@ -24,12 +24,29 @@ //===----------------------------------------------------------------------===// #include +#include +#include "accumulators.hpp" #include "accumulators/accumulators_common.hpp" namespace py = pybind11; +namespace py_int = dpctl::tensor::py_internal; + +using py_int::py_cumsum_1d; +using py_int::py_mask_positions; + PYBIND11_MODULE(_tensor_accumulation_impl, m) { + py_int::populate_mask_positions_dispatch_vectors(); + py_int::populate_cumsum_1d_dispatch_vectors(); + dpctl::tensor::py_internal::init_accumulator_functions(m); + + m.def("mask_positions", &py_mask_positions, "", py::arg("mask"), + py::arg("cumsum"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + m.def("_cumsum_1d", &py_cumsum_1d, "", py::arg("src"), py::arg("cumsum"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); } diff --git a/dpctl/tensor/libtensor/source/tensor_ctors.cpp b/dpctl/tensor/libtensor/source/tensor_ctors.cpp index 2dccc4e359..879da209c5 100644 --- a/dpctl/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl/tensor/libtensor/source/tensor_ctors.cpp @@ -105,12 +105,12 @@ using dpctl::tensor::py_internal::usm_ndarray_put; using dpctl::tensor::py_internal::usm_ndarray_take; using dpctl::tensor::py_internal::py_extract; -using dpctl::tensor::py_internal::py_mask_positions; +// using dpctl::tensor::py_internal::py_mask_positions; using dpctl::tensor::py_internal::py_nonzero; using dpctl::tensor::py_internal::py_place; /* ================= Repeat ====================*/ -using dpctl::tensor::py_internal::py_cumsum_1d; +// using dpctl::tensor::py_internal::py_cumsum_1d; using dpctl::tensor::py_internal::py_repeat_by_scalar; using dpctl::tensor::py_internal::py_repeat_by_sequence; @@ -158,9 +158,6 @@ void init_dispatch_vectors(void) populate_masked_extract_dispatch_vectors(); populate_masked_place_dispatch_vectors(); - populate_mask_positions_dispatch_vectors(); - - populate_cumsum_1d_dispatch_vectors(); init_repeat_dispatch_vectors(); init_clip_dispatch_vectors(); @@ -402,13 +399,6 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), py::arg("depends") = py::list()); - m.def("mask_positions", &py_mask_positions, "", py::arg("mask"), - py::arg("cumsum"), py::arg("sycl_queue"), - py::arg("depends") = py::list()); - - m.def("_cumsum_1d", &py_cumsum_1d, "", py::arg("src"), py::arg("cumsum"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - m.def("_extract", &py_extract, "", py::arg("src"), py::arg("cumsum"), py::arg("axis_start"), py::arg("axis_end"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list());