To resolve gh-2156, move the definitions of the mask_positions

sycloid · sycloid · commit a2640ab8ef25 · 2025-09-26T11:46:08.000-05:00
and _cumsum_1d functions to _tensor_accumulation_impl

Updated the Python modules accordingly, and updated the CMake scripts to
add the implementation file (accumulators.cpp) to the list of source files
for the _tensor_accumulation_impl MODULE library.

Also moved the find_package(Python) call that finds the Development.Module
component so that it runs before pybind11 is enabled, which resolves a CMake warning.

Incidentally, this change also results in reduced binary size and improved
compilation times, since the accumulation kernels are no longer generated
in duplicate (once for the _tensor_impl module, and once for the
_tensor_accumulation_impl module).
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -114,6 +114,9 @@ install(DIRECTORY
     FILES_MATCHING REGEX "\\.h(pp)?$"
 )
 
+# find Python before enabling pybind11
+find_package(Python REQUIRED COMPONENTS Development.Module)
+
 # Define CMAKE_INSTALL_xxx: LIBDIR, INCLUDEDIR
 include(GNUInstallDirs)
 
diff --git a/dpctl/CMakeLists.txt b/dpctl/CMakeLists.txt
@@ -1,4 +1,4 @@
-find_package(Python REQUIRED COMPONENTS Development.Module NumPy)
+find_package(Python REQUIRED COMPONENTS NumPy)
 
 # -t is to only Cythonize sources with timestamps newer than existing CXX files (if present)
 # -w is to set working directory (and correctly set __pyx_f[] array of filenames)
diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt
@@ -171,6 +171,7 @@ set(_accumulator_sources
 )
 set(_tensor_accumulation_impl_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_accumulation.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp
     ${_accumulator_sources}
 )
 
diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py
@@ -26,6 +26,7 @@
 import dpctl.utils
 from dpctl.tensor._data_types import _get_dtype
 from dpctl.tensor._device import normalize_queue_device
+from dpctl.tensor._tensor_accumulation_impl import mask_positions
 from dpctl.tensor._type_utils import _dtype_supported_by_device_impl
 
 from ._numpy_helper import normalize_axis_index
@@ -792,7 +793,7 @@ def _extract_impl(ary, ary_mask, axis=0):
     exec_q = cumsum.sycl_queue
     _manager = dpctl.utils.SequentialOrderManager[exec_q]
     dep_evs = _manager.submitted_events
-    mask_count = ti.mask_positions(
+    mask_count = mask_positions(
         ary_mask, cumsum, sycl_queue=exec_q, depends=dep_evs
     )
     dst_shape = ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :]
@@ -828,9 +829,7 @@ def _nonzero_impl(ary):
     )
     _manager = dpctl.utils.SequentialOrderManager[exec_q]
     dep_evs = _manager.submitted_events
-    mask_count = ti.mask_positions(
-        ary, cumsum, sycl_queue=exec_q, depends=dep_evs
-    )
+    mask_count = mask_positions(ary, cumsum, sycl_queue=exec_q, depends=dep_evs)
     indexes_dt = ti.default_device_index_type(exec_q.sycl_device)
     indexes = dpt.empty(
         (ary.ndim, mask_count),
@@ -1050,7 +1049,7 @@ def _place_impl(ary, ary_mask, vals, axis=0):
     exec_q = cumsum.sycl_queue
     _manager = dpctl.utils.SequentialOrderManager[exec_q]
     dep_ev = _manager.submitted_events
-    mask_count = ti.mask_positions(
+    mask_count = mask_positions(
         ary_mask, cumsum, sycl_queue=exec_q, depends=dep_ev
     )
     expected_vals_shape = (
diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py
@@ -20,6 +20,7 @@
 import dpctl.tensor as dpt
 import dpctl.tensor._tensor_impl as ti
 import dpctl.utils
+from dpctl.tensor._tensor_accumulation_impl import mask_positions
 
 from ._copy_utils import (
     _extract_impl,
@@ -413,9 +414,7 @@ def place(arr, mask, vals):
     cumsum = dpt.empty(mask.size, dtype="i8", sycl_queue=exec_q)
     _manager = dpctl.utils.SequentialOrderManager[exec_q]
     deps_ev = _manager.submitted_events
-    nz_count = ti.mask_positions(
-        mask, cumsum, sycl_queue=exec_q, depends=deps_ev
-    )
+    nz_count = mask_positions(mask, cumsum, sycl_queue=exec_q, depends=deps_ev)
     if nz_count == 0:
         return
     if vals.size == 0:
diff --git a/dpctl/tensor/_manipulation_functions.py b/dpctl/tensor/_manipulation_functions.py
@@ -24,6 +24,7 @@
 import dpctl.tensor as dpt
 import dpctl.tensor._tensor_impl as ti
 import dpctl.utils as dputils
+from dpctl.tensor._tensor_accumulation_impl import _cumsum_1d
 
 from ._copy_utils import _broadcast_strides
 from ._numpy_helper import normalize_axis_index, normalize_axis_tuple
@@ -908,7 +909,7 @@ def repeat(x, repeats, /, *, axis=None):
                 sycl_queue=exec_q,
             )
             # _cumsum_1d synchronizes so `depends` ends here safely
-            res_axis_size = ti._cumsum_1d(
+            res_axis_size = _cumsum_1d(
                 rep_buf, cumsum, sycl_queue=exec_q, depends=[copy_ev]
             )
             if axis is not None:
@@ -940,7 +941,7 @@ def repeat(x, repeats, /, *, axis=None):
                 usm_type=usm_type,
                 sycl_queue=exec_q,
             )
-            res_axis_size = ti._cumsum_1d(
+            res_axis_size = _cumsum_1d(
                 repeats, cumsum, sycl_queue=exec_q, depends=dep_evs
             )
             if axis is not None:
diff --git a/dpctl/tensor/_set_functions.py b/dpctl/tensor/_set_functions.py
@@ -26,6 +26,7 @@
     _get_shape,
     _validate_dtype,
 )
+from ._tensor_accumulation_impl import mask_positions
 from ._tensor_elementwise_impl import _not_equal, _subtract
 from ._tensor_impl import (
     _copy_usm_ndarray_into_usm_ndarray,
@@ -34,7 +35,6 @@
     _linspace_step,
     _take,
     default_device_index_type,
-    mask_positions,
 )
 from ._tensor_sorting_impl import (
     _argsort_ascending,
diff --git a/dpctl/tensor/libtensor/source/tensor_accumulation.cpp b/dpctl/tensor/libtensor/source/tensor_accumulation.cpp
@@ -24,12 +24,29 @@
 //===----------------------------------------------------------------------===//
 
 #include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
 
+#include "accumulators.hpp"
 #include "accumulators/accumulators_common.hpp"
 
 namespace py = pybind11;
 
+namespace py_int = dpctl::tensor::py_internal;
+
+using py_int::py_cumsum_1d;
+using py_int::py_mask_positions;
+
 PYBIND11_MODULE(_tensor_accumulation_impl, m)
 {
+    py_int::populate_mask_positions_dispatch_vectors();
+    py_int::populate_cumsum_1d_dispatch_vectors();
+
     dpctl::tensor::py_internal::init_accumulator_functions(m);
+
+    m.def("mask_positions", &py_mask_positions, "", py::arg("mask"),
+          py::arg("cumsum"), py::arg("sycl_queue"),
+          py::arg("depends") = py::list());
+
+    m.def("_cumsum_1d", &py_cumsum_1d, "", py::arg("src"), py::arg("cumsum"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
 }
diff --git a/dpctl/tensor/libtensor/source/tensor_ctors.cpp b/dpctl/tensor/libtensor/source/tensor_ctors.cpp
@@ -105,12 +105,12 @@ using dpctl::tensor::py_internal::usm_ndarray_put;
 using dpctl::tensor::py_internal::usm_ndarray_take;
 
 using dpctl::tensor::py_internal::py_extract;
-using dpctl::tensor::py_internal::py_mask_positions;
+// using dpctl::tensor::py_internal::py_mask_positions;
 using dpctl::tensor::py_internal::py_nonzero;
 using dpctl::tensor::py_internal::py_place;
 
 /* ================= Repeat ====================*/
-using dpctl::tensor::py_internal::py_cumsum_1d;
+// using dpctl::tensor::py_internal::py_cumsum_1d;
 using dpctl::tensor::py_internal::py_repeat_by_scalar;
 using dpctl::tensor::py_internal::py_repeat_by_sequence;
 
@@ -158,9 +158,6 @@ void init_dispatch_vectors(void)
     populate_masked_extract_dispatch_vectors();
     populate_masked_place_dispatch_vectors();
 
-    populate_mask_positions_dispatch_vectors();
-
-    populate_cumsum_1d_dispatch_vectors();
     init_repeat_dispatch_vectors();
 
     init_clip_dispatch_vectors();
@@ -402,13 +399,6 @@ PYBIND11_MODULE(_tensor_impl, m)
           py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"),
           py::arg("depends") = py::list());
 
-    m.def("mask_positions", &py_mask_positions, "", py::arg("mask"),
-          py::arg("cumsum"), py::arg("sycl_queue"),
-          py::arg("depends") = py::list());
-
-    m.def("_cumsum_1d", &py_cumsum_1d, "", py::arg("src"), py::arg("cumsum"),
-          py::arg("sycl_queue"), py::arg("depends") = py::list());
-
     m.def("_extract", &py_extract, "", py::arg("src"), py::arg("cumsum"),
           py::arg("axis_start"), py::arg("axis_end"), py::arg("dst"),
           py::arg("sycl_queue"), py::arg("depends") = py::list());

Original file line number	Diff line number	Diff line change
`@@ -114,6 +114,9 @@ install(DIRECTORY`
`114`	`114`	`FILES_MATCHING REGEX "\\.h(pp)?$"`
`115`	`115`	`)`
`116`	`116`
	`117`	`+# find Python before enabling pybind11`
	`118`	`+find_package(Python REQUIRED COMPONENTS Development.Module)`
	`119`	`+`
`117`	`120`	`# Define CMAKE_INSTALL_xxx: LIBDIR, INCLUDEDIR`
`118`	`121`	`include(GNUInstallDirs)`
`119`	`122`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-find_package(Python REQUIRED COMPONENTS Development.Module NumPy)`
	`1`	`+find_package(Python REQUIRED COMPONENTS NumPy)`
`2`	`2`
`3`	`3`	`# -t is to only Cythonize sources with timestamps newer than existing CXX files (if present)`
`4`	`4`	`# -w is to set working directory (and correctly set __pyx_f[] array of filenames)`
Original file line number	Diff line number	Diff line change
`@@ -171,6 +171,7 @@ set(_accumulator_sources`
`171`	`171`	`)`
`172`	`172`	`set(_tensor_accumulation_impl_sources`
`173`	`173`	`${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_accumulation.cpp`
	`174`	`+ ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp`
`174`	`175`	`${_accumulator_sources}`
`175`	`176`	`)`
`176`	`177`
Original file line number	Diff line number	Diff line change
`@@ -24,6 +24,7 @@`
`24`	`24`	`import dpctl.tensor as dpt`
`25`	`25`	`import dpctl.tensor._tensor_impl as ti`
`26`	`26`	`import dpctl.utils as dputils`
	`27`	`+from dpctl.tensor._tensor_accumulation_impl import _cumsum_1d`
`27`	`28`
`28`	`29`	`from ._copy_utils import _broadcast_strides`
`29`	`30`	`from ._numpy_helper import normalize_axis_index, normalize_axis_tuple`
`@@ -908,7 +909,7 @@ def repeat(x, repeats, /, *, axis=None):`
`908`	`909`	`sycl_queue=exec_q,`
`909`	`910`	`)`
`910`	`911`	# _cumsum_1d synchronizes so `depends` ends here safely
`911`		`- res_axis_size = ti._cumsum_1d(`
	`912`	`+ res_axis_size = _cumsum_1d(`
`912`	`913`	`rep_buf, cumsum, sycl_queue=exec_q, depends=[copy_ev]`
`913`	`914`	`)`
`914`	`915`	`if axis is not None:`
`@@ -940,7 +941,7 @@ def repeat(x, repeats, /, *, axis=None):`
`940`	`941`	`usm_type=usm_type,`
`941`	`942`	`sycl_queue=exec_q,`
`942`	`943`	`)`
`943`		`- res_axis_size = ti._cumsum_1d(`
	`944`	`+ res_axis_size = _cumsum_1d(`
`944`	`945`	`repeats, cumsum, sycl_queue=exec_q, depends=dep_evs`
`945`	`946`	`)`
`946`	`947`	`if axis is not None:`