diff --git a/dpctl/apis/include/dpctl4pybind11.hpp b/dpctl/apis/include/dpctl4pybind11.hpp index fc067f4761..30cd7572ea 100644 --- a/dpctl/apis/include/dpctl4pybind11.hpp +++ b/dpctl/apis/include/dpctl4pybind11.hpp @@ -27,9 +27,9 @@ #include "dpctl_capi.h" #include -#include #include #include +#include #include #include #include diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index c236688842..aec9863bf0 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff --git a/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp b/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp index b8f46e0c3b..0bf062a6b6 100644 --- a/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp +++ b/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp @@ -173,6 +173,7 @@ as_c_contiguous_array_generic_impl(sycl::queue &exec_q, sycl::event copy_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); + cgh.use_kernel_bundle(kb); const sycl::range<1> gRange{n_groups * lws}; const sycl::range<1> lRange{lws}; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp index d2e2793319..ee955dcde5 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp @@ -25,6 +25,7 @@ #pragma once #include #include +#include #include #include diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp index 5d12e37f92..cbb079e3c5 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp @@ -26,6 +26,7 @@ #pragma once #include #include +#include #include #include "kernels/alignment.hpp" diff --git a/dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp b/dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp index 8d7e4a3f09..3eeec88f16 100644 --- a/dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp +++ b/dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include diff --git a/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp b/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp index 1026efcfe0..ef2769275b 100644 --- a/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp +++ b/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -1349,6 +1350,8 @@ sycl::event _gemm_batch_nm_impl(sycl::queue &exec_q, sycl::event gemm_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); + cgh.use_kernel_bundle(kb); + using LocAccT1 = sycl::local_accessor; LocAccT1 local_A_block(wg_delta_n * wi_delta_n * wi_delta_k, cgh); diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index 10dbd2fa40..905b22795e 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff --git a/dpctl/tensor/libtensor/include/utils/output_validation.hpp b/dpctl/tensor/libtensor/include/utils/output_validation.hpp index a124d63e97..01605dbce6 100644 --- a/dpctl/tensor/libtensor/include/utils/output_validation.hpp +++ b/dpctl/tensor/libtensor/include/utils/output_validation.hpp @@ -25,6 +25,8 @@ //===----------------------------------------------------------------------===// #pragma once +#include + #include "dpctl4pybind11.hpp" #include diff --git a/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp b/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp index 6c4e04e86b..d6f621436d 100644 --- a/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp +++ b/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp @@ -23,6 +23,7 @@ //===----------------------------------------------------------------------===// #pragma once +#include #include "dpctl4pybind11.hpp" #include "type_dispatch_building.hpp" diff --git a/dpctl/tensor/libtensor/include/utils/type_utils.hpp b/dpctl/tensor/libtensor/include/utils/type_utils.hpp index a007a91d0a..8207f7a68d 100644 --- a/dpctl/tensor/libtensor/include/utils/type_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/type_utils.hpp @@ -24,7 +24,7 @@ #pragma once #include -#include +#include #include #include diff --git a/dpctl/tensor/libtensor/source/accumulators.cpp b/dpctl/tensor/libtensor/source/accumulators.cpp index ee7d40b3a2..76c746ff35 100644 --- a/dpctl/tensor/libtensor/source/accumulators.cpp +++ b/dpctl/tensor/libtensor/source/accumulators.cpp @@ -22,15 +22,17 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===----------------------------------------------------------------------===// -#include "dpctl4pybind11.hpp" #include #include -#include -#include +#include #include #include #include +#include "dpctl4pybind11.hpp" +#include +#include + #include "kernels/accumulators.hpp" #include "simplify_iteration_space.hpp" #include "utils/memory_overlap.hpp" diff --git a/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp b/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp index b405768a7e..eb34289549 100644 --- a/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp +++ b/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp @@ -24,16 +24,18 @@ #pragma once -#include "dpctl4pybind11.hpp" #include #include -#include -#include -#include +#include #include #include #include +#include "dpctl4pybind11.hpp" +#include +#include +#include + #include "kernels/accumulators.hpp" #include "simplify_iteration_space.hpp" #include "utils/memory_overlap.hpp" diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp index 7b17a41b0c..905a65a9a6 100644 --- a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp @@ -23,15 +23,17 @@ /// dpctl.tensor.extract, dpctl.tensor.nonzero //===----------------------------------------------------------------------===// -#include "dpctl4pybind11.hpp" #include #include -#include -#include +#include #include #include #include +#include "dpctl4pybind11.hpp" +#include +#include + #include "boolean_advanced_indexing.hpp" #include "kernels/boolean_advanced_indexing.hpp" #include "simplify_iteration_space.hpp" diff --git a/dpctl/tensor/libtensor/source/clip.cpp b/dpctl/tensor/libtensor/source/clip.cpp index f5cad5d4ea..7688f5b61b 100644 --- a/dpctl/tensor/libtensor/source/clip.cpp +++ b/dpctl/tensor/libtensor/source/clip.cpp @@ -23,14 +23,16 @@ /// dpctl.tensor.clip //===----------------------------------------------------------------------===// -#include "dpctl4pybind11.hpp" #include #include +#include +#include +#include + +#include "dpctl4pybind11.hpp" #include #include #include -#include -#include #include "clip.hpp" #include "kernels/clip.hpp" diff --git a/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp index 6b10851869..42ac8f3cdb 100644 --- a/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp +++ b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp @@ -25,16 +25,17 @@ #include #include #include -#include -#include -#include -#include +#include #include #include #include #include #include "dpctl4pybind11.hpp" +#include +#include +#include +#include #include "kernels/copy_and_cast.hpp" #include "utils/memory_overlap.hpp" diff --git a/dpctl/tensor/libtensor/source/copy_for_reshape.cpp b/dpctl/tensor/libtensor/source/copy_for_reshape.cpp index 6e271a2125..d76775f78c 100644 --- a/dpctl/tensor/libtensor/source/copy_for_reshape.cpp +++ b/dpctl/tensor/libtensor/source/copy_for_reshape.cpp @@ -22,18 +22,20 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===----------------------------------------------------------------------===// +#include #include #include #include -#include "copy_for_reshape.hpp" #include "dpctl4pybind11.hpp" +#include + +#include "copy_for_reshape.hpp" #include "kernels/copy_and_cast.hpp" #include "utils/offset_utils.hpp" #include "utils/output_validation.hpp" #include "utils/sycl_alloc_utils.hpp" #include "utils/type_dispatch.hpp" -#include namespace dpctl { diff --git a/dpctl/tensor/libtensor/source/copy_for_roll.cpp b/dpctl/tensor/libtensor/source/copy_for_roll.cpp index ea127f15b8..774228c6a7 100644 --- a/dpctl/tensor/libtensor/source/copy_for_roll.cpp +++ b/dpctl/tensor/libtensor/source/copy_for_roll.cpp @@ -22,18 +22,20 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===----------------------------------------------------------------------===// +#include #include #include #include -#include "copy_for_roll.hpp" #include "dpctl4pybind11.hpp" +#include + +#include "copy_for_roll.hpp" #include "kernels/copy_and_cast.hpp" #include "utils/offset_utils.hpp" #include "utils/output_validation.hpp" #include "utils/sycl_alloc_utils.hpp" #include "utils/type_dispatch.hpp" -#include #include "simplify_iteration_space.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp index bd60b28e4f..e9efa734ad 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp @@ -24,13 +24,15 @@ //===----------------------------------------------------------------------===// #pragma once +#include +#include +#include +#include + #include "dpctl4pybind11.hpp" #include #include #include -#include -#include -#include #include "elementwise_functions_type_utils.hpp" #include "kernels/alignment.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp index dd7168beb1..8231995868 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp @@ -23,16 +23,18 @@ /// specifically functions for elementwise operations. //===----------------------------------------------------------------------===// -#include "dpctl4pybind11.hpp" #include #include -#include -#include -#include +#include #include #include #include +#include "dpctl4pybind11.hpp" +#include +#include +#include + #include "elementwise_functions.hpp" #include "simplify_iteration_space.hpp" #include "true_divide.hpp" diff --git a/dpctl/tensor/libtensor/source/full_ctor.cpp b/dpctl/tensor/libtensor/source/full_ctor.cpp index 27d8345bf6..4542598391 100644 --- a/dpctl/tensor/libtensor/source/full_ctor.cpp +++ b/dpctl/tensor/libtensor/source/full_ctor.cpp @@ -22,14 +22,16 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===--------------------------------------------------------------------===// -#include "dpctl4pybind11.hpp" #include -#include -#include +#include #include #include #include +#include "dpctl4pybind11.hpp" +#include +#include + #include "kernels/constructors.hpp" #include "utils/output_validation.hpp" #include "utils/type_dispatch.hpp" diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp index 316d8c2dc0..f38fab8a2d 100644 --- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -26,13 +26,15 @@ #include #include #include -#include -#include -#include +#include #include #include #include "dpctl4pybind11.hpp" +#include +#include +#include + #include "kernels/integer_advanced_indexing.hpp" #include "utils/memory_overlap.hpp" #include "utils/offset_utils.hpp" diff --git a/dpctl/tensor/libtensor/source/linalg_functions/dot.cpp b/dpctl/tensor/libtensor/source/linalg_functions/dot.cpp index e0fc581c10..dea278eb58 100644 --- a/dpctl/tensor/libtensor/source/linalg_functions/dot.cpp +++ b/dpctl/tensor/libtensor/source/linalg_functions/dot.cpp @@ -22,15 +22,17 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===--------------------------------------------------------------------===// -#include "dpctl4pybind11.hpp" #include -#include -#include -#include +#include #include #include #include +#include "dpctl4pybind11.hpp" +#include +#include +#include + #include "dot.hpp" #include "dot_atomic_support.hpp" #include "dot_dispatch.hpp" diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp index 6d5cf6668f..7076dc8827 100644 --- a/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp +++ b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp @@ -27,6 +27,7 @@ #include #include +#include #include #include #include diff --git a/dpctl/tensor/libtensor/source/repeat.cpp b/dpctl/tensor/libtensor/source/repeat.cpp index 2ea41dd49b..64a7ca3068 100644 --- a/dpctl/tensor/libtensor/source/repeat.cpp +++ b/dpctl/tensor/libtensor/source/repeat.cpp @@ -22,15 +22,17 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===--------------------------------------------------------------------===// -#include "dpctl4pybind11.hpp" #include #include -#include -#include +#include #include #include #include +#include "dpctl4pybind11.hpp" +#include +#include + #include "kernels/repeat.hpp" #include "simplify_iteration_space.hpp" #include "utils/memory_overlap.hpp" diff --git a/dpctl/tensor/libtensor/source/sorting/searchsorted.cpp b/dpctl/tensor/libtensor/source/sorting/searchsorted.cpp index b8e82a76ea..cf12601b1c 100644 --- a/dpctl/tensor/libtensor/source/sorting/searchsorted.cpp +++ b/dpctl/tensor/libtensor/source/sorting/searchsorted.cpp @@ -22,10 +22,12 @@ /// extension. //===--------------------------------------------------------------------===// +#include #include #include #include +#include "dpctl4pybind11.hpp" #include #include @@ -35,7 +37,6 @@ #include "utils/sycl_alloc_utils.hpp" #include "utils/type_dispatch.hpp" #include "utils/type_utils.hpp" -#include #include "rich_comparisons.hpp" #include "simplify_iteration_space.hpp" diff --git a/dpctl/tensor/libtensor/source/sorting/sort.cpp b/dpctl/tensor/libtensor/source/sorting/sort.cpp index 756e60bdf1..d79555d602 100644 --- a/dpctl/tensor/libtensor/source/sorting/sort.cpp +++ b/dpctl/tensor/libtensor/source/sorting/sort.cpp @@ -22,10 +22,11 @@ /// extension. //===--------------------------------------------------------------------===// +#include + #include "dpctl4pybind11.hpp" #include #include -#include #include "utils/math_utils.hpp" #include "utils/memory_overlap.hpp" @@ -127,9 +128,7 @@ py_sort(const dpctl::tensor::usm_ndarray &src, bool is_dst_c_contig = dst.is_c_contiguous(); if (is_src_c_contig && is_dst_c_contig) { - using dpctl::tensor::kernels::stable_sort_axis1_contig_impl; - - static constexpr py::ssize_t zero_offset = py::ssize_t(0); + constexpr py::ssize_t zero_offset = py::ssize_t(0); auto fn = stable_sort_contig_fns[src_typeid]; diff --git a/dpctl/tensor/libtensor/source/triul_ctor.cpp b/dpctl/tensor/libtensor/source/triul_ctor.cpp index c50ddbf373..e3c4a1420d 100644 --- a/dpctl/tensor/libtensor/source/triul_ctor.cpp +++ b/dpctl/tensor/libtensor/source/triul_ctor.cpp @@ -22,6 +22,7 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===--------------------------------------------------------------------===// +#include #include #include #include diff --git a/dpctl/tensor/libtensor/source/where.cpp b/dpctl/tensor/libtensor/source/where.cpp index 6ece2f9a86..2d1cf040b4 100644 --- a/dpctl/tensor/libtensor/source/where.cpp +++ b/dpctl/tensor/libtensor/source/where.cpp @@ -23,14 +23,16 @@ /// dpctl.tensor.where //===----------------------------------------------------------------------===// -#include "dpctl4pybind11.hpp" #include #include +#include +#include +#include + +#include "dpctl4pybind11.hpp" #include #include #include -#include -#include #include "kernels/where.hpp" #include "simplify_iteration_space.hpp" diff --git a/dpctl/tensor/libtensor/source/zeros_ctor.cpp b/dpctl/tensor/libtensor/source/zeros_ctor.cpp index e53a572efe..000f15d4e0 100644 --- a/dpctl/tensor/libtensor/source/zeros_ctor.cpp +++ b/dpctl/tensor/libtensor/source/zeros_ctor.cpp @@ -22,14 +22,16 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===--------------------------------------------------------------------===// -#include "dpctl4pybind11.hpp" #include -#include -#include +#include #include #include #include +#include "dpctl4pybind11.hpp" +#include +#include + #include "kernels/constructors.hpp" #include "utils/output_validation.hpp" #include "utils/type_dispatch.hpp"