|
| 1 | +//***************************************************************************** |
| 2 | +// Copyright (c) 2025, Intel Corporation |
| 3 | +// All rights reserved. |
| 4 | +// |
| 5 | +// Redistribution and use in source and binary forms, with or without |
| 6 | +// modification, are permitted provided that the following conditions are met: |
| 7 | +// - Redistributions of source code must retain the above copyright notice, |
| 8 | +// this list of conditions and the following disclaimer. |
| 9 | +// - Redistributions in binary form must reproduce the above copyright notice, |
| 10 | +// this list of conditions and the following disclaimer in the documentation |
| 11 | +// and/or other materials provided with the distribution. |
| 12 | +// |
| 13 | +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| 14 | +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 15 | +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 16 | +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
| 17 | +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 18 | +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 19 | +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| 20 | +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| 21 | +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| 22 | +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF |
| 23 | +// THE POSSIBILITY OF SUCH DAMAGE. |
| 24 | +//***************************************************************************** |
| 25 | + |
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <sstream>
#include <stdexcept>
#include <utility>
#include <vector>
| 29 | + |
| 30 | +#include <pybind11/pybind11.h> |
| 31 | +#include <sycl/sycl.hpp> |
| 32 | + |
| 33 | +// dpctl tensor headers |
| 34 | +#include "utils/memory_overlap.hpp" |
| 35 | +#include "utils/sycl_alloc_utils.hpp" |
| 36 | +#include "utils/type_dispatch.hpp" |
| 37 | +#include "utils/type_utils.hpp" |
| 38 | + |
| 39 | +#include "getrs.hpp" |
| 40 | +#include "linalg_exceptions.hpp" |
| 41 | +#include "types_matrix.hpp" |
| 42 | + |
| 43 | +namespace dpnp::extensions::lapack |
| 44 | +{ |
| 45 | +namespace mkl_lapack = oneapi::mkl::lapack; |
| 46 | +namespace py = pybind11; |
| 47 | +namespace type_utils = dpctl::tensor::type_utils; |
| 48 | +namespace td_ns = dpctl::tensor::type_dispatch; |
| 49 | + |
| 50 | +typedef sycl::event (*getrs_batch_impl_fn_ptr_t)( |
| 51 | + sycl::queue &, |
| 52 | + oneapi::mkl::transpose, // trans |
| 53 | + const std::int64_t, // n |
| 54 | + const std::int64_t, // nrhs |
| 55 | + char *, // a |
| 56 | + std::int64_t, // lda |
| 57 | + std::int64_t, // stride_a |
| 58 | + std::int64_t *, // ipiv |
| 59 | + std::int64_t, // stride_ipiv |
| 60 | + char *, // b |
| 61 | + std::int64_t, // ldb |
| 62 | + std::int64_t, // stride_b |
| 63 | + std::int64_t, // batch_size |
| 64 | + std::vector<sycl::event> &, |
| 65 | + const std::vector<sycl::event> &); |
| 66 | + |
| 67 | +static getrs_batch_impl_fn_ptr_t getrs_batch_dispatch_vector[td_ns::num_types]; |
| 68 | + |
// Type-specialized batched solve: calls oneMKL getrs_batch to solve
// op(A) * X = B for every matrix in the batch, reusing the LU factors in
// `in_a` and the pivot indices in `ipiv` produced by a prior factorization.
// The solution overwrites `in_b` in place.
//
// A device scratchpad is allocated for the MKL routine; it is released by
// a host task chained after the computation event (appended to
// `host_task_events` so the caller can keep arguments alive until then).
// MKL/SYCL exceptions are translated into std::runtime_error, except a
// positive `info` which is reported as LinAlgError.
template <typename T>
static sycl::event getrs_batch_impl(sycl::queue &exec_q,
                                    oneapi::mkl::transpose trans,
                                    const std::int64_t n,
                                    const std::int64_t nrhs,
                                    char *in_a,
                                    std::int64_t lda,
                                    std::int64_t stride_a,
                                    std::int64_t *ipiv,
                                    std::int64_t stride_ipiv,
                                    char *in_b,
                                    std::int64_t ldb,
                                    std::int64_t stride_b,
                                    std::int64_t batch_size,
                                    std::vector<sycl::event> &host_task_events,
                                    const std::vector<sycl::event> &depends)
{
    type_utils::validate_type_for_device<T>(exec_q);

    // Raw byte pointers arrive from the dispatch layer; reinterpret them
    // as the element type this specialization was instantiated for.
    T *a = reinterpret_cast<T *>(in_a);
    T *b = reinterpret_cast<T *>(in_b);

    // Ask MKL how much device workspace this problem size requires.
    const std::int64_t scratchpad_size =
        mkl_lapack::getrs_batch_scratchpad_size<T>(exec_q, trans, n, nrhs, lda,
                                                   stride_a, stride_ipiv, ldb,
                                                   stride_b, batch_size);
    T *scratchpad = nullptr;

    std::stringstream error_msg;
    std::int64_t info = 0;
    bool is_exception_caught = false;

    sycl::event getrs_batch_event;
    try {
        scratchpad = sycl::malloc_device<T>(scratchpad_size, exec_q);

        getrs_batch_event = mkl_lapack::getrs_batch(
            exec_q,
            trans, // Specifies the operation: whether or not to transpose
                   // matrix A. Can be 'N' for no transpose, 'T' for transpose,
                   // and 'C' for conjugate transpose.
            n, // The order of the square matrix A
               // and the number of rows in matrix B (0 ≤ n).
               // It must be a non-negative integer.
            nrhs, // The number of right-hand sides,
                  // i.e., the number of columns in matrix B (0 ≤ nrhs).
            a, // Pointer to the square matrix A (n x n).
            lda, // The leading dimension of matrix A, must be at least max(1,
                 // n). It must be at least max(1, n).
            stride_a, // Stride between consecutive A matrices in the batch.
            ipiv, // Pointer to the output array of pivot indices that were used
                  // during factorization (n, ).
            stride_ipiv, // Stride between consecutive pivot arrays in the
                         // batch.
            b, // Pointer to the matrix B of right-hand sides (ldb, nrhs).
            ldb, // The leading dimension of matrix B, must be at least max(1,
                 // n).
            stride_b, // Stride between consecutive B matrices in the batch.
            batch_size, // Total number of matrices in the batch.
            scratchpad, // Pointer to scratchpad memory to be used by MKL
                        // routine for storing intermediate results.
            scratchpad_size, depends);
    } catch (mkl_lapack::exception const &e) {
        is_exception_caught = true;
        info = e.info();

        if (info < 0) {
            // Negative info: the |info|-th argument passed to MKL was invalid.
            error_msg << "Parameter number " << -info
                      << " had an illegal value.";
        }
        else if (info == scratchpad_size && e.detail() != 0) {
            // MKL signals an undersized scratchpad by setting info to the
            // passed size and detail() to the required size.
            error_msg
                << "Insufficient scratchpad size. Required size is at least "
                << e.detail();
        }
        else if (info > 0) {
            // Positive info: a numerical failure (e.g. singular factor) —
            // surface this as a linear-algebra error, not a runtime error.
            // Free the scratchpad before throwing; no computation event
            // exists to chain the cleanup on.
            is_exception_caught = false;
            if (scratchpad != nullptr) {
                dpctl::tensor::alloc_utils::sycl_free_noexcept(scratchpad,
                                                               exec_q);
            }
            throw LinAlgError("The solve could not be completed.");
        }
        else {
            error_msg << "Unexpected MKL exception caught during getrs() "
                         "call:\nreason: "
                      << e.what() << "\ninfo: " << e.info();
        }
    } catch (sycl::exception const &e) {
        is_exception_caught = true;
        error_msg << "Unexpected SYCL exception caught during getrs() call:\n"
                  << e.what();
    }

    if (is_exception_caught) // an unexpected error occurs
    {
        // Release the scratchpad eagerly: the failed submission produced no
        // event for the deferred host-task cleanup below.
        if (scratchpad != nullptr) {
            dpctl::tensor::alloc_utils::sycl_free_noexcept(scratchpad, exec_q);
        }

        throw std::runtime_error(error_msg.str());
    }

    // Free the scratchpad only after the computation finishes, without
    // blocking the caller; the host task captures the context by value so
    // it stays valid until the task runs.
    sycl::event clean_up_event = exec_q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(getrs_batch_event);
        auto ctx = exec_q.get_context();
        cgh.host_task([ctx, scratchpad]() {
            dpctl::tensor::alloc_utils::sycl_free_noexcept(scratchpad, ctx);
        });
    });
    host_task_events.push_back(clean_up_event);
    return getrs_batch_event;
}
| 182 | + |
| 183 | +std::pair<sycl::event, sycl::event> |
| 184 | + getrs_batch(sycl::queue &exec_q, |
| 185 | + const dpctl::tensor::usm_ndarray &a_array, |
| 186 | + const dpctl::tensor::usm_ndarray &ipiv_array, |
| 187 | + const dpctl::tensor::usm_ndarray &b_array, |
| 188 | + oneapi::mkl::transpose trans, |
| 189 | + std::int64_t n, |
| 190 | + std::int64_t nrhs, |
| 191 | + std::int64_t stride_a, |
| 192 | + std::int64_t stride_ipiv, |
| 193 | + std::int64_t stride_b, |
| 194 | + std::int64_t batch_size, |
| 195 | + const std::vector<sycl::event> &depends) |
| 196 | +{ |
| 197 | + const int a_array_nd = a_array.get_ndim(); |
| 198 | + const int b_array_nd = b_array.get_ndim(); |
| 199 | + const int ipiv_array_nd = ipiv_array.get_ndim(); |
| 200 | + |
| 201 | + if (a_array_nd < 3) { |
| 202 | + throw py::value_error( |
| 203 | + "The LU-factorized array has ndim=" + std::to_string(a_array_nd) + |
| 204 | + ", but an array with ndim >= 3 is expected"); |
| 205 | + } |
| 206 | + if (b_array_nd < 3) { |
| 207 | + throw py::value_error("The right-hand sides array has ndim=" + |
| 208 | + std::to_string(b_array_nd) + |
| 209 | + ", but an array with ndim >= 3 is expected"); |
| 210 | + } |
| 211 | + if (ipiv_array_nd < 1) { |
| 212 | + throw py::value_error("The array of pivot indices has ndim=" + |
| 213 | + std::to_string(ipiv_array_nd) + |
| 214 | + ", but an array with ndim >= 2 is expected"); |
| 215 | + } |
| 216 | + |
| 217 | + if (ipiv_array_nd != a_array_nd - 1) { |
| 218 | + throw py::value_error( |
| 219 | + "The array of pivot indices has ndim=" + |
| 220 | + std::to_string(ipiv_array_nd) + |
| 221 | + ", but an array with ndim=" + std::to_string(a_array_nd - 1) + |
| 222 | + " is expected to match LU batch dimensions"); |
| 223 | + } |
| 224 | + |
| 225 | + const py::ssize_t *a_array_shape = a_array.get_shape_raw(); |
| 226 | + |
| 227 | + if (a_array_shape[a_array_nd - 1] != a_array_shape[a_array_nd - 2]) { |
| 228 | + throw py::value_error( |
| 229 | + "The last two dimensions of the LU array must be square," |
| 230 | + " but got a shape of (" + |
| 231 | + std::to_string(a_array_shape[a_array_nd - 1]) + ", " + |
| 232 | + std::to_string(a_array_shape[a_array_nd - 2]) + ")."); |
| 233 | + } |
| 234 | + |
| 235 | + // check compatibility of execution queue and allocation queue |
| 236 | + if (!dpctl::utils::queues_are_compatible(exec_q, |
| 237 | + {a_array, b_array, ipiv_array})) |
| 238 | + { |
| 239 | + throw py::value_error( |
| 240 | + "Execution queue is not compatible with allocation queues"); |
| 241 | + } |
| 242 | + |
| 243 | + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); |
| 244 | + if (overlap(a_array, b_array)) { |
| 245 | + throw py::value_error("The LU-factorized and right-hand sides arrays " |
| 246 | + "are overlapping segments of memory"); |
| 247 | + } |
| 248 | + |
| 249 | + bool is_a_array_c_contig = a_array.is_c_contiguous(); |
| 250 | + bool is_a_array_f_contig = a_array.is_f_contiguous(); |
| 251 | + bool is_b_array_f_contig = b_array.is_f_contiguous(); |
| 252 | + bool is_ipiv_array_c_contig = ipiv_array.is_c_contiguous(); |
| 253 | + bool is_ipiv_array_f_contig = ipiv_array.is_f_contiguous(); |
| 254 | + if (!is_a_array_c_contig && !is_a_array_f_contig) { |
| 255 | + throw py::value_error("The LU-factorized array " |
| 256 | + "must be either C-contiguous " |
| 257 | + "or F-contiguous"); |
| 258 | + } |
| 259 | + if (!is_b_array_f_contig) { |
| 260 | + throw py::value_error("The right-hand sides array " |
| 261 | + "must be F-contiguous"); |
| 262 | + } |
| 263 | + if (!is_ipiv_array_c_contig && !is_ipiv_array_f_contig) { |
| 264 | + throw py::value_error("The array of pivot indices " |
| 265 | + "must be contiguous"); |
| 266 | + } |
| 267 | + |
| 268 | + auto array_types = td_ns::usm_ndarray_types(); |
| 269 | + int a_array_type_id = |
| 270 | + array_types.typenum_to_lookup_id(a_array.get_typenum()); |
| 271 | + int b_array_type_id = |
| 272 | + array_types.typenum_to_lookup_id(b_array.get_typenum()); |
| 273 | + |
| 274 | + if (a_array_type_id != b_array_type_id) { |
| 275 | + throw py::value_error("The types of the LU-factorized and " |
| 276 | + "right-hand sides arrays are mismatched"); |
| 277 | + } |
| 278 | + |
| 279 | + getrs_batch_impl_fn_ptr_t getrs_batch_fn = |
| 280 | + getrs_batch_dispatch_vector[a_array_type_id]; |
| 281 | + if (getrs_batch_fn == nullptr) { |
| 282 | + throw py::value_error( |
| 283 | + "No getrs_batch implementation defined for the provided type " |
| 284 | + "of the input matrix."); |
| 285 | + } |
| 286 | + |
| 287 | + auto ipiv_types = td_ns::usm_ndarray_types(); |
| 288 | + int ipiv_array_type_id = |
| 289 | + ipiv_types.typenum_to_lookup_id(ipiv_array.get_typenum()); |
| 290 | + |
| 291 | + if (ipiv_array_type_id != static_cast<int>(td_ns::typenum_t::INT64)) { |
| 292 | + throw py::value_error("The type of 'ipiv_array' must be int64."); |
| 293 | + } |
| 294 | + |
| 295 | + const std::int64_t lda = std::max<size_t>(1UL, n); |
| 296 | + const std::int64_t ldb = std::max<size_t>(1UL, n); |
| 297 | + |
| 298 | + char *a_array_data = a_array.get_data(); |
| 299 | + char *b_array_data = b_array.get_data(); |
| 300 | + char *ipiv_array_data = ipiv_array.get_data(); |
| 301 | + |
| 302 | + std::int64_t *ipiv = reinterpret_cast<std::int64_t *>(ipiv_array_data); |
| 303 | + |
| 304 | + std::vector<sycl::event> host_task_events; |
| 305 | + sycl::event getrs_batch_ev = getrs_batch_fn( |
| 306 | + exec_q, trans, n, nrhs, a_array_data, lda, stride_a, ipiv, stride_ipiv, |
| 307 | + b_array_data, ldb, stride_b, batch_size, host_task_events, depends); |
| 308 | + |
| 309 | + sycl::event args_ev = dpctl::utils::keep_args_alive( |
| 310 | + exec_q, {a_array, b_array, ipiv_array}, host_task_events); |
| 311 | + |
| 312 | + return std::make_pair(args_ev, getrs_batch_ev); |
| 313 | +} |
| 314 | + |
| 315 | +template <typename fnT, typename T> |
| 316 | +struct GetrsBatchContigFactory |
| 317 | +{ |
| 318 | + fnT get() |
| 319 | + { |
| 320 | + if constexpr (types::GetrsBatchTypePairSupportFactory<T>::is_defined) { |
| 321 | + return getrs_batch_impl<T>; |
| 322 | + } |
| 323 | + else { |
| 324 | + return nullptr; |
| 325 | + } |
| 326 | + } |
| 327 | +}; |
| 328 | + |
| 329 | +void init_getrs_batch_dispatch_vector(void) |
| 330 | +{ |
| 331 | + td_ns::DispatchVectorBuilder<getrs_batch_impl_fn_ptr_t, |
| 332 | + GetrsBatchContigFactory, td_ns::num_types> |
| 333 | + contig; |
| 334 | + contig.populate_dispatch_vector(getrs_batch_dispatch_vector); |
| 335 | +} |
| 336 | +} // namespace dpnp::extensions::lapack |
0 commit comments