Update _batched_lu_factor_scipy by using single allocation with batch-axis views

vlad-perevezentsev · vlad-perevezentsev · commit 2087283689dd · 2025-08-26T04:43:27.000-07:00
diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py
@@ -402,9 +402,9 @@ def _batched_lu_factor_scipy(a, res_type):  # pylint: disable=too-many-locals
     """SciPy-compatible LU factorization for batched inputs."""
 
     # TODO: Find out at which array sizes the best performance is obtained
-    # getrf_batch implementation shows slow results with large arrays on GPU.
+    # getrf_batch can be slow on large GPU arrays.
     # Use getrf_batch only on CPU.
-    # On GPU call getrf for each two-dimensional array by loop
+    # On GPU fall back to calling getrf per 2D slice.
     use_batch = a.sycl_device.has_aspect_cpu
 
     a_sycl_queue = a.sycl_queue
@@ -416,7 +416,7 @@ def _batched_lu_factor_scipy(a, res_type):  # pylint: disable=too-many-locals
     orig_shape = a.shape
     batch_shape = orig_shape[:-2]
 
-    # accommodate empty arrays
+    # handle empty input
     if a.size == 0:
         lu = dpnp.empty_like(a)
         piv = dpnp.empty(
@@ -431,32 +431,33 @@ def _batched_lu_factor_scipy(a, res_type):  # pylint: disable=too-many-locals
     a = dpnp.reshape(a, (-1, m, n))
     batch_size = a.shape[0]
 
-    if use_batch:
-        # Reorder the elements by moving the last two axes of `a` to the front
-        # to match fortran-like array order which is assumed by getrf_batch
-        a = dpnp.moveaxis(a, 0, -1)
+    # Move batch axis to the end (m, n, batch) in Fortran order:
+    # required by getrf_batch
+    # and ensures each a[..., i] is F-contiguous for getrf
+    a = dpnp.moveaxis(a, 0, -1)
 
-        a_usm_arr = dpnp.get_usm_ndarray(a)
+    a_usm_arr = dpnp.get_usm_ndarray(a)
 
-        # `a` must be copied because getrf_batch destroys the input matrix
-        a_h = dpnp.empty_like(a, order="F", dtype=res_type)
-        ipiv_h = dpnp.empty(
-            (batch_size, k),
-            dtype=dpnp.int64,
-            order="C",
-            usm_type=a_usm_type,
-            sycl_queue=a_sycl_queue,
-        )
+    # `a` must be copied because getrf/getrf_batch destroys the input matrix
+    a_h = dpnp.empty_like(a, order="F", dtype=res_type)
+    ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=a_usm_arr,
+        dst=a_h.get_array(),
+        sycl_queue=a_sycl_queue,
+        depends=_manager.submitted_events,
+    )
+    _manager.add_event_pair(ht_ev, copy_ev)
 
-        dev_info_h = [0] * batch_size
+    ipiv_h = dpnp.empty(
+        (batch_size, k),
+        dtype=dpnp.int64,
+        order="C",
+        usm_type=a_usm_type,
+        sycl_queue=a_sycl_queue,
+    )
 
-        ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=a_usm_arr,
-            dst=a_h.get_array(),
-            sycl_queue=a_sycl_queue,
-            depends=_manager.submitted_events,
-        )
-        _manager.add_event_pair(ht_ev, copy_ev)
+    if use_batch:
+        dev_info_h = [0] * batch_size
 
         ipiv_stride = k
         a_stride = a_h.strides[-1]
@@ -477,12 +478,6 @@ def _batched_lu_factor_scipy(a, res_type):  # pylint: disable=too-many-locals
         )
         _manager.add_event_pair(ht_ev, getrf_ev)
 
-        # getrf_batch expects `a` in Fortran order and overwrites it.
-        # Batch was moved to the last axis before the call.
-        # Move it back to the front and reshape to the original shape.
-        a_h = dpnp.moveaxis(a_h, -1, 0).reshape(orig_shape)
-        ipiv_h = ipiv_h.reshape((*batch_shape, k))
-
         if any(dev_info_h):
             diag_nums = ", ".join(str(v) for v in dev_info_h if v > 0)
             warn(
@@ -491,77 +486,41 @@ def _batched_lu_factor_scipy(a, res_type):  # pylint: disable=too-many-locals
                 RuntimeWarning,
                 stacklevel=2,
             )
+    else:
+        dev_info_vecs = [[0] for _ in range(batch_size)]
+
+        # Sequential LU factorization using getrf per slice
+        for i in range(batch_size):
+            ht_ev, getrf_ev = li._getrf(
+                a_sycl_queue,
+                a_h[..., i].get_array(),
+                ipiv_h[i].get_array(),
+                dev_info_vecs[i],
+                depends=[copy_ev],
+            )
+            _manager.add_event_pair(ht_ev, getrf_ev)
 
-        # MKL lapack uses 1-origin while SciPy uses 0-origin
-        ipiv_h -= 1
-
-        # Return a tuple containing the factorized matrix 'a_h',
-        # pivot indices 'ipiv_h'
-        return (a_h, ipiv_h)
-
-    a_usm_arr = dpnp.get_usm_ndarray(a)
-
-    # Initialize lists for storing arrays and events for each batch
-    a_vecs = [None] * batch_size
-    ipiv_vecs = [None] * batch_size
-    dev_info_vecs = [None] * batch_size
-    dep_evs = _manager.submitted_events
-
-    # Process each batch
-    for i in range(batch_size):
-        # Copy each 2D slice to a new array because getrf will destroy
-        # the input matrix
-        a_vecs[i] = dpnp.empty_like(a[i], order="F", dtype=res_type)
-        ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=a_usm_arr[i],
-            dst=a_vecs[i].get_array(),
-            sycl_queue=a_sycl_queue,
-            depends=dep_evs,
-        )
-        _manager.add_event_pair(ht_ev, copy_ev)
-
-        ipiv_vecs[i] = dpnp.empty(
-            (k,),
-            dtype=dpnp.int64,
-            order="C",
-            usm_type=a_usm_type,
-            sycl_queue=a_sycl_queue,
+        diag_nums = ", ".join(
+            str(v) for info in dev_info_vecs for v in info if v > 0
         )
+        if diag_nums:
+            warn(
+                f"Diagonal number {diag_nums} are exactly zero. "
+                "Singular matrix.",
+                RuntimeWarning,
+                stacklevel=2,
+            )
 
-        dev_info_vecs[i] = [0]
-
-        # Call the LAPACK extension function _getrf
-        # to perform LU decomposition on each batch in 'a_vecs[i]'
-        ht_ev, getrf_ev = li._getrf(
-            a_sycl_queue,
-            a_vecs[i].get_array(),
-            ipiv_vecs[i].get_array(),
-            dev_info_vecs[i],
-            depends=[copy_ev],
-        )
-        _manager.add_event_pair(ht_ev, getrf_ev)
-
-    # Reshape the results back to their original shape
-    out_a = dpnp.array(a_vecs).reshape(orig_shape)
-    out_ipiv = dpnp.array(ipiv_vecs).reshape((*batch_shape, k))
-
-    diag_nums = ", ".join(
-        str(v) for dev_info_h in dev_info_vecs for v in dev_info_h if v > 0
-    )
-
-    if diag_nums:
-        warn(
-            f"Diagonal number {diag_nums} are exactly zero. Singular matrix.",
-            RuntimeWarning,
-            stacklevel=2,
-        )
+    # Restore original shape: move batch axis back and reshape
+    a_h = dpnp.moveaxis(a_h, -1, 0).reshape(orig_shape)
+    ipiv_h = ipiv_h.reshape((*batch_shape, k))
 
-    # MKL lapack uses 1-origin while SciPy uses 0-origin
-    out_ipiv -= 1
+    # oneMKL LAPACK uses 1-origin while SciPy uses 0-origin
+    ipiv_h -= 1
 
-    # Return a tuple containing the factorized matrix 'out_a',
-    # pivot indices 'out_ipiv'
-    return (out_a, out_ipiv)
+    # Return a tuple containing the factorized matrix 'a_h',
+    # pivot indices 'ipiv_h'
+    return (a_h, ipiv_h)
 
 
 def _batched_solve(a, b, exec_q, res_usm_type, res_type):