From fe7ec77e8a4fabdd6c88e76405519da4d578d4f0 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Tue, 2 Sep 2025 07:02:58 -0700
Subject: [PATCH 1/5] Optimize batched _lu_factor by using single allocation
 with batch-axis views

---
 dpnp/linalg/dpnp_utils_linalg.py | 103 ++++++++++---------------------
 1 file changed, 33 insertions(+), 70 deletions(-)

diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py
index 2b8eef552aa..8e0f79c1168 100644
--- a/dpnp/linalg/dpnp_utils_linalg.py
+++ b/dpnp/linalg/dpnp_utils_linalg.py
@@ -297,26 +297,27 @@ def _batched_lu_factor(a, res_type):
     batch_size = a.shape[0]
     a_usm_arr = dpnp.get_usm_ndarray(a)
 
+    # `a` must be copied because getrf_batch destroys the input matrix
+    a_h = dpnp.empty_like(a, order="C", dtype=res_type)
+    ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
+        src=a_usm_arr,
+        dst=a_h.get_array(),
+        sycl_queue=a_sycl_queue,
+        depends=_manager.submitted_events,
+    )
+    _manager.add_event_pair(ht_ev, copy_ev)
+
+    ipiv_h = dpnp.empty(
+        (batch_size, n),
+        dtype=dpnp.int64,
+        order="C",
+        usm_type=a_usm_type,
+        sycl_queue=a_sycl_queue,
+    )
+
     if use_batch:
-        # `a` must be copied because getrf_batch destroys the input matrix
-        a_h = dpnp.empty_like(a, order="C", dtype=res_type)
-        ipiv_h = dpnp.empty(
-            (batch_size, n),
-            dtype=dpnp.int64,
-            order="C",
-            usm_type=a_usm_type,
-            sycl_queue=a_sycl_queue,
-        )
         dev_info_h = [0] * batch_size
 
-        ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=a_usm_arr,
-            dst=a_h.get_array(),
-            sycl_queue=a_sycl_queue,
-            depends=_manager.submitted_events,
-        )
-        _manager.add_event_pair(ht_ev, copy_ev)
-
         ipiv_stride = n
         a_stride = a_h.strides[0]
 
@@ -336,63 +337,25 @@ def _batched_lu_factor(a, res_type):
         )
         _manager.add_event_pair(ht_ev, getrf_ev)
 
-        dev_info_array = dpnp.array(
-            dev_info_h, usm_type=a_usm_type, sycl_queue=a_sycl_queue
-        )
-
-        # Reshape the results back to their original shape
-        a_h = a_h.reshape(orig_shape)
-        ipiv_h = ipiv_h.reshape(orig_shape[:-1])
-        dev_info_array = dev_info_array.reshape(orig_shape[:-2])
-
-        return (a_h, ipiv_h, dev_info_array)
-
-    # Initialize lists for storing arrays and events for each batch
-    a_vecs = [None] * batch_size
-    ipiv_vecs = [None] * batch_size
-    dev_info_vecs = [None] * batch_size
-
-    dep_evs = _manager.submitted_events
-
-    # Process each batch
-    for i in range(batch_size):
-        # Copy each 2D slice to a new array because getrf will destroy
-        # the input matrix
-        a_vecs[i] = dpnp.empty_like(a[i], order="C", dtype=res_type)
-
-        ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=a_usm_arr[i],
-            dst=a_vecs[i].get_array(),
-            sycl_queue=a_sycl_queue,
-            depends=dep_evs,
-        )
-        _manager.add_event_pair(ht_ev, copy_ev)
-
-        ipiv_vecs[i] = dpnp.empty(
-            (n,),
-            dtype=dpnp.int64,
-            order="C",
-            usm_type=a_usm_type,
-            sycl_queue=a_sycl_queue,
-        )
-        dev_info_vecs[i] = [0]
+    else:
+        dev_info_h = [[0] for _ in range(batch_size)]
 
-        # Call the LAPACK extension function _getrf
-        # to perform LU decomposition on each batch in 'a_vecs[i]'
-        ht_ev, getrf_ev = li._getrf(
-            a_sycl_queue,
-            a_vecs[i].get_array(),
-            ipiv_vecs[i].get_array(),
-            dev_info_vecs[i],
-            depends=[copy_ev],
-        )
-        _manager.add_event_pair(ht_ev, getrf_ev)
+        # Sequential LU factorization using getrf per slice
+        for i in range(batch_size):
+            ht_ev, getrf_ev = li._getrf(
+                a_sycl_queue,
+                a_h[i].get_array(),
+                ipiv_h[i].get_array(),
+                dev_info_h[i],
+                depends=[copy_ev],
+            )
+            _manager.add_event_pair(ht_ev, getrf_ev)
 
     # Reshape the results back to their original shape
-    out_a = dpnp.array(a_vecs, order="C").reshape(orig_shape)
-    out_ipiv = dpnp.array(ipiv_vecs).reshape(orig_shape[:-1])
+    out_a = a_h.reshape(orig_shape)
+    out_ipiv = ipiv_h.reshape(orig_shape[:-1])
     out_dev_info = dpnp.array(
-        dev_info_vecs, usm_type=a_usm_type, sycl_queue=a_sycl_queue
+        dev_info_h, usm_type=a_usm_type, sycl_queue=a_sycl_queue
     ).reshape(orig_shape[:-2])
 
     return (out_a, out_ipiv, out_dev_info)

From 3ce5ff093d1581ee57fbd97749bdda289e5c1c33 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Tue, 2 Sep 2025 07:13:52 -0700
Subject: [PATCH 2/5] Update changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5f3edc23d96..ceed069e2b3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -35,6 +35,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 * FFT module is updated to perform in-place FFT in intermediate steps of ND FFT [#2543](https://github.com/IntelPython/dpnp/pull/2543)
 * Reused dpctl tensor include to enable experimental SYCL namespace for complex types [#2546](https://github.com/IntelPython/dpnp/pull/2546)
 * Changed Windows-specific logic in dpnp initialization [#2553](https://github.com/IntelPython/dpnp/pull/2553)
+* Improved performance of `dpnp.linalg.det` and `dpnp.linalg.slogdet` for batched GPU inputs [#2572](https://github.com/IntelPython/dpnp/pull/2572)
 
 ### Deprecated
 

From d3de917b9fc8ea6a0ff3b9c914fc36f7454654bc Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Wed, 3 Sep 2025 03:16:02 -0700
Subject: [PATCH 3/5] Add temporary perf.py benchmark for batched slogdet

---
 perf.py | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100644 perf.py

diff --git a/perf.py b/perf.py
new file mode 100644
index 00000000000..246d9e77503
--- /dev/null
+++ b/perf.py
@@ -0,0 +1,30 @@
+import dpnp
+import numpy as np
+from dpnp.tests.helper import generate_random_numpy_array
+import time
+from IPython import get_ipython
+
+ipython = get_ipython()
+if ipython is None:
+    from IPython.terminal.interactiveshell import TerminalInteractiveShell
+    ipython = TerminalInteractiveShell()
+
+
+dtypes = ['f4', 'f8', 'c8', 'c16']
+n = 256
+print(f"size: ({n},{n},{n}) ")
+for dtype in dtypes:
+    print(f"\n=== dtype: {dtype} ===")
+    a = generate_random_numpy_array((n,n,n), dtype=dtype, seed_value=81)
+
+    # dpnp arrays on GPU
+    a_dp = dpnp.array(a, device='gpu')
+    exec_q = a_dp.sycl_queue
+
+    # Cold run
+    _ = dpnp.linalg.slogdet(a_dp)
+    exec_q.wait()
+
+    time.sleep(1)
+    print("DPNP (GPU, Old):")
+    ipython.run_line_magic('timeit', 'dpnp.linalg.slogdet(a_dp); exec_q.wait()')

From c7b72619d6e10fb710f70931c7898c1bcd12940c Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Mon, 8 Sep 2025 03:24:38 -0700
Subject: [PATCH 4/5] Apply remarks

---
 CHANGELOG.md                     | 2 +-
 dpnp/linalg/dpnp_utils_linalg.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f220c38d93d..214da78db3d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -40,7 +40,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 * Refactored backend implementation of `dpnp.linalg.solve` to use oneMKL LAPACK `gesv` directly [#2558](https://github.com/IntelPython/dpnp/pull/2558)
 * Improved performance of `dpnp.isclose` function by implementing a dedicated kernel for scalar `rtol` and `atol` arguments [#2540](https://github.com/IntelPython/dpnp/pull/2540)
 * Extended `dpnp.pad` to support `pad_width` keyword as a dictionary [#2535](https://github.com/IntelPython/dpnp/pull/2535)
-* Improved performance of `dpnp.linalg.det` and `dpnp.linalg.slogdet` for batched GPU inputs [#2572](https://github.com/IntelPython/dpnp/pull/2572)
+* Improved performance of batched implementation of `dpnp.linalg.det` and `dpnp.linalg.slogdet` [#2572](https://github.com/IntelPython/dpnp/pull/2572)
 
 ### Deprecated
 
diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py
index ddbd211eed4..bb2920e3a99 100644
--- a/dpnp/linalg/dpnp_utils_linalg.py
+++ b/dpnp/linalg/dpnp_utils_linalg.py
@@ -297,7 +297,7 @@ def _batched_lu_factor(a, res_type):
     batch_size = a.shape[0]
     a_usm_arr = dpnp.get_usm_ndarray(a)
 
-    # `a` must be copied because getrf_batch destroys the input matrix
+    # `a` must be copied because getrf/getrf_batch destroys the input matrix
     a_h = dpnp.empty_like(a, order="C", dtype=res_type)
     ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
         src=a_usm_arr,

From 4e08d07352e23519b0aac4220c7aea0ce348c404 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Mon, 8 Sep 2025 04:03:41 -0700
Subject: [PATCH 5/5] Remove perf.py

---
 perf.py | 30 ------------------------------
 1 file changed, 30 deletions(-)
 delete mode 100644 perf.py

diff --git a/perf.py b/perf.py
deleted file mode 100644
index 246d9e77503..00000000000
--- a/perf.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import dpnp
-import numpy as np
-from dpnp.tests.helper import generate_random_numpy_array
-import time
-from IPython import get_ipython
-
-ipython = get_ipython()
-if ipython is None:
-    from IPython.terminal.interactiveshell import TerminalInteractiveShell
-    ipython = TerminalInteractiveShell()
-
-
-dtypes = ['f4', 'f8', 'c8', 'c16']
-n = 256
-print(f"size: ({n},{n},{n}) ")
-for dtype in dtypes:
-    print(f"\n=== dtype: {dtype} ===")
-    a = generate_random_numpy_array((n,n,n), dtype=dtype, seed_value=81)
-
-    # dpnp arrays on GPU
-    a_dp = dpnp.array(a, device='gpu')
-    exec_q = a_dp.sycl_queue
-
-    # Cold run
-    _ = dpnp.linalg.slogdet(a_dp)
-    exec_q.wait()
-
-    time.sleep(1)
-    print("DPNP (GPU, Old):")
-    ipython.run_line_magic('timeit', 'dpnp.linalg.slogdet(a_dp); exec_q.wait()')