From fe7ec77e8a4fabdd6c88e76405519da4d578d4f0 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 2 Sep 2025 07:02:58 -0700 Subject: [PATCH 1/5] Optimize batched _lu_factor by using single allocation with batch-axis views --- dpnp/linalg/dpnp_utils_linalg.py | 103 ++++++++++--------------------- 1 file changed, 33 insertions(+), 70 deletions(-) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 2b8eef552aa..8e0f79c1168 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -297,26 +297,27 @@ def _batched_lu_factor(a, res_type): batch_size = a.shape[0] a_usm_arr = dpnp.get_usm_ndarray(a) + # `a` must be copied because getrf_batch destroys the input matrix + a_h = dpnp.empty_like(a, order="C", dtype=res_type) + ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_usm_arr, + dst=a_h.get_array(), + sycl_queue=a_sycl_queue, + depends=_manager.submitted_events, + ) + _manager.add_event_pair(ht_ev, copy_ev) + + ipiv_h = dpnp.empty( + (batch_size, n), + dtype=dpnp.int64, + order="C", + usm_type=a_usm_type, + sycl_queue=a_sycl_queue, + ) + if use_batch: - # `a` must be copied because getrf_batch destroys the input matrix - a_h = dpnp.empty_like(a, order="C", dtype=res_type) - ipiv_h = dpnp.empty( - (batch_size, n), - dtype=dpnp.int64, - order="C", - usm_type=a_usm_type, - sycl_queue=a_sycl_queue, - ) dev_info_h = [0] * batch_size - ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=a_usm_arr, - dst=a_h.get_array(), - sycl_queue=a_sycl_queue, - depends=_manager.submitted_events, - ) - _manager.add_event_pair(ht_ev, copy_ev) - ipiv_stride = n a_stride = a_h.strides[0] @@ -336,63 +337,25 @@ def _batched_lu_factor(a, res_type): ) _manager.add_event_pair(ht_ev, getrf_ev) - dev_info_array = dpnp.array( - dev_info_h, usm_type=a_usm_type, sycl_queue=a_sycl_queue - ) - - # Reshape the results back to their original shape - a_h = a_h.reshape(orig_shape) - ipiv_h = ipiv_h.reshape(orig_shape[:-1]) - dev_info_array = dev_info_array.reshape(orig_shape[:-2]) - - return (a_h, ipiv_h, dev_info_array) - - # Initialize lists for storing arrays and events for each batch - a_vecs = [None] * batch_size - ipiv_vecs = [None] * batch_size - dev_info_vecs = [None] * batch_size - - dep_evs = _manager.submitted_events - - # Process each batch - for i in range(batch_size): - # Copy each 2D slice to a new array because getrf will destroy - # the input matrix - a_vecs[i] = dpnp.empty_like(a[i], order="C", dtype=res_type) - - ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=a_usm_arr[i], - dst=a_vecs[i].get_array(), - sycl_queue=a_sycl_queue, - depends=dep_evs, - ) - _manager.add_event_pair(ht_ev, copy_ev) - - ipiv_vecs[i] = dpnp.empty( - (n,), - dtype=dpnp.int64, - order="C", - usm_type=a_usm_type, - sycl_queue=a_sycl_queue, - ) - dev_info_vecs[i] = [0] + else: + dev_info_h = [[0] for _ in range(batch_size)] - # Call the LAPACK extension function _getrf - # to perform LU decomposition on each batch in 'a_vecs[i]' - ht_ev, getrf_ev = li._getrf( - a_sycl_queue, - a_vecs[i].get_array(), - ipiv_vecs[i].get_array(), - dev_info_vecs[i], - depends=[copy_ev], - ) - _manager.add_event_pair(ht_ev, getrf_ev) + # Sequential LU factorization using getrf per slice + for i in range(batch_size): + ht_ev, getrf_ev = li._getrf( + a_sycl_queue, + a_h[i].get_array(), + ipiv_h[i].get_array(), + dev_info_h[i], + depends=[copy_ev], + ) + _manager.add_event_pair(ht_ev, getrf_ev) # Reshape the results back to their original shape - out_a = dpnp.array(a_vecs, order="C").reshape(orig_shape) - out_ipiv = dpnp.array(ipiv_vecs).reshape(orig_shape[:-1]) + out_a = a_h.reshape(orig_shape) + out_ipiv = ipiv_h.reshape(orig_shape[:-1]) out_dev_info = dpnp.array( - dev_info_vecs, usm_type=a_usm_type, sycl_queue=a_sycl_queue + dev_info_h, usm_type=a_usm_type, sycl_queue=a_sycl_queue ).reshape(orig_shape[:-2]) return (out_a, out_ipiv, out_dev_info) From 3ce5ff093d1581ee57fbd97749bdda289e5c1c33 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 2 Sep 2025 07:13:52 -0700 Subject: [PATCH 2/5] Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5f3edc23d96..ceed069e2b3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * FFT module is updated to perform in-place FFT in intermediate steps of ND FFT [#2543](https://github.com/IntelPython/dpnp/pull/2543) * Reused dpctl tensor include to enable experimental SYCL namespace for complex types [#2546](https://github.com/IntelPython/dpnp/pull/2546) * Changed Windows-specific logic in dpnp initialization [#2553](https://github.com/IntelPython/dpnp/pull/2553) +* Improved performance of `dpnp.linalg.det` and `dpnp.linalg.slogdet` for batched GPU inputs [#2572](https://github.com/IntelPython/dpnp/pull/2572) ### Deprecated From d3de917b9fc8ea6a0ff3b9c914fc36f7454654bc Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 3 Sep 2025 03:16:02 -0700 Subject: [PATCH 3/5] qwe --- perf.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 perf.py diff --git a/perf.py b/perf.py new file mode 100644 index 00000000000..246d9e77503 --- /dev/null +++ b/perf.py @@ -0,0 +1,30 @@ +import dpnp +import numpy as np +from dpnp.tests.helper import generate_random_numpy_array +import time +from IPython import get_ipython + +ipython = get_ipython() +if ipython is None: + from IPython.terminal.interactiveshell import TerminalInteractiveShell + ipython = TerminalInteractiveShell() + + +dtypes = ['f4', 'f8', 'c8', 'c16'] +n = 256 +print(f"size: ({n},{n},{n}) ") +for dtype in dtypes: + print(f"\n=== dtype: {dtype} ===") + a = generate_random_numpy_array((n,n,n), dtype=dtype, seed_value=81) + + # dpnp arrays on GPU + a_dp = dpnp.array(a, device='gpu') + exec_q = a_dp.sycl_queue + + # Cold run + _ = dpnp.linalg.slogdet(a_dp) + exec_q.wait() + + time.sleep(1) + print("DPNP (GPU, Old):") + ipython.run_line_magic('timeit', 'dpnp.linalg.slogdet(a_dp); exec_q.wait()') From c7b72619d6e10fb710f70931c7898c1bcd12940c Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 8 Sep 2025 03:24:38 -0700 Subject: [PATCH 4/5] Apply remarks --- CHANGELOG.md | 2 +- dpnp/linalg/dpnp_utils_linalg.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f220c38d93d..214da78db3d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -40,7 +40,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * Refactored backend implementation of `dpnp.linalg.solve` to use oneMKL LAPACK `gesv` directly [#2558](https://github.com/IntelPython/dpnp/pull/2558) * Improved performance of `dpnp.isclose` function by implementing a dedicated kernel for scalar `rtol` and `atol` arguments [#2540](https://github.com/IntelPython/dpnp/pull/2540) * Extended `dpnp.pad` to support `pad_width` keyword as a dictionary [#2535](https://github.com/IntelPython/dpnp/pull/2535) -* Improved performance of `dpnp.linalg.det` and `dpnp.linalg.slogdet` for batched GPU inputs [#2572](https://github.com/IntelPython/dpnp/pull/2572) +* Improved performance of batched implementation of `dpnp.linalg.det` and `dpnp.linalg.slogdet` [#2572](https://github.com/IntelPython/dpnp/pull/2572) ### Deprecated diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index ddbd211eed4..bb2920e3a99 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -297,7 +297,7 @@ def _batched_lu_factor(a, res_type): batch_size = a.shape[0] a_usm_arr = dpnp.get_usm_ndarray(a) - # `a` must be copied because getrf_batch destroys the input matrix + # `a` must be copied because getrf/getrf_batch destroys the input matrix a_h = dpnp.empty_like(a, order="C", dtype=res_type) ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=a_usm_arr, From 4e08d07352e23519b0aac4220c7aea0ce348c404 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 8 Sep 2025 04:03:41 -0700 Subject: [PATCH 5/5] Remove perf.py --- perf.py | 30 ------------------------------ 1 file changed, 30 deletions(-) delete mode 100644 perf.py diff --git a/perf.py b/perf.py deleted file mode 100644 index 246d9e77503..00000000000 --- a/perf.py +++ /dev/null @@ -1,30 +0,0 @@ -import dpnp -import numpy as np -from dpnp.tests.helper import generate_random_numpy_array -import time -from IPython import get_ipython - -ipython = get_ipython() -if ipython is None: - from IPython.terminal.interactiveshell import TerminalInteractiveShell - ipython = TerminalInteractiveShell() - - -dtypes = ['f4', 'f8', 'c8', 'c16'] -n = 256 -print(f"size: ({n},{n},{n}) ") -for dtype in dtypes: - print(f"\n=== dtype: {dtype} ===") - a = generate_random_numpy_array((n,n,n), dtype=dtype, seed_value=81) - - # dpnp arrays on GPU - a_dp = dpnp.array(a, device='gpu') - exec_q = a_dp.sycl_queue - - # Cold run - _ = dpnp.linalg.slogdet(a_dp) - exec_q.wait() - - time.sleep(1) - print("DPNP (GPU, Old):") - ipython.run_line_magic('timeit', 'dpnp.linalg.slogdet(a_dp); exec_q.wait()')