diff --git a/CHANGELOG.md b/CHANGELOG.md index 01c6375f8194..864fb2fb79aa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [0.16.0] - 09/DD/2024 +## [0.16.0] - 10/14/2024 This release reaches an important milestone by making offloading fully asynchronous. Calls to `dpnp` submit tasks for execution to DPC++ runtime and return without waiting for execution of these tasks to finish. The sequential semantics a user comes to expect from execution of Python script is preserved though. In addition, this release completes implementation of `dpnp.fft` module and adds several new array manipulation, indexing and elementwise routines. Moreover, it adds support to build `dpnp` for Nvidia GPUs. @@ -120,7 +120,6 @@ In addition, this release completes implementation of `dpnp.fft` module and adds * Fixed a crash in `dpnp.choose` caused by missing control of releasing temporary allocated device memory [#2063](https://github.com/IntelPython/dpnp/pull/2063) * Resolved compilation warning and error while building in debug mode [#2066](https://github.com/IntelPython/dpnp/pull/2066) * Fixed an issue with asynchronous execution in `dpnp.fft` module [#2067](https://github.com/IntelPython/dpnp/pull/2067) -* Added a workaround to fix the incorrect result from `dpnp.matmul` computing on Lunar Lake or Arrow Lake Battlemage graphics [#2082](https://github.com/IntelPython/dpnp/pull/2082) ## [0.15.0] - 05/25/2024 diff --git a/dpnp/backend/extensions/blas/blas_py.cpp b/dpnp/backend/extensions/blas/blas_py.cpp index a17ff03563ae..aa5ef52be9e2 100644 --- a/dpnp/backend/extensions/blas/blas_py.cpp +++ b/dpnp/backend/extensions/blas/blas_py.cpp @@ -127,13 +127,6 @@ PYBIND11_MODULE(_blas_impl, m) py::arg("resultC"), py::arg("depends") = py::list()); } - { - m.def("_is_lnl_bm_architecture", &blas_ns::_is_lnl_bm_architecture, - "Return ``True`` if SYCL device belongs to either Lunar Lake or " - "Battlemage G21 Intel GPU architecture", - py::arg("device")); - } - { m.def("_gemm_batch", &blas_ns::gemm_batch, "Call `gemm_batch` from OneMKL BLAS library to compute " diff --git a/dpnp/backend/extensions/blas/gemm.cpp b/dpnp/backend/extensions/blas/gemm.cpp index 42487eb9286a..e7043af59d25 100644 --- a/dpnp/backend/extensions/blas/gemm.cpp +++ b/dpnp/backend/extensions/blas/gemm.cpp @@ -323,22 +323,6 @@ std::tuple return std::make_tuple(args_ev, gemm_ev, is_row_major); } -bool _is_lnl_bm_architecture(const sycl::device &dev) -{ -#if !defined(USE_ONEMKL_CUBLAS) - namespace syclex = sycl::ext::oneapi::experimental; - const auto arch = dev.get_info(); - switch (arch) { - case syclex::architecture::intel_gpu_lnl_m: /* Lunar Lake */ - case syclex::architecture::intel_gpu_bmg_g21: /* Battlemage G21 */ - return true; - default: - return false; - } -#endif // !defined(USE_ONEMKL_CUBLAS) - return false; -} - template struct GemmContigFactory { diff --git a/dpnp/backend/extensions/blas/gemm.hpp b/dpnp/backend/extensions/blas/gemm.hpp index fed42755753e..ee14400ae254 100644 --- a/dpnp/backend/extensions/blas/gemm.hpp +++ b/dpnp/backend/extensions/blas/gemm.hpp @@ -39,8 +39,6 @@ extern std::tuple const dpctl::tensor::usm_ndarray &resultC, const std::vector &depends); -extern bool _is_lnl_bm_architecture(const sycl::device &dev); - extern std::tuple gemm_batch(sycl::queue &exec_q, const dpctl::tensor::usm_ndarray &matrixA, diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py index f65ad3737b41..e15bd93d7bbd 100644 --- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py +++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py @@ -894,34 +894,6 @@ def dpnp_matmul( ) _manager.add_event_pair(ht_ev, gemv_ev) elif call_flag == "gemm": - # MKLD-17976: due to known issue in OneMKL on Lunar Lake and - # Battlemage G21 Intel GPU architectures, it forces - # to implement a temporary workaround with extra copying of - # an input array in case when it has a small size and - # non-zero offset - # The issue was detected by failing tests for eig/eigh - # TODO: remove the workaround once OneMKL issue is resolved - if bi._is_lnl_bm_architecture(exec_q.get_sycl_device()): - - def _need_to_copy(a): - a_usm = dpnp.get_usm_ndarray(a) - if a_usm._element_offset > 0 and a_usm.size < 16: - return True - return False - - x1 = _copy_array( - x1, - copy_flag=_need_to_copy(x1), - dtype=compute_dtype, - order=res_order, - ) - x2 = _copy_array( - x2, - copy_flag=_need_to_copy(x2), - dtype=compute_dtype, - order=res_order, - ) - result = _gemm_matmul( exec_q, x1, diff --git a/tests/test_mathematical.py b/tests/test_mathematical.py index dbaab556e0c2..74d61e798703 100644 --- a/tests/test_mathematical.py +++ b/tests/test_mathematical.py @@ -3824,6 +3824,24 @@ def test_matmul_alias(self): result2 = dpnp.linalg.matmul(a, b) assert_array_equal(result1, result2) + @pytest.mark.parametrize( + "sh1, sh2", + [ + ((2, 3, 3), (2, 3, 3)), + ((3, 3, 3, 3), (3, 3, 3, 3)), + ], + ids=["gemm", "gemm_batch"], + ) + def test_matmul_with_offsets(self, sh1, sh2): + size1, size2 = numpy.prod(sh1, dtype=int), numpy.prod(sh2, dtype=int) + a = numpy.random.randint(-5, 5, size1).reshape(sh1).astype("f8") + b = numpy.random.randint(-5, 5, size2).reshape(sh2).astype("f8") + ia, ib = dpnp.array(a), dpnp.array(b) + + result = ia[1] @ ib[1] + expected = a[1] @ b[1] + assert_array_equal(result, expected) + class TestMatmulInvalidCases: @pytest.mark.parametrize(