diff --git a/.github/workflows/pr-arm.yml b/.github/workflows/pr-arm.yml new file mode 100644 index 000000000..fcd851df1 --- /dev/null +++ b/.github/workflows/pr-arm.yml @@ -0,0 +1,102 @@ +name: "PR Tests (aarch64)" +permissions: read-all + +# Trigger for PR and merge to develop branch +on: + push: + branches: develop + pull_request: + workflow_dispatch: + +env: + CTEST_OUTPUT_ON_FAILURE: 1 + LAPACK_VERSION: 3.12.0 + ARMPL_VERSION: 25.04.1 + DPCPP_VERSION: nightly-2025-04-01 + +jobs: + unit-tests: + runs-on: ubuntu-24.04-arm + # One runner for each domain + strategy: + fail-fast: false + matrix: + include: + - config: oneMath BLAS + domain: blas + build_options: -DREF_BLAS_ROOT=${PWD}/lapack/install + - config: oneMath LAPACK + domain: lapack + build_options: -DREF_LAPACK_ROOT=${PWD}/lapack/install + - config: oneMath RNG + domain: rng + test_options: -E 'Device|DEVICE' + name: unit tests ${{ matrix.config }} CPU + steps: + - uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b # v4.1.5 + - name: Check if the changes affect this domain + id: domain_check + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + with: + script: | + const domainCheck = require('.github/scripts/domain-check.js') + return domainCheck({github, context, domain: "${{ matrix.domain }}"}) + - name: Restore netlib from cache + id: cache-lapack + uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2 + with: + path: lapack/install + key: lapack-${{ env.LAPACK_VERSION }}-arm + - name: Install netlib + if: steps.domain_check.outputs.result == 'true' && steps.cache-lapack.outputs.cache-hit != 'true' + run: | + curl -sL https://github.com/Reference-LAPACK/lapack/archive/refs/tags/v${LAPACK_VERSION}.tar.gz | tar zx + SHARED_OPT="lapack-${LAPACK_VERSION} -DBUILD_SHARED_LIBS=on -DCBLAS=on -DLAPACKE=on -DCMAKE_INSTALL_PREFIX=${PWD}/lapack/install -G Ninja" + # 32 bit int + cmake ${SHARED_OPT} -B lapack/build32 + cmake --build lapack/build32 --target install + # 64 bit int + cmake ${SHARED_OPT} -DBUILD_INDEX64=on -B lapack/build64 + cmake --build lapack/build64 --target install + - name: Restore DPC++ from cache + id: cache-dpcpp + uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2 + with: + path: dpcpp/install + key: dpcpp-${{ env.DPCPP_VERSION }}-arm + - name: Build DPC++ + if: steps.domain_check.outputs.result == 'true' && steps.cache-dpcpp.outputs.cache-hit != 'true' + run: | + wget -P dpcpp https://github.com/intel/llvm/archive/refs/tags/${{ env.DPCPP_VERSION }}.tar.gz + cd dpcpp + tar -xzf ${{ env.DPCPP_VERSION }}.tar.gz + python llvm-${{ env.DPCPP_VERSION }}/buildbot/configure.py -o build -t Release --cmake-gen Ninja --native_cpu --cmake-opt="-DSYCL_ENABLE_BACKENDS=native_cpu" --cmake-opt="-DCMAKE_INSTALL_PREFIX=$PWD/install" --llvm-external-projects=openmp + cd build + ninja deploy-sycl-toolchain omp install + - name: Install ArmPL + if: steps.domain_check.outputs.result == 'true' + run: | + mkdir armpl + cd armpl + wget https://developer.arm.com/-/cdn-downloads/permalink/Arm-Performance-Libraries/Version_${{ env.ARMPL_VERSION }}/arm-performance-libraries_${{ env.ARMPL_VERSION }}_deb_gcc.tar + tar -xf arm-performance-libraries_${{ env.ARMPL_VERSION }}_deb_gcc.tar + ./arm-performance-libraries_${{ env.ARMPL_VERSION }}_deb/arm-performance-libraries_${{ env.ARMPL_VERSION }}_deb.sh -a -i $PWD/install + - name: Configure/Build for a domain + if: steps.domain_check.outputs.result == 'true' + run: | + export PATH=${PWD}/armpl/install/armpl_${{ env.ARMPL_VERSION }}_gcc/bin:${PWD}/dpcpp/install/bin:${PATH} + export CPATH=${PWD}/armpl/install/armpl_${{ env.ARMPL_VERSION }}_gcc/include:${PWD}/dpcpp/install/include:${CPATH} + export LIBRARY_PATH=${PWD}/armpl/install/armpl_${{ env.ARMPL_VERSION }}_gcc/lib:${PWD}/dpcpp/install/lib:${PWD}/dpcpp/install/lib/aarch64-unknown-linux-gnu:${LIBRARY_PATH} + export LD_LIBRARY_PATH=${PWD}/armpl/install/armpl_${{ env.ARMPL_VERSION }}_gcc/lib:${PWD}/dpcpp/install/lib:${PWD}/dpcpp/install/lib/aarch64-unknown-linux-gnu:${LD_LIBRARY_PATH} + sycl-ls + cmake -DTARGET_DOMAINS=${{ matrix.domain }} -DENABLE_MKLCPU_BACKEND=off -DENABLE_MKLGPU_BACKEND=off -DENABLE_ARMPL_BACKEND=on -DARMPL_ROOT=${PWD}/armpl/install/armpl_${{ env.ARMPL_VERSION }}_gcc -DCMAKE_CXX_FLAGS='-fopenmp' -DCMAKE_VERBOSE_MAKEFILE=on ${{ matrix.build_options }} -B build -G Ninja + cmake --build build + - name: Run tests + if: steps.domain_check.outputs.result == 'true' + run: | + export PATH=${PWD}/armpl/install/armpl_${{ env.ARMPL_VERSION }}_gcc/bin:${PWD}/dpcpp/install/bin:${PATH} + export CPATH=${PWD}/armpl/install/armpl_${{ env.ARMPL_VERSION }}_gcc/include:${PWD}/dpcpp/install/include:${CPATH} + export LIBRARY_PATH=${PWD}/armpl/install/armpl_${{ env.ARMPL_VERSION }}_gcc/lib:${PWD}/dpcpp/install/lib:${PWD}/dpcpp/install/lib/aarch64-unknown-linux-gnu:${LIBRARY_PATH} + export LD_LIBRARY_PATH=${PWD}/armpl/install/armpl_${{ env.ARMPL_VERSION }}_gcc/lib:${PWD}/dpcpp/install/lib:${PWD}/dpcpp/install/lib/aarch64-unknown-linux-gnu:${LD_LIBRARY_PATH} + sycl-ls + ctest --test-dir build ${{ matrix.test_options }} diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 15b4b6227..316be8d47 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -1,4 +1,4 @@ -name: "PR Tests" +name: "PR Tests (x86_64)" permissions: read-all # Trigger for PR and merge to develop branch diff --git a/README.md b/README.md index 7c2766c1e..a423ac777 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,9 @@ # oneAPI Math Library (oneMath) +[![OpenSSF +Scorecard](https://api.securityscorecards.dev/projects/github.com/uxlfoundation/oneMath/badge)](https://securityscorecards.dev/viewer/?uri=github.com%2Fuxlfoundation%2FoneMath) + oneMath is an open-source implementation of the [oneMath specification](https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onemath/source/). It can work with multiple devices using multiple libraries (backends) underneath. The oneMath project was previously referred to as oneMKL Interfaces. oneMath is part of the [UXL Foundation](http://www.uxlfoundation.org). @@ -338,7 +341,7 @@ Supported compilers include: NVIDIA GPU NVIDIA cuFFT - Open DPC++ + Open DPC++
AdaptiveCpp Dynamic, Static @@ -349,7 +352,7 @@ Supported compilers include: AMD GPU AMD rocFFT - Open DPC++ + Open DPC++
AdaptiveCpp Dynamic, Static diff --git a/cmake/FindARMPL.cmake b/cmake/FindARMPL.cmake index 2572eeb0a..8e44ffbab 100644 --- a/cmake/FindARMPL.cmake +++ b/cmake/FindARMPL.cmake @@ -18,7 +18,7 @@ #=============================================================================== include_guard() -set(ARMPL_SEQ armpl_intp64) +set(ARMPL_SEQ armpl_int64) set(ARMPL_OMP armpl_int64_mp) include(FindPackageHandleStandardArgs) diff --git a/docs/building_the_project_with_adaptivecpp.rst b/docs/building_the_project_with_adaptivecpp.rst index 41e5b03f7..c4e1ccfca 100644 --- a/docs/building_the_project_with_adaptivecpp.rst +++ b/docs/building_the_project_with_adaptivecpp.rst @@ -57,7 +57,7 @@ additional guidance. The target architectures must be specified with ``HIP_TARGETS``. See the `AdaptiveCpp documentation `_. -If a backend library supports multiple domains (i.e. BLAS, RNG), it may be +If a backend library supports multiple domains (i.e. BLAS, DFT, RNG), it may be desirable to only enable selected domains. For this, the ``TARGET_DOMAINS`` variable should be set. For further details, see :ref:`_build_target_domains`. @@ -81,6 +81,9 @@ The most important supported build options are: * - ENABLE_CUBLAS_BACKEND - True, False - False + * - ENABLE_CUFFT_BACKEND + - True, False + - False * - ENABLE_CURAND_BACKEND - True, False - False @@ -93,6 +96,9 @@ The most important supported build options are: * - ENABLE_ROCBLAS_BACKEND - True, False - False + * - ENABLE_ROCFFT_BACKEND + - True, False + - False * - ENABLE_ROCRAND_BACKEND - True, False - False @@ -106,7 +112,7 @@ The most important supported build options are: - True, False - True * - TARGET_DOMAINS (list) - - blas, rng + - blas, dft, rng - All supported domains Some additional build options are given in @@ -120,8 +126,8 @@ Backends Building for CUDA ~~~~~~~~~~~~~~~~~ -The CUDA backends can be enabled with ``ENABLE_CUBLAS_BACKEND`` and -``ENABLE_CURAND_BACKEND``. +The CUDA backends can be enabled with ``ENABLE_CUBLAS_BACKEND``, +``ENABLE_CUFFT_BACKEND`` and ``ENABLE_CURAND_BACKEND``. The target architecture must be set using the ``HIPSYCL_TARGETS`` parameter. For example, to target a Nvidia A100 (Ampere architecture), set @@ -140,8 +146,8 @@ the CUDA libraries should be found automatically by CMake. Building for ROCm ~~~~~~~~~~~~~~~~~ -The ROCm backends can be enabled with ``ENABLE_ROCBLAS_BACKEND`` and -``ENABLE_ROCRAND_BACKEND``. +The ROCm backends can be enabled with ``ENABLE_ROCBLAS_BACKEND``, +``ENABLE_ROCFFT_BACKEND`` and ``ENABLE_ROCRAND_BACKEND``. The target architecture must be set using the ``HIPSYCL_TARGETS`` parameter. See the `AdaptiveCpp documentation diff --git a/src/dft/backends/cufft/backward.cpp b/src/dft/backends/cufft/backward.cpp index 475f1ea49..41483062e 100644 --- a/src/dft/backends/cufft/backward.cpp +++ b/src/dft/backends/cufft/backward.cpp @@ -76,7 +76,7 @@ ONEMATH_EXPORT void compute_backward(descriptor_type& desc, auto stream = detail::setup_stream(func_name, ih, plan); auto inout_native = reinterpret_cast*>( - ih.get_native_mem(inout_acc)); + ih.get_native_mem(inout_acc)); detail::cufft_execute>( func_name, stream, plan, reinterpret_cast(inout_native + offsets[0]), reinterpret_cast(inout_native + offsets[1])); @@ -121,14 +121,14 @@ ONEMATH_EXPORT void compute_backward(descriptor_type& desc, dft::detail::fft_enqueue_task(cgh, [=](sycl::interop_handle ih) { auto stream = detail::setup_stream(func_name, ih, plan); - auto in_native = reinterpret_cast( - reinterpret_cast*>( - ih.get_native_mem(in_acc)) + - offsets[0]); - auto out_native = reinterpret_cast( - reinterpret_cast*>( - ih.get_native_mem(out_acc)) + - offsets[1]); + auto in_native = + reinterpret_cast(reinterpret_cast*>( + ih.get_native_mem(in_acc)) + + offsets[0]); + auto out_native = + reinterpret_cast(reinterpret_cast*>( + ih.get_native_mem(out_acc)) + + offsets[1]); detail::cufft_execute>( func_name, stream, plan, in_native, out_native); }); diff --git a/src/dft/backends/cufft/commit.cpp b/src/dft/backends/cufft/commit.cpp index b6d2164ff..91cd17971 100644 --- a/src/dft/backends/cufft/commit.cpp +++ b/src/dft/backends/cufft/commit.cpp @@ -34,6 +34,8 @@ #include "oneapi/math/dft/detail/cufft/onemath_dft_cufft.hpp" #include "oneapi/math/dft/types.hpp" +#include "execute_helper.hpp" +#include "../../execute_helper_generic.hpp" #include "../stride_helper.hpp" #include @@ -84,7 +86,7 @@ class cufft_commit final : public dft::detail::commit_impl { if (fix_context) { // cufftDestroy changes the context so change it back. CUdevice interopDevice = - sycl::get_native(this->get_queue().get_device()); + sycl::get_native(this->get_queue().get_device()); CUcontext interopContext; if (cuDevicePrimaryCtxRetain(&interopContext, interopDevice) != CUDA_SUCCESS) { throw math::exception("dft/backends/cufft", __FUNCTION__, @@ -353,8 +355,8 @@ class cufft_commit final : public dft::detail::commit_impl { .submit([&](sycl::handler& cgh) { auto workspace_acc = buffer_workspace.template get_access(cgh); - cgh.host_task([=](sycl::interop_handle ih) { - auto stream = ih.get_native_queue(); + dft::detail::fft_enqueue_task(cgh, [=](sycl::interop_handle ih) { + auto stream = ih.get_native_queue(); auto result = cufftSetStream(plan, stream); if (result != CUFFT_SUCCESS) { throw oneapi::math::exception( @@ -362,7 +364,7 @@ class cufft_commit final : public dft::detail::commit_impl { "cufftSetStream returned " + std::to_string(result)); } auto workspace_native = reinterpret_cast( - ih.get_native_mem(workspace_acc)); + ih.get_native_mem(workspace_acc)); cufftSetWorkArea(plan, workspace_native); }); }) diff --git a/src/dft/backends/cufft/execute_helper.hpp b/src/dft/backends/cufft/execute_helper.hpp index da485fea2..3d5f6e791 100644 --- a/src/dft/backends/cufft/execute_helper.hpp +++ b/src/dft/backends/cufft/execute_helper.hpp @@ -36,6 +36,12 @@ namespace oneapi::math::dft::cufft::detail { +#if defined(__ADAPTIVECPP__) || defined(__HIPSYCL__) +constexpr auto sycl_cuda_backend{ sycl::backend::cuda }; +#else // DPC++ +constexpr auto sycl_cuda_backend{ sycl::backend::ext_oneapi_cuda }; +#endif + template inline dft::detail::commit_impl* checked_get_commit( dft::detail::descriptor& desc) { @@ -142,7 +148,7 @@ void cufft_execute(const std::string& func, CUstream stream, cufftHandle plan, v } inline CUstream setup_stream(const std::string& func, sycl::interop_handle ih, cufftHandle plan) { - auto stream = ih.get_native_queue(); + auto stream = ih.get_native_queue(); auto result = cufftSetStream(plan, stream); if (result != CUFFT_SUCCESS) { throw oneapi::math::exception("dft/backends/cufft", func, diff --git a/src/dft/backends/cufft/forward.cpp b/src/dft/backends/cufft/forward.cpp index 6b2867b5f..324c83142 100644 --- a/src/dft/backends/cufft/forward.cpp +++ b/src/dft/backends/cufft/forward.cpp @@ -79,7 +79,7 @@ ONEMATH_EXPORT void compute_forward(descriptor_type& desc, auto stream = detail::setup_stream(func_name, ih, plan); auto inout_native = reinterpret_cast*>( - ih.get_native_mem(inout_acc)); + ih.get_native_mem(inout_acc)); detail::cufft_execute>( func_name, stream, plan, reinterpret_cast(inout_native + offsets[0]), reinterpret_cast(inout_native + offsets[1])); @@ -124,14 +124,14 @@ ONEMATH_EXPORT void compute_forward(descriptor_type& desc, dft::detail::fft_enqueue_task(cgh, [=](sycl::interop_handle ih) { auto stream = detail::setup_stream(func_name, ih, plan); - auto in_native = reinterpret_cast( - reinterpret_cast*>( - ih.get_native_mem(in_acc)) + - offsets[0]); - auto out_native = reinterpret_cast( - reinterpret_cast*>( - ih.get_native_mem(out_acc)) + - offsets[1]); + auto in_native = + reinterpret_cast(reinterpret_cast*>( + ih.get_native_mem(in_acc)) + + offsets[0]); + auto out_native = + reinterpret_cast(reinterpret_cast*>( + ih.get_native_mem(out_acc)) + + offsets[1]); detail::cufft_execute>( func_name, stream, plan, in_native, out_native); }); diff --git a/src/dft/backends/rocfft/commit.cpp b/src/dft/backends/rocfft/commit.cpp index 4c5d51d2f..47f12f336 100644 --- a/src/dft/backends/rocfft/commit.cpp +++ b/src/dft/backends/rocfft/commit.cpp @@ -34,6 +34,8 @@ #include "oneapi/math/dft/detail/rocfft/onemath_dft_rocfft.hpp" #include "oneapi/math/dft/types.hpp" +#include "execute_helper.hpp" +#include "../../execute_helper_generic.hpp" #include "../stride_helper.hpp" #include "rocfft_handle.hpp" @@ -557,9 +559,9 @@ class rocfft_commit final : public dft::detail::commit_impl { this->get_queue().submit([&](sycl::handler& cgh) { auto workspace_acc = buffer_workspace.template get_access(cgh); - cgh.host_task([=](sycl::interop_handle ih) { + dft::detail::fft_enqueue_task(cgh, [=](sycl::interop_handle ih) { auto workspace_native = reinterpret_cast( - ih.get_native_mem(workspace_acc)); + ih.get_native_mem(workspace_acc)); set_workspace_impl(handle, workspace_native, workspace_bytes, "set_workspace"); }); }); diff --git a/src/dft/backends/rocfft/execute_helper.hpp b/src/dft/backends/rocfft/execute_helper.hpp index c1ee6302b..27218d0ad 100644 --- a/src/dft/backends/rocfft/execute_helper.hpp +++ b/src/dft/backends/rocfft/execute_helper.hpp @@ -36,6 +36,12 @@ namespace oneapi::math::dft::rocfft::detail { +#if defined(__ADAPTIVECPP__) || defined(__HIPSYCL__) +constexpr auto sycl_hip_backend{ sycl::backend::hip }; +#else // DPC++ +constexpr auto sycl_hip_backend{ sycl::backend::ext_oneapi_hip }; +#endif + template inline dft::detail::commit_impl* checked_get_commit( dft::detail::descriptor& desc) { @@ -60,12 +66,12 @@ inline auto expect_config(DescT& desc, const char* message) { template inline void* native_mem(sycl::interop_handle& ih, Acc& buf) { - return ih.get_native_mem(buf); + return ih.get_native_mem(buf); } inline hipStream_t setup_stream(const std::string& func, sycl::interop_handle& ih, rocfft_execution_info info) { - auto stream = ih.get_native_queue(); + auto stream = ih.get_native_queue(); auto result = rocfft_execution_info_set_stream(info, stream); if (result != rocfft_status_success) { throw oneapi::math::exception( diff --git a/src/dft/execute_helper_generic.hpp b/src/dft/execute_helper_generic.hpp index 4f5ae6727..a40a3f465 100644 --- a/src/dft/execute_helper_generic.hpp +++ b/src/dft/execute_helper_generic.hpp @@ -39,7 +39,11 @@ namespace oneapi::math::dft::detail { */ template static inline void fft_enqueue_task(HandlerT&& cgh, FnT&& f) { -#ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND +#if defined(__ADAPTIVECPP__) + cgh.AdaptiveCpp_enqueue_custom_operation([=](sycl::interop_handle ih) { +#elif defined(__HIPSYCL__) + cgh.hipSYCL_enqueue_custom_operation([=](sycl::interop_handle ih) { +#elif defined(SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND) cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { #else cgh.host_task([=](sycl::interop_handle ih) { diff --git a/src/lapack/backends/armpl/armpl_common.hpp b/src/lapack/backends/armpl/armpl_common.hpp index 0e1d67560..8bdca7d3a 100644 --- a/src/lapack/backends/armpl/armpl_common.hpp +++ b/src/lapack/backends/armpl/armpl_common.hpp @@ -142,7 +142,12 @@ inline constexpr bool is_complex = true; template constexpr auto cast_to_int_if_complex(const T& alpha) { if constexpr (is_complex) { - return static_cast((*((T*)&alpha))); + //armpl 25.04 uses directly std::complex so most of the ArmEquivalentType gymnastics is redundant + if constexpr (std::is_same_v> || + std::is_same_v>) + return static_cast(alpha.real()); + else + return static_cast((*((T*)&alpha))); } else { return (std::int64_t)alpha; diff --git a/src/rng/backends/mklcpu/cpu_common.hpp b/src/rng/backends/mklcpu/cpu_common.hpp index 559f27960..4ee2e487e 100644 --- a/src/rng/backends/mklcpu/cpu_common.hpp +++ b/src/rng/backends/mklcpu/cpu_common.hpp @@ -34,20 +34,20 @@ namespace mklcpu { // host_task automatically uses run_on_host_intel if it is supported by the // compiler. Otherwise, it falls back to single_task. template -static inline auto host_task_internal(H& cgh, F f, int) -> decltype(cgh.host_task(f)) { - return cgh.host_task(f); +static inline auto host_task_internal(H& cgh, F&& f, int) { + return cgh.host_task(std::forward(f)); } template -static inline void host_task_internal(H& cgh, F f, long) { +static inline void host_task_internal(H& cgh, F&& f, long) { #ifndef __SYCL_DEVICE_ONLY__ - cgh.template single_task(f); + cgh.template single_task(std::forward(f)); #endif } template -static inline void host_task(H& cgh, F f) { - (void)host_task_internal(cgh, f, 0); +static inline void host_task(H& cgh, F&& f) { + (void)host_task_internal(cgh, std::forward(f), 0); } template diff --git a/tests/unit_tests/dft/source/descriptor_tests.cpp b/tests/unit_tests/dft/source/descriptor_tests.cpp index a4290e553..a825a1539 100644 --- a/tests/unit_tests/dft/source/descriptor_tests.cpp +++ b/tests/unit_tests/dft/source/descriptor_tests.cpp @@ -571,7 +571,9 @@ inline void recommit_values(sycl::queue& sycl_queue) { } template -inline void change_queue_causes_wait(sycl::queue& busy_queue) { +inline void change_queue_causes_wait([[maybe_unused]] sycl::queue& busy_queue) { +// Skip this test in AdaptiveCpp, which doesn't support host_task +#if !defined(__ADAPTIVECPP__) && !defined(__HIPSYCL__) // create a queue with work on it, and then show that work is waited on when the descriptor // is committed to a new queue. // its possible to have a false positive result, but a false negative should not be possible. @@ -616,6 +618,7 @@ inline void change_queue_causes_wait(sycl::queue& busy_queue) { // busy queue task has now completed. auto after_status = e.template get_info(); ASSERT_EQ(after_status, sycl::info::event_command_status::complete); +#endif } template