Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 102 additions & 0 deletions .github/workflows/pr-arm.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
name: "PR Tests (aarch64)"
# Read-only token for all jobs; no step needs write access.
permissions: read-all

# Trigger for PR and merge to develop branch
on:
  push:
    branches: develop
  pull_request:
  workflow_dispatch:

env:
  # Print failing test output directly in the CI log.
  CTEST_OUTPUT_ON_FAILURE: 1
  # Pinned dependency versions; these also key the caches below, so
  # bumping a version automatically invalidates the matching cache.
  LAPACK_VERSION: 3.12.0
  ARMPL_VERSION: 25.04.1
  DPCPP_VERSION: nightly-2025-04-01

jobs:
  unit-tests:
    runs-on: ubuntu-24.04-arm
    # One runner for each domain
    strategy:
      fail-fast: false
      matrix:
        include:
          - config: oneMath BLAS
            domain: blas
            build_options: -DREF_BLAS_ROOT=${PWD}/lapack/install
          - config: oneMath LAPACK
            domain: lapack
            build_options: -DREF_LAPACK_ROOT=${PWD}/lapack/install
          - config: oneMath RNG
            domain: rng
            # Exclude device tests (name-matched) — CPU-only runner.
            test_options: -E 'Device|DEVICE'
    name: unit tests ${{ matrix.config }} CPU
    steps:
      # Third-party actions are pinned to commit SHAs (comment shows the tag).
      - uses: actions/checkout@44c2b7a8a4ea60a981eaca3cf939b5f4305c123b # v4.1.5
      - name: Check if the changes affect this domain
        id: domain_check
        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
        with:
          script: |
            const domainCheck = require('.github/scripts/domain-check.js')
            return domainCheck({github, context, domain: "${{ matrix.domain }}"})
      # Reference BLAS/LAPACK build is expensive; cache the install tree,
      # keyed on LAPACK_VERSION so version bumps rebuild it.
      - name: Restore netlib from cache
        id: cache-lapack
        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
        with:
          path: lapack/install
          key: lapack-${{ env.LAPACK_VERSION }}-arm
      # Skipped entirely when the PR does not touch this domain, or when
      # the cache already holds the install tree.
      - name: Install netlib
        if: steps.domain_check.outputs.result == 'true' && steps.cache-lapack.outputs.cache-hit != 'true'
        run: |
          curl -sL https://github.com/Reference-LAPACK/lapack/archive/refs/tags/v${LAPACK_VERSION}.tar.gz | tar zx
          SHARED_OPT="lapack-${LAPACK_VERSION} -DBUILD_SHARED_LIBS=on -DCBLAS=on -DLAPACKE=on -DCMAKE_INSTALL_PREFIX=${PWD}/lapack/install -G Ninja"
          # 32 bit int
          cmake ${SHARED_OPT} -B lapack/build32
          cmake --build lapack/build32 --target install
          # 64 bit int
          cmake ${SHARED_OPT} -DBUILD_INDEX64=on -B lapack/build64
          cmake --build lapack/build64 --target install
      # The DPC++ toolchain build is the longest step; cache its install
      # tree keyed on DPCPP_VERSION.
      - name: Restore DPC++ from cache
        id: cache-dpcpp
        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
        with:
          path: dpcpp/install
          key: dpcpp-${{ env.DPCPP_VERSION }}-arm
      - name: Build DPC++
        if: steps.domain_check.outputs.result == 'true' && steps.cache-dpcpp.outputs.cache-hit != 'true'
        run: |
          wget -P dpcpp https://github.com/intel/llvm/archive/refs/tags/${{ env.DPCPP_VERSION }}.tar.gz
          cd dpcpp
          tar -xzf ${{ env.DPCPP_VERSION }}.tar.gz
          python llvm-${{ env.DPCPP_VERSION }}/buildbot/configure.py -o build -t Release --cmake-gen Ninja --native_cpu --cmake-opt="-DSYCL_ENABLE_BACKENDS=native_cpu" --cmake-opt="-DCMAKE_INSTALL_PREFIX=$PWD/install" --llvm-external-projects=openmp
          cd build
          ninja deploy-sycl-toolchain omp install
      # ArmPL install is quick, so it is not cached — only domain-gated.
      - name: Install ArmPL
        if: steps.domain_check.outputs.result == 'true'
        run: |
          mkdir armpl
          cd armpl
          wget https://developer.arm.com/-/cdn-downloads/permalink/Arm-Performance-Libraries/Version_${{ env.ARMPL_VERSION }}/arm-performance-libraries_${{ env.ARMPL_VERSION }}_deb_gcc.tar
          tar -xf arm-performance-libraries_${{ env.ARMPL_VERSION }}_deb_gcc.tar
          # NOTE(review): -a presumably accepts the licence and -i sets the
          # install prefix — confirm against the ArmPL installer docs.
          ./arm-performance-libraries_${{ env.ARMPL_VERSION }}_deb/arm-performance-libraries_${{ env.ARMPL_VERSION }}_deb.sh -a -i $PWD/install
      - name: Configure/Build for a domain
        if: steps.domain_check.outputs.result == 'true'
        run: |
          # These four exports put ArmPL and the freshly built DPC++ on the
          # tool/header/library search paths; the same exports are repeated
          # in the test step because each `run` is a separate shell.
          export PATH=${PWD}/armpl/install/armpl_${{ env.ARMPL_VERSION }}_gcc/bin:${PWD}/dpcpp/install/bin:${PATH}
          export CPATH=${PWD}/armpl/install/armpl_${{ env.ARMPL_VERSION }}_gcc/include:${PWD}/dpcpp/install/include:${CPATH}
          export LIBRARY_PATH=${PWD}/armpl/install/armpl_${{ env.ARMPL_VERSION }}_gcc/lib:${PWD}/dpcpp/install/lib:${PWD}/dpcpp/install/lib/aarch64-unknown-linux-gnu:${LIBRARY_PATH}
          export LD_LIBRARY_PATH=${PWD}/armpl/install/armpl_${{ env.ARMPL_VERSION }}_gcc/lib:${PWD}/dpcpp/install/lib:${PWD}/dpcpp/install/lib/aarch64-unknown-linux-gnu:${LD_LIBRARY_PATH}
          # Log the visible SYCL devices for debugging.
          sycl-ls
          cmake -DTARGET_DOMAINS=${{ matrix.domain }} -DENABLE_MKLCPU_BACKEND=off -DENABLE_MKLGPU_BACKEND=off -DENABLE_ARMPL_BACKEND=on -DARMPL_ROOT=${PWD}/armpl/install/armpl_${{ env.ARMPL_VERSION }}_gcc -DCMAKE_CXX_FLAGS='-fopenmp' -DCMAKE_VERBOSE_MAKEFILE=on ${{ matrix.build_options }} -B build -G Ninja
          cmake --build build
      - name: Run tests
        if: steps.domain_check.outputs.result == 'true'
        run: |
          export PATH=${PWD}/armpl/install/armpl_${{ env.ARMPL_VERSION }}_gcc/bin:${PWD}/dpcpp/install/bin:${PATH}
          export CPATH=${PWD}/armpl/install/armpl_${{ env.ARMPL_VERSION }}_gcc/include:${PWD}/dpcpp/install/include:${CPATH}
          export LIBRARY_PATH=${PWD}/armpl/install/armpl_${{ env.ARMPL_VERSION }}_gcc/lib:${PWD}/dpcpp/install/lib:${PWD}/dpcpp/install/lib/aarch64-unknown-linux-gnu:${LIBRARY_PATH}
          export LD_LIBRARY_PATH=${PWD}/armpl/install/armpl_${{ env.ARMPL_VERSION }}_gcc/lib:${PWD}/dpcpp/install/lib:${PWD}/dpcpp/install/lib/aarch64-unknown-linux-gnu:${LD_LIBRARY_PATH}
          sycl-ls
          # matrix.test_options filters the ctest run (e.g. excludes device
          # tests for rng); empty for the other domains.
          ctest --test-dir build ${{ matrix.test_options }}
2 changes: 1 addition & 1 deletion .github/workflows/pr.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: "PR Tests"
name: "PR Tests (x86_64)"
permissions: read-all

# Trigger for PR and merge to develop branch
Expand Down
7 changes: 5 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

# oneAPI Math Library (oneMath)

[![OpenSSF
Scorecard](https://api.securityscorecards.dev/projects/github.com/uxlfoundation/oneMath/badge)](https://securityscorecards.dev/viewer/?uri=github.com%2Fuxlfoundation%2FoneMath)

oneMath is an open-source implementation of the [oneMath specification](https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onemath/source/). It can work with multiple devices using multiple libraries (backends) underneath. The oneMath project was previously referred to as oneMKL Interfaces.

oneMath is part of the [UXL Foundation](http://www.uxlfoundation.org).
Expand Down Expand Up @@ -338,7 +341,7 @@ Supported compilers include:
<tr>
<td rowspan=2 align="center">NVIDIA GPU</td>
<td align="center">NVIDIA cuFFT</td>
<td align="center">Open DPC++</td>
<td align="center">Open DPC++<br/>AdaptiveCpp</td>
<td align="center">Dynamic, Static</td>
</tr>
<tr>
Expand All @@ -349,7 +352,7 @@ Supported compilers include:
<tr>
<td rowspan=2 align="center">AMD GPU</td>
<td align="center">AMD rocFFT</td>
<td align="center">Open DPC++</td>
<td align="center">Open DPC++<br/>AdaptiveCpp</td>
<td align="center">Dynamic, Static</td>
</tr>
<tr>
Expand Down
2 changes: 1 addition & 1 deletion cmake/FindARMPL.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#===============================================================================

include_guard()
set(ARMPL_SEQ armpl_intp64)
set(ARMPL_SEQ armpl_int64)
set(ARMPL_OMP armpl_int64_mp)

include(FindPackageHandleStandardArgs)
Expand Down
18 changes: 12 additions & 6 deletions docs/building_the_project_with_adaptivecpp.rst
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ additional guidance. The target architectures must be specified with
``HIP_TARGETS``. See the `AdaptiveCpp documentation
<https://github.com/AdaptiveCpp/AdaptiveCpp/blob/develop/doc/using-hipsycl.md#adaptivecpp-targets-specification>`_.

If a backend library supports multiple domains (i.e. BLAS, RNG), it may be
If a backend library supports multiple domains (e.g. BLAS, DFT, RNG), it may be
desirable to only enable selected domains. For this, the ``TARGET_DOMAINS``
variable should be set. For further details, see :ref:`build_target_domains`.

Expand All @@ -81,6 +81,9 @@ The most important supported build options are:
* - ENABLE_CUBLAS_BACKEND
- True, False
- False
* - ENABLE_CUFFT_BACKEND
- True, False
- False
* - ENABLE_CURAND_BACKEND
- True, False
- False
Expand All @@ -93,6 +96,9 @@ The most important supported build options are:
* - ENABLE_ROCBLAS_BACKEND
- True, False
- False
* - ENABLE_ROCFFT_BACKEND
- True, False
- False
* - ENABLE_ROCRAND_BACKEND
- True, False
- False
Expand All @@ -106,7 +112,7 @@ The most important supported build options are:
- True, False
- True
* - TARGET_DOMAINS (list)
- blas, rng
- blas, dft, rng
- All supported domains

Some additional build options are given in
Expand All @@ -120,8 +126,8 @@ Backends
Building for CUDA
~~~~~~~~~~~~~~~~~

The CUDA backends can be enabled with ``ENABLE_CUBLAS_BACKEND`` and
``ENABLE_CURAND_BACKEND``.
The CUDA backends can be enabled with ``ENABLE_CUBLAS_BACKEND``,
``ENABLE_CUFFT_BACKEND`` and ``ENABLE_CURAND_BACKEND``.

The target architecture must be set using the ``HIPSYCL_TARGETS`` parameter. For
example, to target a Nvidia A100 (Ampere architecture), set
Expand All @@ -140,8 +146,8 @@ the CUDA libraries should be found automatically by CMake.
Building for ROCm
~~~~~~~~~~~~~~~~~

The ROCm backends can be enabled with ``ENABLE_ROCBLAS_BACKEND`` and
``ENABLE_ROCRAND_BACKEND``.
The ROCm backends can be enabled with ``ENABLE_ROCBLAS_BACKEND``,
``ENABLE_ROCFFT_BACKEND`` and ``ENABLE_ROCRAND_BACKEND``.

The target architecture must be set using the ``HIPSYCL_TARGETS`` parameter. See
the `AdaptiveCpp documentation
Expand Down
18 changes: 9 additions & 9 deletions src/dft/backends/cufft/backward.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ ONEMATH_EXPORT void compute_backward(descriptor_type& desc,
auto stream = detail::setup_stream(func_name, ih, plan);

auto inout_native = reinterpret_cast<fwd<descriptor_type>*>(
ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(inout_acc));
ih.get_native_mem<detail::sycl_cuda_backend>(inout_acc));
detail::cufft_execute<detail::Direction::Backward, fwd<descriptor_type>>(
func_name, stream, plan, reinterpret_cast<void*>(inout_native + offsets[0]),
reinterpret_cast<void*>(inout_native + offsets[1]));
Expand Down Expand Up @@ -121,14 +121,14 @@ ONEMATH_EXPORT void compute_backward(descriptor_type& desc,
dft::detail::fft_enqueue_task(cgh, [=](sycl::interop_handle ih) {
auto stream = detail::setup_stream(func_name, ih, plan);

auto in_native = reinterpret_cast<void*>(
reinterpret_cast<bwd<descriptor_type>*>(
ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(in_acc)) +
offsets[0]);
auto out_native = reinterpret_cast<void*>(
reinterpret_cast<fwd<descriptor_type>*>(
ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(out_acc)) +
offsets[1]);
auto in_native =
reinterpret_cast<void*>(reinterpret_cast<bwd<descriptor_type>*>(
ih.get_native_mem<detail::sycl_cuda_backend>(in_acc)) +
offsets[0]);
auto out_native =
reinterpret_cast<void*>(reinterpret_cast<fwd<descriptor_type>*>(
ih.get_native_mem<detail::sycl_cuda_backend>(out_acc)) +
offsets[1]);
detail::cufft_execute<detail::Direction::Backward, fwd<descriptor_type>>(
func_name, stream, plan, in_native, out_native);
});
Expand Down
10 changes: 6 additions & 4 deletions src/dft/backends/cufft/commit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
#include "oneapi/math/dft/detail/cufft/onemath_dft_cufft.hpp"
#include "oneapi/math/dft/types.hpp"

#include "execute_helper.hpp"
#include "../../execute_helper_generic.hpp"
#include "../stride_helper.hpp"

#include <cufft.h>
Expand Down Expand Up @@ -84,7 +86,7 @@ class cufft_commit final : public dft::detail::commit_impl<prec, dom> {
if (fix_context) {
// cufftDestroy changes the context so change it back.
CUdevice interopDevice =
sycl::get_native<sycl::backend::ext_oneapi_cuda>(this->get_queue().get_device());
sycl::get_native<sycl_cuda_backend>(this->get_queue().get_device());
CUcontext interopContext;
if (cuDevicePrimaryCtxRetain(&interopContext, interopDevice) != CUDA_SUCCESS) {
throw math::exception("dft/backends/cufft", __FUNCTION__,
Expand Down Expand Up @@ -353,16 +355,16 @@ class cufft_commit final : public dft::detail::commit_impl<prec, dom> {
.submit([&](sycl::handler& cgh) {
auto workspace_acc =
buffer_workspace.template get_access<sycl::access::mode::read_write>(cgh);
cgh.host_task([=](sycl::interop_handle ih) {
auto stream = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
dft::detail::fft_enqueue_task(cgh, [=](sycl::interop_handle ih) {
auto stream = ih.get_native_queue<sycl_cuda_backend>();
auto result = cufftSetStream(plan, stream);
if (result != CUFFT_SUCCESS) {
throw oneapi::math::exception(
"dft/backends/cufft", "set_workspace",
"cufftSetStream returned " + std::to_string(result));
}
auto workspace_native = reinterpret_cast<scalar_type*>(
ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(workspace_acc));
ih.get_native_mem<sycl_cuda_backend>(workspace_acc));
cufftSetWorkArea(plan, workspace_native);
});
})
Expand Down
8 changes: 7 additions & 1 deletion src/dft/backends/cufft/execute_helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@

namespace oneapi::math::dft::cufft::detail {

#if defined(__ADAPTIVECPP__) || defined(__HIPSYCL__)
constexpr auto sycl_cuda_backend{ sycl::backend::cuda };
#else // DPC++
constexpr auto sycl_cuda_backend{ sycl::backend::ext_oneapi_cuda };
#endif

template <dft::precision prec, dft::domain dom>
inline dft::detail::commit_impl<prec, dom>* checked_get_commit(
dft::detail::descriptor<prec, dom>& desc) {
Expand Down Expand Up @@ -142,7 +148,7 @@ void cufft_execute(const std::string& func, CUstream stream, cufftHandle plan, v
}

inline CUstream setup_stream(const std::string& func, sycl::interop_handle ih, cufftHandle plan) {
auto stream = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
auto stream = ih.get_native_queue<sycl_cuda_backend>();
auto result = cufftSetStream(plan, stream);
if (result != CUFFT_SUCCESS) {
throw oneapi::math::exception("dft/backends/cufft", func,
Expand Down
18 changes: 9 additions & 9 deletions src/dft/backends/cufft/forward.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ ONEMATH_EXPORT void compute_forward(descriptor_type& desc,
auto stream = detail::setup_stream(func_name, ih, plan);

auto inout_native = reinterpret_cast<fwd<descriptor_type>*>(
ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(inout_acc));
ih.get_native_mem<detail::sycl_cuda_backend>(inout_acc));
detail::cufft_execute<detail::Direction::Forward, fwd<descriptor_type>>(
func_name, stream, plan, reinterpret_cast<void*>(inout_native + offsets[0]),
reinterpret_cast<void*>(inout_native + offsets[1]));
Expand Down Expand Up @@ -124,14 +124,14 @@ ONEMATH_EXPORT void compute_forward(descriptor_type& desc,
dft::detail::fft_enqueue_task(cgh, [=](sycl::interop_handle ih) {
auto stream = detail::setup_stream(func_name, ih, plan);

auto in_native = reinterpret_cast<void*>(
reinterpret_cast<fwd<descriptor_type>*>(
ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(in_acc)) +
offsets[0]);
auto out_native = reinterpret_cast<void*>(
reinterpret_cast<bwd<descriptor_type>*>(
ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(out_acc)) +
offsets[1]);
auto in_native =
reinterpret_cast<void*>(reinterpret_cast<fwd<descriptor_type>*>(
ih.get_native_mem<detail::sycl_cuda_backend>(in_acc)) +
offsets[0]);
auto out_native =
reinterpret_cast<void*>(reinterpret_cast<bwd<descriptor_type>*>(
ih.get_native_mem<detail::sycl_cuda_backend>(out_acc)) +
offsets[1]);
detail::cufft_execute<detail::Direction::Forward, fwd<descriptor_type>>(
func_name, stream, plan, in_native, out_native);
});
Expand Down
6 changes: 4 additions & 2 deletions src/dft/backends/rocfft/commit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
#include "oneapi/math/dft/detail/rocfft/onemath_dft_rocfft.hpp"
#include "oneapi/math/dft/types.hpp"

#include "execute_helper.hpp"
#include "../../execute_helper_generic.hpp"
#include "../stride_helper.hpp"

#include "rocfft_handle.hpp"
Expand Down Expand Up @@ -557,9 +559,9 @@ class rocfft_commit final : public dft::detail::commit_impl<prec, dom> {
this->get_queue().submit([&](sycl::handler& cgh) {
auto workspace_acc =
buffer_workspace.template get_access<sycl::access::mode::read_write>(cgh);
cgh.host_task([=](sycl::interop_handle ih) {
dft::detail::fft_enqueue_task(cgh, [=](sycl::interop_handle ih) {
auto workspace_native = reinterpret_cast<scalar_type*>(
ih.get_native_mem<sycl::backend::ext_oneapi_hip>(workspace_acc));
ih.get_native_mem<sycl_hip_backend>(workspace_acc));
set_workspace_impl(handle, workspace_native, workspace_bytes, "set_workspace");
});
});
Expand Down
10 changes: 8 additions & 2 deletions src/dft/backends/rocfft/execute_helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@

namespace oneapi::math::dft::rocfft::detail {

#if defined(__ADAPTIVECPP__) || defined(__HIPSYCL__)
constexpr auto sycl_hip_backend{ sycl::backend::hip };
#else // DPC++
constexpr auto sycl_hip_backend{ sycl::backend::ext_oneapi_hip };
#endif

template <dft::precision prec, dft::domain dom>
inline dft::detail::commit_impl<prec, dom>* checked_get_commit(
dft::detail::descriptor<prec, dom>& desc) {
Expand All @@ -60,12 +66,12 @@ inline auto expect_config(DescT& desc, const char* message) {

template <typename Acc>
inline void* native_mem(sycl::interop_handle& ih, Acc& buf) {
return ih.get_native_mem<sycl::backend::ext_oneapi_hip>(buf);
return ih.get_native_mem<sycl_hip_backend>(buf);
}

inline hipStream_t setup_stream(const std::string& func, sycl::interop_handle& ih,
rocfft_execution_info info) {
auto stream = ih.get_native_queue<sycl::backend::ext_oneapi_hip>();
auto stream = ih.get_native_queue<sycl_hip_backend>();
auto result = rocfft_execution_info_set_stream(info, stream);
if (result != rocfft_status_success) {
throw oneapi::math::exception(
Expand Down
6 changes: 5 additions & 1 deletion src/dft/execute_helper_generic.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,11 @@ namespace oneapi::math::dft::detail {
*/
template <typename HandlerT, typename FnT>
static inline void fft_enqueue_task(HandlerT&& cgh, FnT&& f) {
#ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND
#if defined(__ADAPTIVECPP__)
cgh.AdaptiveCpp_enqueue_custom_operation([=](sycl::interop_handle ih) {
#elif defined(__HIPSYCL__)
cgh.hipSYCL_enqueue_custom_operation([=](sycl::interop_handle ih) {
#elif defined(SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND)
cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) {
#else
cgh.host_task([=](sycl::interop_handle ih) {
Expand Down
Loading
Loading