diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..05c1669 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1 @@ +* @oleksandr-pavlyk @xaleryb @ekomarova diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..5ace460 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,6 @@ +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml new file mode 100644 index 0000000..0c5bd0b --- /dev/null +++ b/.github/workflows/conda-package.yml @@ -0,0 +1,290 @@ +name: Conda package + +on: push + +env: + PACKAGE_NAME: mkl_umath + MODULE_NAME: mkl_umath + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python: ['3.10', '3.11', '3.12'] + steps: + - uses: actions/checkout@v4.1.7 + with: + fetch-depth: 0 + + - name: Set pkgs_dirs + run: | + echo "pkgs_dirs: [~/.conda/pkgs]" >> ~/.condarc + + - name: Cache conda packages + uses: actions/cache@v4 + env: + CACHE_NUMBER: 0 # Increase to reset cache + with: + path: ~/.conda/pkgs + key: + ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-python-${{ matrix.python }}-${{hashFiles('**/meta.yaml') }} + restore-keys: | + ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-python-${{ matrix.python }}- + ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}- + + - name: Add conda to system path + run: echo $CONDA/bin >> $GITHUB_PATH + + - name: Install conda-build + run: conda install conda-build + + - name: Build conda package + run: | + CHANNELS="-c conda-forge -c https://software.repos.intel.com/python/conda --override-channels" + VERSIONS="--python ${{ matrix.python }}" + TEST="--no-test" + echo "CONDA_BLD=${CONDA}/conda-bld/linux-64" >> $GITHUB_ENV + + conda build \ + $TEST \ + $VERSIONS \ + $CHANNELS \ + conda-recipe-cf + + - name: Upload artifact + uses: actions/upload-artifact@v4.4.0 + with: + name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }} + path: ${{ env.CONDA_BLD }}/${{ env.PACKAGE_NAME }}-*.tar.bz2 + + test: + needs: build + runs-on: ${{ matrix.runner }} + + strategy: + matrix: + python: ['3.10', '3.11', '3.12'] + experimental: [false] + runner: [ubuntu-latest] + continue-on-error: ${{ matrix.experimental }} + env: + CHANNELS: -c conda-forge -c https://software.repos.intel.com/python/conda --override-channels + + steps: + - name: Download artifact + uses: actions/download-artifact@v4 + with: + name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }} + - name: Add conda to system path + run: echo $CONDA/bin >> $GITHUB_PATH + - name: Install conda-build + run: conda install conda-build + - name: Create conda channel + run: | + mkdir -p $GITHUB_WORKSPACE/channel/linux-64 + mv ${PACKAGE_NAME}-*.tar.bz2 $GITHUB_WORKSPACE/channel/linux-64 + conda index $GITHUB_WORKSPACE/channel + # Test channel + conda search $PACKAGE_NAME -c $GITHUB_WORKSPACE/channel --override-channels + + - name: Collect dependencies + run: | + CHANNELS="-c $GITHUB_WORKSPACE/channel ${{ env.CHANNELS }}" + conda create -n test_mkl_umath $PACKAGE_NAME python=${{ matrix.python }} $CHANNELS --only-deps --dry-run > lockfile + - name: Display lockfile + run: cat lockfile + + - name: Set pkgs_dirs + run: | + echo "pkgs_dirs: [~/.conda/pkgs]" >> ~/.condarc + + - name: Cache conda packages + uses: actions/cache@v4 + env: + CACHE_NUMBER: 0 # Increase to reset cache + with: + path: ~/.conda/pkgs + key: + ${{ runner.os }}-conda-${{ 
env.CACHE_NUMBER }}-python-${{ matrix.python }}-${{hashFiles('lockfile') }} + restore-keys: | + ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-python-${{ matrix.python }}- + ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}- + + - name: Install mkl_umath + run: | + CHANNELS="-c $GITHUB_WORKSPACE/channel ${{ env.CHANNELS }}" + conda create -n test_mkl_umath python=${{ matrix.python }} $PACKAGE_NAME pytest $CHANNELS + # Test installed packages + conda list -n test_mkl_umath + + - name: Run tests + run: | + source $CONDA/etc/profile.d/conda.sh + conda activate test_mkl_umath + python -c "import mkl_umath, numpy as np; mkl_umath.use_in_numpy(); np.sin(np.linspace(0, 1, num=10**6));" + + build_windows: + runs-on: windows-2019 + + strategy: + matrix: + python: ['3.10', '3.11', '3.12'] + env: + conda-bld: C:\Miniconda\conda-bld\win-64\ + steps: + - uses: actions/checkout@v4.1.7 + with: + fetch-depth: 0 + + - uses: conda-incubator/setup-miniconda@v3 + with: + miniforge-variant: Miniforge3 + miniforge-version: latest + activate-environment: build + channels: conda-forge + python-version: ${{ matrix.python }} + + - name: Cache conda packages + uses: actions/cache@v4 + env: + CACHE_NUMBER: 3 # Increase to reset cache + with: + path: /home/runner/conda_pkgs_dir + key: + ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-python-${{ matrix.python }}-${{hashFiles('**/meta.yaml') }} + restore-keys: | + ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-python-${{ matrix.python }}- + ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}- + + - name: Store conda paths as envs + shell: bash -l {0} + run: | + echo "CONDA_BLD=$CONDA/conda-bld/win-64/" | tr "\\\\" '/' >> $GITHUB_ENV + + - name: Install conda build + run: | + conda activate + conda install -y conda-build + conda list -n base + + - name: Build conda package + run: | + conda activate + conda build --no-test --python ${{ matrix.python }} -c conda-forge -c https://software.repos.intel.com/python/conda --override-channels conda-recipe-cf + + - name: Upload artifact + uses: actions/upload-artifact@v4.4.0 + with: + name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }} + path: ${{ env.CONDA_BLD }}${{ env.PACKAGE_NAME }}-*.tar.bz2 + + test_windows: + needs: build_windows + runs-on: ${{ matrix.runner }} + defaults: + run: + shell: cmd /C CALL {0} + strategy: + matrix: + python: ['3.10', '3.11', '3.12'] + experimental: [false] + runner: [windows-2019] + continue-on-error: ${{ matrix.experimental }} + env: + workdir: '${{ github.workspace }}' + CHANNELS: -c conda-forge -c https://software.repos.intel.com/python/conda --override-channels + + steps: + - name: Download artifact + uses: actions/download-artifact@v4 + with: + name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }} + + - uses: conda-incubator/setup-miniconda@v3 + with: + auto-update-conda: true + conda-build-version: '*' + miniforge-variant: Miniforge3 + miniforge-version: latest + activate-environment: mkl_umath_test + channels: conda-forge + python-version: ${{ matrix.python }} + + - name: Create conda channel with the artifact bit + shell: cmd /C CALL {0} + run: | + echo ${{ env.workdir }} + mkdir ${{ env.workdir }}\channel\win-64 + move ${{ env.PACKAGE_NAME }}-*.tar.bz2 ${{ env.workdir }}\channel\win-64 + dir ${{ env.workdir }}\channel\win-64 + + - name: Index the channel + shell: cmd /C CALL {0} + run: | + conda index ${{ env.workdir }}\channel + + - name: Dump mkl_umath version info from created channel to STDOUT + shell: cmd /C CALL {0} + run: | + conda search ${{ 
env.PACKAGE_NAME }} -c ${{ env.workdir }}/channel --override-channels --info --json + - name: Dump mkl_umath version info from created channel into ver.json + shell: cmd /C CALL {0} + run: | + conda search ${{ env.PACKAGE_NAME }} -c ${{ env.workdir }}/channel --override-channels --info --json > ${{ env.workdir }}\ver.json + - name: Output content of workdir + shell: pwsh + run: Get-ChildItem -Path ${{ env.workdir }} + - name: Output content of produced ver.json + shell: pwsh + run: Get-Content -Path ${{ env.workdir }}\ver.json + - name: Collect dependencies + shell: cmd /C CALL {0} + run: | + IF NOT EXIST ver.json ( + copy /Y ${{ env.workdir }}\ver.json . + ) + SET "SCRIPT=%VER_SCRIPT1% %VER_SCRIPT2%" + FOR /F "tokens=* USEBACKQ" %%F IN (`python -c "%SCRIPT%"`) DO ( + SET PACKAGE_VERSION=%%F + ) + conda install -n mkl_umath_test ${{ env.PACKAGE_NAME }}=%PACKAGE_VERSION% python=${{ matrix.python }} -c ${{ env.workdir }}/channel ${{ env.CHANNELS }} --only-deps --dry-run > lockfile + - name: Display lockfile content + shell: pwsh + run: Get-Content -Path .\lockfile + - name: Cache conda packages + uses: actions/cache@v4 + env: + CACHE_NUMBER: 0 # Increase to reset cache + with: + path: /home/runner/conda_pkgs_dir + key: + ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-python-${{ matrix.python }}-${{hashFiles('lockfile') }} + restore-keys: | + ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-python-${{ matrix.python }}- + ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}- + - name: Install mkl_umath + shell: cmd /C CALL {0} + run: | + @ECHO ON + IF NOT EXIST ver.json ( + copy /Y ${{ env.workdir }}\ver.json . + ) + set "SCRIPT=%VER_SCRIPT1% %VER_SCRIPT2%" + FOR /F "tokens=* USEBACKQ" %%F IN (`python -c "%SCRIPT%"`) DO ( + SET PACKAGE_VERSION=%%F + ) + SET "TEST_DEPENDENCIES=pytest pytest-cov" + conda install -n mkl_umath_test ${{ env.PACKAGE_NAME }}=%PACKAGE_VERSION% %TEST_DEPENDENCIES% python=${{ matrix.python }} -c ${{ env.workdir }}/channel ${{ env.CHANNELS }} + - name: Report content of test environment + shell: cmd /C CALL {0} + run: | + conda activate + echo "Value of CONDA enviroment variable was: " %CONDA% + echo "Value of CONDA_PREFIX enviroment variable was: " %CONDA_PREFIX% + conda info && conda list -n mkl_umath_test + - name: Run tests + shell: cmd /C CALL {0} + run: >- + conda activate mkl_umath_test && python -c "import mkl_umath, numpy as np; mkl_umath.use_in_numpy(); np.sin(np.linspace(0, 1, num=10**6));" + diff --git a/.github/workflows/openssf-scorecard.yml b/.github/workflows/openssf-scorecard.yml new file mode 100644 index 0000000..586f7bc --- /dev/null +++ b/.github/workflows/openssf-scorecard.yml @@ -0,0 +1,74 @@ +# This workflow uses actions that are not certified by GitHub. They are provided +# by a third-party and are governed by separate terms of service, privacy +# policy, and support documentation. + +name: Scorecard supply-chain security +on: + # For Branch-Protection check. Only the default branch is supported. See + # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection + branch_protection_rule: + # To guarantee Maintained check is occasionally updated. See + # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained + schedule: + - cron: '28 2 * * 1' + - cron: '28 2 * * 4' + push: + branches: [ "master" ] + +# Declare default permissions as read only. 
+permissions: read-all + +jobs: + analysis: + name: Scorecard analysis + runs-on: ubuntu-latest + timeout-minutes: 30 + permissions: + # Needed to upload the results to code-scanning dashboard. + security-events: write + # Needed to publish results and get a badge (see publish_results below). + id-token: write + # Uncomment the permissions below if installing in a private repository. + # contents: read + # actions: read + + steps: + - name: "Checkout code" + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + with: + persist-credentials: false + + - name: "Run analysis" + uses: ossf/scorecard-action@62b2cac7ed8198b15735ed49ab1e5cf35480ba46 # v2.4.0 + with: + results_file: results.sarif + results_format: sarif + # (Optional) "write" PAT token. Uncomment the `repo_token` line below if: + # - you want to enable the Branch-Protection check on a *public* repository, or + # - you are installing Scorecard on a *private* repository + # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action#authentication-with-pat. + # repo_token: ${{ secrets.SCORECARD_TOKEN }} + + # Public repositories: + # - Publish results to OpenSSF REST API for easy access by consumers + # - Allows the repository to include the Scorecard badge. + # - See https://github.com/ossf/scorecard-action#publishing-results. + # For private repositories: + # - `publish_results` will always be set to `false`, regardless + # of the value entered here. + publish_results: true + + # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF + # format to the repository Actions tab. + - name: "Upload artifact" + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + with: + name: SARIF file + path: results.sarif + retention-days: 14 + + # Upload the results to GitHub's code scanning dashboard. + - name: "Upload to code-scanning" + uses: github/codeql-action/upload-sarif@294a9d92911152fe08befb9ec03e240add280cb3 # v3.26.8 + with: + sarif_file: results.sarif diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7cc71d7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,96 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions and binary files +*.o +*.so +*.so.* +*.exe +*.lib +*.dll + +# CMake build and local install directory +build +_skbuild +build_cmake +install + +# Code project files +.vscode + +# Eclipse project files +.project +.pydevproject + +# Emacs temp files +*~ + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg +dpctl_conda_pkg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*,cover +.hypothesis/ +junit.xml + +# Translations +*.mo +*.pot + +# Django stuff: +*.log + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# pyenv python configuration file +.python-version + +_cmake_test_compile + +# generated numpy files +mkl_umath/src/__umath_generated.c +mkl_umath/src/mkl_umath_loops.c +mkl_umath/src/mkl_umath_loops.h +mkl_umath/src/_patch.c + +# moved cmake scripts +dpctl/resources/cmake diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..a4e3533 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,145 @@ +cmake_minimum_required(VERSION 3.27...3.28 FATAL_ERROR) + +cmake_policy(SET CMP0135 NEW) + +project(mkl_umath + LANGUAGES C + DESCRIPTION "mkl_umath module" +) + +option(OPTIMIZATION_REPORT + "Whether to generate optimization vectorization report" + OFF +) + +find_package(Python COMPONENTS Interpreter Development NumPy REQUIRED) + +# Print out the discovered paths +include(CMakePrintHelpers) +cmake_print_variables(Python_INCLUDE_DIRS) +cmake_print_variables(Python_LIBRARIES) +cmake_print_variables(Python_NumPy_INCLUDE_DIRS) + +set(CYTHON_FLAGS "-t -w \"${CMAKE_SOURCE_DIR}\"") +find_package(Cython REQUIRED) + +set(MKL_LINK sdl) +find_package(MKL REQUIRED) + +if(WIN32) + string(CONCAT WARNING_FLAGS + "-Wall " + "-Wextra " + "-Winit-self " + "-Wunused-function " + "-Wuninitialized " + "-Wmissing-declarations " + "-Wstrict-prototypes " + "-Wno-unused-parameter " + "-Wno-implicit-function-declaration " + ) + string(CONCAT SDL_FLAGS + "/GS " + "/DynamicBase " + ) + string(CONCAT PRECISION_FLAGS + "/fp:fast=2 " + "/Qimf-precision=high " + "/Qprec-sqrt " + "/Qprotect-parens " + ) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Ox ${WARNING_FLAGS} ${SDL_FLAGS} ${PRECISION_FLAGS}") + set(CMAKE_C_FLAGS_DEBUG + "${CMAKE_C_FLAGS_DEBUG} ${WARNING_FLAGS} ${SDL_FLAGS} -O0 -g1 -DDEBUG" + ) + set(MKL_UMATH_LINKER_OPTIONS "LINKER:/NXCompat;LINKER:/DynamicBase") +elseif(UNIX) + string(CONCAT WARNING_FLAGS + "-Wall " + "-Wextra " + "-Winit-self " + "-Wunused-function " + "-Wuninitialized " + "-Wmissing-declarations " + "-Wstrict-prototypes " + "-Wno-unused-parameter " + "-fdiagnostics-color=auto " + ) + string(CONCAT SDL_FLAGS + "-fstack-protector " + "-fstack-protector-all " + "-fpic " + "-fPIC " + "-D_FORTIFY_SOURCE=2 " + "-Wformat " + "-Wformat-security " +# "-fno-strict-overflow " # no-strict-overflow is implied by -fwrapv + "-fno-delete-null-pointer-checks " + "-fwrapv " + ) + string(CONCAT CFLAGS + "${WARNING_FLAGS}" + "${SDL_FLAGS}" + ) + string(CONCAT PRECISION_FLAGS + "-prec-sqrt " + "-fprotect-parens " + "-fimf-precision=high " + "-fp-model fast=2 " + ) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 ${CFLAGS} ${PRECISION_FLAGS}") + set(CMAKE_C_FLAGS_DEBUG + "${CMAKE_C_FLAGS_DEBUG} ${CFLAGS} -O0 -g1 -DDEBUG" + ) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-incompatible-function-pointer-types ${CFLAGS}") + set(MKL_UMATH_LINKER_OPTIONS "LINKER:-z,noexecstack,-z,relro,-z,now") +else() + message(FATAL_ERROR "Unsupported system.") +endif() + +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE BOTH) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH) +# set_property(GLOBAL PROPERTY GLOBAL_DEPENDS_DEBUG_MODE 1) +set(_linker_options ${MKL_UMATH_LINKER_OPTIONS}) + +set(_trgt mkl_umath_loops) +add_library(${_trgt} SHARED mkl_umath/src/mkl_umath_loops.c) +set_target_properties(${_trgt} 
PROPERTIES + CMAKE_POSITION_INDEPENDENT_CODE ON + C_STANDARD 99 +) +target_include_directories(${_trgt} PUBLIC mkl_umath/src/ ${Python_NumPy_INCLUDE_DIRS} ${Python_INCLUDE_DIRS}) +target_link_libraries(${_trgt} PUBLIC MKL::MKL ${Python_LIBRARIES}) +target_link_options(${_trgt} PUBLIC ${_linker_options}) +target_compile_options(${_trgt} PUBLIC -fveclib=SVML) +target_compile_options(${_trgt} PUBLIC -fvectorize) +if(OPTIMIZATION_REPORT) + target_compile_options(${_trgt} PRIVATE -qopt-report=3) +endif() +install(TARGETS ${_trgt} + LIBRARY DESTINATION mkl_umath + ARCHIVE DESTINATION mkl_umath + RUNTIME DESTINATION mkl_umath +) + +python_add_library(_ufuncs MODULE WITH_SOABI "mkl_umath/src/ufuncsmodule.c" "mkl_umath/src/__umath_generated.c") +target_include_directories(_ufuncs PRIVATE "mkl_umath/src" ${Python_NumPy_INCLUDE_DIRS} ${MKL_INCLUDE_DIR}) +target_compile_definitions(_ufuncs PUBLIC NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION) +target_link_options(_ufuncs PRIVATE ${_linker_options}) +target_link_libraries(_ufuncs PRIVATE mkl_umath_loops) +set_target_properties(_ufuncs PROPERTIES C_STANDARD 99) +if (UNIX) + set_target_properties(_ufuncs PROPERTIES INSTALL_RPATH "$ORIGIN") +endif() +install(TARGETS _ufuncs LIBRARY DESTINATION mkl_umath) + +add_cython_target(_patch "mkl_umath/src/_patch.pyx" C OUTPUT_VAR _generated_src) +Python_add_library(_patch MODULE WITH_SOABI ${_generated_src}) +target_include_directories(_patch PRIVATE "mkl_umath/src/" ${Python_NumPy_INCLUDE_DIRS} ${Python_INCLUDE_DIRS}) +target_compile_definitions(_patch PUBLIC NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION) +target_link_libraries(_patch PRIVATE mkl_umath_loops) +set_target_properties(_patch PROPERTIES C_STANDARD 99) +if (UNIX) + set_target_properties(_patch PROPERTIES INSTALL_RPATH "$ORIGIN") +endif() +install(TARGETS _patch LIBRARY DESTINATION mkl_umath) diff --git a/README.md b/README.md index a9f571c..0e2bd0b 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Patches were factored out per community feedback ([NEP-36](https://numpy.org/nep as a stand-alone package. It can be installed into conda environment using ``` - conda install -c intel mkl_umath + conda install -c https://software.repos.intel.com/python/conda mkl_umath ``` --- @@ -18,17 +18,9 @@ as a stand-alone package. It can be installed into conda environment using To install mkl_umath Pypi package please use following command: ``` - python -m pip install --i https://pypi.anaconda.org/intel/simple -extra-index-url https://pypi.org/simple mkl_umath + python -m pip install mkl_umath ``` -If command above installs NumPy package from the Pypi, please use following command to install Intel optimized NumPy wheel package from Anaconda Cloud: - -``` - python -m pip install --i https://pypi.anaconda.org/intel/simple -extra-index-url https://pypi.org/simple mkl_umath numpy== -``` - -Where `` should be the latest version from https://anaconda.org/intel/numpy - --- ## Building @@ -36,7 +28,7 @@ Where `` should be the latest version from https://anaconda.org/i Intel(R) C compiler and Intel(R) Math Kernel Library are required to build `mkl_umath` from source: ```sh -# ensure that MKL is installed, icc is activated +# ensure that MKL is installed into Python prefix, Intel LLVM compiler is activated export MKLROOT=$CONDA_PREFIX -python setup.py config_cc --compiler=intelem build_ext --inplace +CC=icx pip install --no-build-isolation --no-deps -e . 
``` diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..556938b --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,12 @@ +# Security Policy + +## Report a Vulnerability + +Please report security issues or vulnerabilities to the [Intel® Security Center]. + +For more information on how Intel® works to resolve security issues, see +[Vulnerability Handling Guidelines]. + +[Intel® Security Center]:https://www.intel.com/content/www/us/en/security-center/default.html + +[Vulnerability Handling Guidelines]:https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html diff --git a/_vendored/README.md b/_vendored/README.md new file mode 100644 index 0000000..0ebafcb --- /dev/null +++ b/_vendored/README.md @@ -0,0 +1,5 @@ +## Vendored files + +File `conv_template.py` is copied from NumPy's numpy/distutils folder, since +`numpy.distutils` is absent from the installation layout starting with +Python 3.12 \ No newline at end of file diff --git a/_vendored/__init__.py b/_vendored/__init__.py new file mode 100644 index 0000000..fa81ada --- /dev/null +++ b/_vendored/__init__.py @@ -0,0 +1 @@ +# empty file diff --git a/_vendored/conv_template.py b/_vendored/conv_template.py new file mode 100644 index 0000000..c8933d1 --- /dev/null +++ b/_vendored/conv_template.py @@ -0,0 +1,329 @@ +#!/usr/bin/env python3 +""" +takes templated file .xxx.src and produces .xxx file where .xxx is +.i or .c or .h, using the following template rules + +/**begin repeat -- on a line by itself marks the start of a repeated code + segment +/**end repeat**/ -- on a line by itself marks it's end + +After the /**begin repeat and before the */, all the named templates are placed +these should all have the same number of replacements + +Repeat blocks can be nested, with each nested block labeled with its depth, +i.e. +/**begin repeat1 + *.... + */ +/**end repeat1**/ + +When using nested loops, you can optionally exclude particular +combinations of the variables using (inside the comment portion of the inner loop): + + :exclude: var1=value1, var2=value2, ... + +This will exclude the pattern where var1 is value1 and var2 is value2 when +the result is being generated. + + +In the main body each replace will use one entry from the list of named replacements + + Note that all #..# forms in a block must have the same number of + comma-separated entries. + +Example: + + An input file containing + + /**begin repeat + * #a = 1,2,3# + * #b = 1,2,3# + */ + + /**begin repeat1 + * #c = ted, jim# + */ + @a@, @b@, @c@ + /**end repeat1**/ + + /**end repeat**/ + + produces + + line 1 "template.c.src" + + /* + ********************************************************************* + ** This file was autogenerated from a template DO NOT EDIT!!** + ** Changes should be made to the original source (.src) file ** + ********************************************************************* + */ + + #line 9 + 1, 1, ted + + #line 9 + 1, 1, jim + + #line 9 + 2, 2, ted + + #line 9 + 2, 2, jim + + #line 9 + 3, 3, ted + + #line 9 + 3, 3, jim + +""" + +__all__ = ['process_str', 'process_file'] + +import os +import sys +import re + +# names for replacement that are already global. +global_names = {} + +# header placed at the front of head processed file +header =\ +""" +/* + ***************************************************************************** + ** This file was autogenerated from a template DO NOT EDIT!!!! 
** + ** Changes should be made to the original source (.src) file ** + ***************************************************************************** + */ + +""" +# Parse string for repeat loops +def parse_structure(astr, level): + """ + The returned line number is from the beginning of the string, starting + at zero. Returns an empty list if no loops found. + + """ + if level == 0 : + loopbeg = "/**begin repeat" + loopend = "/**end repeat**/" + else : + loopbeg = "/**begin repeat%d" % level + loopend = "/**end repeat%d**/" % level + + ind = 0 + line = 0 + spanlist = [] + while True: + start = astr.find(loopbeg, ind) + if start == -1: + break + start2 = astr.find("*/", start) + start2 = astr.find("\n", start2) + fini1 = astr.find(loopend, start2) + fini2 = astr.find("\n", fini1) + line += astr.count("\n", ind, start2+1) + spanlist.append((start, start2+1, fini1, fini2+1, line)) + line += astr.count("\n", start2+1, fini2) + ind = fini2 + spanlist.sort() + return spanlist + + +def paren_repl(obj): + torep = obj.group(1) + numrep = obj.group(2) + return ','.join([torep]*int(numrep)) + +parenrep = re.compile(r"\(([^)]*)\)\*(\d+)") +plainrep = re.compile(r"([^*]+)\*(\d+)") +def parse_values(astr): + # replaces all occurrences of '(a,b,c)*4' in astr + # with 'a,b,c,a,b,c,a,b,c,a,b,c'. Empty braces generate + # empty values, i.e., ()*4 yields ',,,'. The result is + # split at ',' and a list of values returned. + astr = parenrep.sub(paren_repl, astr) + # replaces occurrences of xxx*3 with xxx, xxx, xxx + astr = ','.join([plainrep.sub(paren_repl, x.strip()) + for x in astr.split(',')]) + return astr.split(',') + + +stripast = re.compile(r"\n\s*\*?") +named_re = re.compile(r"#\s*(\w*)\s*=([^#]*)#") +exclude_vars_re = re.compile(r"(\w*)=(\w*)") +exclude_re = re.compile(":exclude:") +def parse_loop_header(loophead) : + """Find all named replacements in the header + + Returns a list of dictionaries, one for each loop iteration, + where each key is a name to be substituted and the corresponding + value is the replacement string. + + Also return a list of exclusions. The exclusions are dictionaries + of key value pairs. There can be more than one exclusion. + [{'var1':'value1', 'var2', 'value2'[,...]}, ...] + + """ + # Strip out '\n' and leading '*', if any, in continuation lines. + # This should not effect code previous to this change as + # continuation lines were not allowed. 
+ loophead = stripast.sub("", loophead) + # parse out the names and lists of values + names = [] + reps = named_re.findall(loophead) + nsub = None + for rep in reps: + name = rep[0] + vals = parse_values(rep[1]) + size = len(vals) + if nsub is None : + nsub = size + elif nsub != size : + msg = "Mismatch in number of values, %d != %d\n%s = %s" + raise ValueError(msg % (nsub, size, name, vals)) + names.append((name, vals)) + + + # Find any exclude variables + excludes = [] + + for obj in exclude_re.finditer(loophead): + span = obj.span() + # find next newline + endline = loophead.find('\n', span[1]) + substr = loophead[span[1]:endline] + ex_names = exclude_vars_re.findall(substr) + excludes.append(dict(ex_names)) + + # generate list of dictionaries, one for each template iteration + dlist = [] + if nsub is None : + raise ValueError("No substitution variables found") + for i in range(nsub): + tmp = {name: vals[i] for name, vals in names} + dlist.append(tmp) + return dlist + +replace_re = re.compile(r"@(\w+)@") +def parse_string(astr, env, level, line) : + lineno = "#line %d\n" % line + + # local function for string replacement, uses env + def replace(match): + name = match.group(1) + try : + val = env[name] + except KeyError: + msg = 'line %d: no definition of key "%s"'%(line, name) + raise ValueError(msg) from None + return val + + code = [lineno] + struct = parse_structure(astr, level) + if struct : + # recurse over inner loops + oldend = 0 + newlevel = level + 1 + for sub in struct: + pref = astr[oldend:sub[0]] + head = astr[sub[0]:sub[1]] + text = astr[sub[1]:sub[2]] + oldend = sub[3] + newline = line + sub[4] + code.append(replace_re.sub(replace, pref)) + try : + envlist = parse_loop_header(head) + except ValueError as e: + msg = "line %d: %s" % (newline, e) + raise ValueError(msg) + for newenv in envlist : + newenv.update(env) + newcode = parse_string(text, newenv, newlevel, newline) + code.extend(newcode) + suff = astr[oldend:] + code.append(replace_re.sub(replace, suff)) + else : + # replace keys + code.append(replace_re.sub(replace, astr)) + code.append('\n') + return ''.join(code) + +def process_str(astr): + code = [header] + code.extend(parse_string(astr, global_names, 0, 1)) + return ''.join(code) + + +include_src_re = re.compile(r"(\n|\A)#include\s*['\"]" + r"(?P[\w\d./\\]+[.]src)['\"]", re.I) + +def resolve_includes(source): + d = os.path.dirname(source) + with open(source) as fid: + lines = [] + for line in fid: + m = include_src_re.match(line) + if m: + fn = m.group('name') + if not os.path.isabs(fn): + fn = os.path.join(d, fn) + if os.path.isfile(fn): + lines.extend(resolve_includes(fn)) + else: + lines.append(line) + else: + lines.append(line) + return lines + +def process_file(source): + lines = resolve_includes(source) + sourcefile = os.path.normcase(source).replace("\\", "\\\\") + try: + code = process_str(''.join(lines)) + except ValueError as e: + raise ValueError('In "%s" loop at %s' % (sourcefile, e)) from None + return '#line 1 "%s"\n%s' % (sourcefile, code) + + +def unique_key(adict): + # this obtains a unique key given a dictionary + # currently it works by appending together n of the letters of the + # current keys and increasing n until a unique key is found + # -- not particularly quick + allkeys = list(adict.keys()) + done = False + n = 1 + while not done: + newkey = "".join([x[:n] for x in allkeys]) + if newkey in allkeys: + n += 1 + else: + done = True + return newkey + + +def main(): + try: + file = sys.argv[1] + except IndexError: + fid = sys.stdin + 
outfile = sys.stdout + else: + fid = open(file, 'r') + (base, ext) = os.path.splitext(file) + newname = base + outfile = open(newname, 'w') + + allstr = fid.read() + try: + writestr = process_str(allstr) + except ValueError as e: + raise ValueError("In %s loop at %s" % (file, e)) from None + + outfile.write(writestr) + +if __name__ == "__main__": + main() diff --git a/conda-recipe-cf/bld.bat b/conda-recipe-cf/bld.bat new file mode 100644 index 0000000..e27318d --- /dev/null +++ b/conda-recipe-cf/bld.bat @@ -0,0 +1,25 @@ +REM A workaround for activate-dpcpp.bat issue to be addressed in 2021.4 +set "LIB=%BUILD_PREFIX%\Library\lib;%BUILD_PREFIX%\compiler\lib;%LIB%" +set "INCLUDE=%BUILD_PREFIX%\include;%INCLUDE%" + +"%PYTHON%" setup.py clean --all +set "SKBUILD_ARGS=-G Ninja -- -DCMAKE_C_COMPILER:PATH=icx -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON" + +FOR %%V IN (14.0.0 14 15.0.0 15 16.0.0 16 17.0.0 17) DO @( + REM set DIR_HINT if directory exists + IF EXIST "%BUILD_PREFIX%\Library\lib\clang\%%V\" ( + SET "SYCL_INCLUDE_DIR_HINT=%BUILD_PREFIX%\Library\lib\clang\%%V" + ) +) + +if NOT "%WHEELS_OUTPUT_FOLDER%"=="" ( + rem Install and assemble wheel package from the build bits + "%PYTHON%" setup.py install bdist_wheel %SKBUILD_ARGS% + if errorlevel 1 exit 1 + copy dist\mkl_umath*.whl %WHEELS_OUTPUT_FOLDER% + if errorlevel 1 exit 1 +) ELSE ( + rem Only install + "%PYTHON%" setup.py install %SKBUILD_ARGS% + if errorlevel 1 exit 1 +) diff --git a/conda-recipe-cf/build.sh b/conda-recipe-cf/build.sh new file mode 100644 index 0000000..2792f27 --- /dev/null +++ b/conda-recipe-cf/build.sh @@ -0,0 +1,19 @@ +# This is necessary to help DPC++ find Intel libraries such as SVML, IRNG, etc in build prefix +export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:${BUILD_PREFIX}/lib" + +# Intel LLVM must cooperate with compiler and sysroot from conda +echo "--gcc-toolchain=${BUILD_PREFIX} --sysroot=${BUILD_PREFIX}/${HOST}/sysroot -target ${HOST}" > icx_for_conda.cfg +export ICXCFG="$(pwd)/icx_for_conda.cfg" + +export CMAKE_GENERATOR="Ninja" +SKBUILD_ARGS="-- -DCMAKE_C_COMPILER:PATH=icx -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON" + +if [ -n "${WHEELS_OUTPUT_FOLDER}" ]; then + # Install packages and assemble wheel package from built bits + WHEELS_BUILD_ARGS="-p manylinux_${GLIBC_MAJOR}_${GLIBC_MINOR}_x86_64" + ${PYTHON} setup.py install bdist_wheel ${WHEELS_BUILD_ARGS} ${SKBUILD_ARGS} + cp dist/mkl_umath*.whl ${WHEELS_OUTPUT_FOLDER} +else + # Perform regular install + ${PYTHON} setup.py install ${SKBUILD_ARGS} +fi diff --git a/conda-recipe-cf/meta.yaml b/conda-recipe-cf/meta.yaml new file mode 100644 index 0000000..4ecf657 --- /dev/null +++ b/conda-recipe-cf/meta.yaml @@ -0,0 +1,54 @@ +{% set version = "0.1.2" %} +{% set buildnumber = 0 %} + +package: + name: mkl_umath + version: {{ version }} + +source: + path: ../ + +build: + number: {{ buildnumber }} + ignore_run_exports: + - blas + +requirements: + build: + - {{ compiler('c') }} + - {{ compiler('cxx') }} + - {{ compiler('dpcpp') }} >=2024.2 # [not osx] + - sysroot_linux-64 >=2.28 # [linux] + host: + - setuptools + - cmake + - ninja + - git + - cython + - scikit-build + - python + - mkl-devel + - numpy + run: + - python + - mkl + - mkl-service + - {{ pin_compatible('intel-cmplr-lib-rt') }} + +test: + requires: + - pytest + source_files: + - mkl_umath/tests/test_basic.py + commands: + - pytest mkl_umath/tests/test_basic.py + imports: + - mkl_umath + - mkl_umath._ufuncs + - mkl_umath._patch + +about: + home: http://github.com/IntelPython/mkl_umath + license: BSD-3 + license_file: LICENSE.txt + 
summary: Universal functions for real and complex floating point arrays powered by Intel(R) Math Kernel Library Vector (Intel(R) MKL) and Intel(R) Short Vector Math Library (Intel(R) SVML) diff --git a/conda-recipe-cf/run_tests.bat b/conda-recipe-cf/run_tests.bat new file mode 100644 index 0000000..590db89 --- /dev/null +++ b/conda-recipe-cf/run_tests.bat @@ -0,0 +1 @@ +%PYTHON% tests\test_basic.py \ No newline at end of file diff --git a/conda-recipe-cf/run_tests.sh b/conda-recipe-cf/run_tests.sh new file mode 100644 index 0000000..7bfca5d --- /dev/null +++ b/conda-recipe-cf/run_tests.sh @@ -0,0 +1 @@ +$PYTHON tests/test_basic.py diff --git a/conda-recipe/bld.bat b/conda-recipe/bld.bat new file mode 100644 index 0000000..e27318d --- /dev/null +++ b/conda-recipe/bld.bat @@ -0,0 +1,25 @@ +REM A workaround for activate-dpcpp.bat issue to be addressed in 2021.4 +set "LIB=%BUILD_PREFIX%\Library\lib;%BUILD_PREFIX%\compiler\lib;%LIB%" +set "INCLUDE=%BUILD_PREFIX%\include;%INCLUDE%" + +"%PYTHON%" setup.py clean --all +set "SKBUILD_ARGS=-G Ninja -- -DCMAKE_C_COMPILER:PATH=icx -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON" + +FOR %%V IN (14.0.0 14 15.0.0 15 16.0.0 16 17.0.0 17) DO @( + REM set DIR_HINT if directory exists + IF EXIST "%BUILD_PREFIX%\Library\lib\clang\%%V\" ( + SET "SYCL_INCLUDE_DIR_HINT=%BUILD_PREFIX%\Library\lib\clang\%%V" + ) +) + +if NOT "%WHEELS_OUTPUT_FOLDER%"=="" ( + rem Install and assemble wheel package from the build bits + "%PYTHON%" setup.py install bdist_wheel %SKBUILD_ARGS% + if errorlevel 1 exit 1 + copy dist\mkl_umath*.whl %WHEELS_OUTPUT_FOLDER% + if errorlevel 1 exit 1 +) ELSE ( + rem Only install + "%PYTHON%" setup.py install %SKBUILD_ARGS% + if errorlevel 1 exit 1 +) diff --git a/conda-recipe/build.sh b/conda-recipe/build.sh new file mode 100644 index 0000000..2792f27 --- /dev/null +++ b/conda-recipe/build.sh @@ -0,0 +1,19 @@ +# This is necessary to help DPC++ find Intel libraries such as SVML, IRNG, etc in build prefix +export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:${BUILD_PREFIX}/lib" + +# Intel LLVM must cooperate with compiler and sysroot from conda +echo "--gcc-toolchain=${BUILD_PREFIX} --sysroot=${BUILD_PREFIX}/${HOST}/sysroot -target ${HOST}" > icx_for_conda.cfg +export ICXCFG="$(pwd)/icx_for_conda.cfg" + +export CMAKE_GENERATOR="Ninja" +SKBUILD_ARGS="-- -DCMAKE_C_COMPILER:PATH=icx -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON" + +if [ -n "${WHEELS_OUTPUT_FOLDER}" ]; then + # Install packages and assemble wheel package from built bits + WHEELS_BUILD_ARGS="-p manylinux_${GLIBC_MAJOR}_${GLIBC_MINOR}_x86_64" + ${PYTHON} setup.py install bdist_wheel ${WHEELS_BUILD_ARGS} ${SKBUILD_ARGS} + cp dist/mkl_umath*.whl ${WHEELS_OUTPUT_FOLDER} +else + # Perform regular install + ${PYTHON} setup.py install ${SKBUILD_ARGS} +fi diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml new file mode 100644 index 0000000..dcafd45 --- /dev/null +++ b/conda-recipe/meta.yaml @@ -0,0 +1,55 @@ +{% set version = "0.1.2" %} +{% set buildnumber = 0 %} + +package: + name: mkl_umath + version: {{ version }} + +source: + path: ../ + +build: + number: {{ buildnumber }} + ignore_run_exports: + - blas + +requirements: + build: + - {{ compiler('c') }} + - {{ compiler('cxx') }} + - {{ compiler('dpcpp') }} >=2024.2 # [not osx] + - sysroot_linux-64 >=2.28 # [linux] + host: + - setuptools + - cmake + - ninja + - git + - cython + - scikit-build + - python + - mkl-devel + - numpy-base + run: + - python + - mkl + - mkl-service + - {{ pin_compatible('intel-cmplr-lib-rt') }} + - {{ pin_compatible('numpy') }} + +test: + 
requires: + - pytest + source_files: + - mkl_umath/tests/test_basic.py + commands: + - pytest mkl_umath/tests/test_basic.py + imports: + - mkl_umath + - mkl_umath._ufuncs + - mkl_umath._patch + +about: + home: http://github.com/IntelPython/mkl_umath + license: BSD-3 + license_file: LICENSE.txt + summary: Universal functions for real and complex floating point arrays powered by Intel(R) Math Kernel Library Vector (Intel(R) MKL) and Intel(R) Short Vector Math Library (Intel(R) SVML) diff --git a/conda-recipe/run_tests.bat b/conda-recipe/run_tests.bat new file mode 100644 index 0000000..590db89 --- /dev/null +++ b/conda-recipe/run_tests.bat @@ -0,0 +1 @@ +%PYTHON% tests\test_basic.py \ No newline at end of file diff --git a/conda-recipe/run_tests.sh b/conda-recipe/run_tests.sh new file mode 100644 index 0000000..7bfca5d --- /dev/null +++ b/conda-recipe/run_tests.sh @@ -0,0 +1 @@ +$PYTHON tests/test_basic.py diff --git a/mkl_umath/__init__.py b/mkl_umath/__init__.py index 92960ad..a6e2927 100644 --- a/mkl_umath/__init__.py +++ b/mkl_umath/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, Intel Corporation +# Copyright (c) 2019-2023, Intel Corporation # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: diff --git a/mkl_umath/_version.py b/mkl_umath/_version.py index df9144c..10939f0 100644 --- a/mkl_umath/_version.py +++ b/mkl_umath/_version.py @@ -1 +1 @@ -__version__ = '0.1.1' +__version__ = '0.1.2' diff --git a/mkl_umath/generate_umath.py b/mkl_umath/generate_umath.py index 7ff39b2..e6609ab 100644 --- a/mkl_umath/generate_umath.py +++ b/mkl_umath/generate_umath.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, Intel Corporation +# Copyright (c) 2019-2023, Intel Corporation # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: @@ -343,12 +343,6 @@ def english_upper(s): None, TD(inexactvec + cmplxvec), ), -'floor_divide': - Ufunc(2, 1, None, - docstrings.get('numpy.core.umath.floor_divide'), - None, - TD(inexactvec + cmplxvec), - ), 'true_divide': Ufunc(2, 1, None, docstrings.get('numpy.core.umath.true_divide'), @@ -797,16 +791,16 @@ def make_arrays(funcdict): tname = english_upper(chartoname[t.type]) datalist.append('(void *)NULL') funclist.append( - '%s_%s_%s_%s' % (tname, t.in_, t.out, name)) + 'mkl_umath_%s_%s_%s_%s' % (tname, t.in_, t.out, name)) elif isinstance(t.func_data, FuncNameSuffix): datalist.append('(void *)NULL') tname = english_upper(chartoname[t.type]) funclist.append( - '%s_%s_%s' % (tname, name, t.func_data.suffix)) + 'mkl_umath_%s_%s_%s' % (tname, name, t.func_data.suffix)) elif t.func_data is None: datalist.append('(void *)NULL') tname = english_upper(chartoname[t.type]) - funclist.append('%s_%s' % (tname, name)) + funclist.append('mkl_umath_%s_%s' % (tname, name)) if t.simd is not None: for vt in t.simd: code2list.append(textwrap.dedent("""\ @@ -936,8 +930,10 @@ def make_code(funcdict, filename): Please make changes to the code generator program (%s) **/ #include "Python.h" + #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION + #include "numpy/arrayobject.h" #include "numpy/ufuncobject.h" - #include "loops_intel.h" + #include "mkl_umath_loops.h" %s static int diff --git a/mkl_umath/setup.py b/mkl_umath/setup.py deleted file mode 100644 index 81a77bf..0000000 --- a/mkl_umath/setup.py +++ /dev/null @@ -1,185 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) 2019-2021, Intel Corporation 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of Intel Corporation nor the names of its contributors -# may be used to endorse or promote products derived from this software -# without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import sys -from os import (getcwd, environ, makedirs) -from os.path import join, exists, abspath, dirname -import importlib.machinery # requires Python >= 3.4 -from distutils.dep_util import newer - -from numpy.distutils.ccompiler import new_compiler -from distutils.sysconfig import customize_compiler -import platform -from numpy import get_include as get_numpy_include -from distutils.sysconfig import get_python_inc as get_python_include - -def ensure_Intel_compiler(): - ccompiler = new_compiler() - customize_compiler(ccompiler) - if hasattr(ccompiler, 'compiler'): - compiler_name = ccompiler.compiler[0] - else: - compiler_name = ccompiler.__class__.__name__ - - assert ('icl' in compiler_name or 'icc' in compiler_name), \ - "Intel(R) C Compiler is required to build mkl_umath, found {}".format(compiler_name) - - -def load_module(name, fn): - """ - Credit: numpy.compat.npy_load_module - """ - return importlib.machinery.SourceFileLoader(name, fn).load_module() - - -def separator_join(sep, strs): - """ - Joins non-empty arguments strings with dot. 
- - Credit: numpy.distutils.misc_util.dot_join - """ - assert isinstance(strs, (list, tuple)) - assert isinstance(sep, str) - return sep.join([si for si in strs if si]) - - -def configuration(parent_package='',top_path=None): - from numpy.distutils.misc_util import Configuration - from numpy.distutils.system_info import get_info - config = Configuration('mkl_umath', parent_package, top_path) - - mkl_root = environ.get('MKLROOT', None) - if mkl_root: - mkl_info = { - 'include_dirs': [join(mkl_root, 'include')], - 'library_dirs': [join(mkl_root, 'lib'), join(mkl_root, 'lib', 'intel64')], - 'libraries': ['mkl_rt'] - } - else: - mkl_info = get_info('mkl') - - print(mkl_info) - mkl_include_dirs = mkl_info.get('include_dirs', []) - mkl_library_dirs = mkl_info.get('library_dirs', []) - mkl_libraries = mkl_info.get('libraries', ['mkl_rt']) - - pdir = dirname(__file__) - wdir = join(pdir, 'src') - mkl_info = get_info('mkl') - - generate_umath_py = join(pdir, 'generate_umath.py') - n = separator_join('_', (config.name, 'generate_umath')) - generate_umath = load_module(n, generate_umath_py) - del n - - def generate_umath_c(ext, build_dir): - target_dir = join(build_dir, 'src') - target = join(target_dir, '__umath_generated.c') - if not exists(target_dir): - print("Folder {} was expected to exist, but creating".format(target_dir)) - makedirs(target_dir) - script = generate_umath_py - if newer(script, target): - with open(target, 'w') as f: - f.write(generate_umath.make_code(generate_umath.defdict, - generate_umath.__file__)) - config.add_include_dirs(target_dir) - return [] - - sources = [generate_umath_c] - - # ensure_Intel_compiler() - - if platform.system() == "Windows": - eca = ['/fp:fast=2', '/Qimf-precision=high', '/Qprec-sqrt', '/Qstd=c99', '/Qprotect-parens'] - else: - eca = ['-fp-model', 'fast=2', '-fimf-precision=high', '-prec-sqrt', '-fprotect-parens'] - - numpy_include_dir = get_numpy_include() - python_include_dir = get_python_include() - config.add_library( - 'loops_intel', - sources = [ - join(wdir, 'loops_intel.h.src'), - join(wdir, 'loops_intel.c.src'), - ], - include_dirs = [wdir] + mkl_include_dirs + [numpy_include_dir, python_include_dir], - depends = [ - join(wdir, 'blocking_utils.h'), - join(wdir, 'fast_loop_macros.h'), - join(numpy_include_dir, 'numpy', '*object.h'), - join(python_include_dir, "Python.h") - ], - libraries=mkl_libraries, - extra_compiler_args=eca, - macros=getattr(config, 'define_macros', getattr(config.get_distribution(), 'define_macros', [])) - ) - - config.add_extension( - name = '_ufuncs', - sources = [ - join(wdir, 'ufuncsmodule.c'), - ] + sources, - depends = [ - join(wdir, 'loops_intel.c.src'), - join(wdir, 'loops_intel.h.src'), - ], - include_dirs = [wdir] + mkl_include_dirs, - libraries = mkl_libraries + ['loops_intel'], - library_dirs = mkl_library_dirs, - extra_compile_args = [ - '-DNDEBUG', - # '-ggdb', '-O0', '-Wall', '-Wextra', '-DDEBUG', - ] - ) - - from Cython.Build import cythonize - from setuptools import Extension - cythonize(Extension('_patch', sources=[join(wdir, 'patch.pyx'),])) - - config.add_extension( - name = '_patch', - sources = [ - join(wdir, 'patch.c'), - ], - libraries = mkl_libraries + ['loops_intel'], - library_dirs = mkl_library_dirs, - extra_compile_args = [ - '-DNDEBUG', - #'-ggdb', '-O0', '-Wall', '-Wextra', '-DDEBUG', - ] - ) - - config.add_data_dir('tests') - -# if have_cython: -# config.ext_modules = cythonize(config.ext_modules, include_path=[pdir, wdir]) - - return config - -if __name__ == '__main__': - from 
numpy.distutils.core import setup - setup(configuration=configuration) diff --git a/mkl_umath/src/patch.pyx b/mkl_umath/src/_patch.pyx similarity index 99% rename from mkl_umath/src/patch.pyx rename to mkl_umath/src/_patch.pyx index 5814d54..fd78f8d 100644 --- a/mkl_umath/src/patch.pyx +++ b/mkl_umath/src/_patch.pyx @@ -24,7 +24,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # distutils: language = c -# cython: language_level=2 +# cython: language_level=3 import mkl_umath._ufuncs as mu import numpy.core.umath as nu diff --git a/mkl_umath/src/fast_loop_macros.h b/mkl_umath/src/fast_loop_macros.h index 50f9d41..12ef2e1 100644 --- a/mkl_umath/src/fast_loop_macros.h +++ b/mkl_umath/src/fast_loop_macros.h @@ -41,6 +41,10 @@ #define NPY_PRAGMA_VECTOR _Pragma("vector") #define NPY_PRAGMA_NOVECTOR _Pragma("novector") #define NPY_ASSUME_ALIGNED(p, b) __assume_aligned((p), (b)); +#elif defined(__clang__) +#define NPY_PRAGMA_VECTOR _Pragma("clang loop vectorize(enable)") +#define NPY_PRAGMA_NOVECTOR _Pragma("clang loop vectorize(disable)") +#define NPY_ASSUME_ALIGNED(p, b) #else #define NPY_PRAGMA_VECTOR _Pragma("GCC ivdep") #define NPY_PRAGMA_NOVECTOR @@ -70,19 +74,19 @@ npy_intp is1 = steps[0], os1 = steps[1];\ npy_intp n = dimensions[0];\ npy_intp i;\ - for(i = 0; i < n; i++, ip1 += is1, op1 += os1) + for(i = 0; i < n; ++i, ip1 += is1, op1 += os1) -#define UNARY_LOOP_VECTORIZED\ - char *ip1 = args[0], *op1 = args[1];\ - npy_intp is1 = steps[0], os1 = steps[1];\ +#define UNARY_LOOP_VECTORIZED(tin, tout)\ + tin *ip1 = (tin *) args[0];\ + tout *op1 = (tout *) args[1]; \ npy_intp n = dimensions[0];\ npy_intp i;\ NPY_PRAGMA_VECTOR\ - for(i = 0; i < n; i++, ip1 += is1, op1 += os1) + for(i = 0; i < n; ++i, ++ip1, ++op1) -#define UNARY_LOOP_DISPATCH(cond, body)\ +#define UNARY_LOOP_DISPATCH(tin, tout, cond, body)\ if (cond) {\ - UNARY_LOOP_VECTORIZED { body; }\ + UNARY_LOOP_VECTORIZED(tin, tout) { body; }\ } else {\ UNARY_LOOP { body; }\ } @@ -93,7 +97,7 @@ npy_intp is1 = steps[0], os1 = steps[1], os2 = steps[2];\ npy_intp n = dimensions[0];\ npy_intp i;\ - for(i = 0; i < n; i++, ip1 += is1, op1 += os1, op2 += os2) + for(i = 0; i < n; ++i, ip1 += is1, op1 += os1, op2 += os2) /** (ip1, ip2) -> (op1) */ #define BINARY_LOOP\ @@ -101,7 +105,7 @@ npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];\ npy_intp n = dimensions[0];\ npy_intp i;\ - for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1) + for(i = 0; i < n; ++i, ip1 += is1, ip2 += is2, op1 += os1) /** (ip1, ip2) -> (op1, op2) */ #define BINARY_LOOP_TWO_OUT\ @@ -109,7 +113,7 @@ npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2], os2 = steps[3];\ npy_intp n = dimensions[0];\ npy_intp i;\ - for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1, op2 += os2) + for(i = 0; i < n; ++i, ip1 += is1, ip2 += is2, op1 += os1, op2 += os2) /** (ip1, ip2, ip3) -> (op1) */ #define TERNARY_LOOP\ @@ -117,7 +121,7 @@ npy_intp is1 = steps[0], is2 = steps[1], is3 = steps[2], os1 = steps[3];\ npy_intp n = dimensions[0];\ npy_intp i;\ - for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, ip3 += is3, op1 += os1) + for(i = 0; i < n; ++i, ip1 += is1, ip2 += is2, ip3 += is3, op1 += os1) /** @} */ diff --git a/mkl_umath/src/loops_intel.h.src b/mkl_umath/src/loops_intel.h.src deleted file mode 100644 index c45bab4..0000000 --- a/mkl_umath/src/loops_intel.h.src +++ /dev/null @@ -1,306 +0,0 @@ -/* - * Copyright (c) 2019-2021, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are 
permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef _MKL_UMATH_LOOPS_H_ -#define _MKL_UMATH_LOOPS_H_ - -#include "numpy/ndarraytypes.h" - -#include - -/**begin repeat - * Float types - * #TYPE = FLOAT, DOUBLE# - */ - -NPY_NO_EXPORT void -@TYPE@_sqrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_invsqrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_exp(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_exp2(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_expm1(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_erf(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_log(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_log2(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_log10(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_log1p(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_cos(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_sin(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_tan(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_arccos(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_arcsin(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_arctan(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_cosh(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_sinh(char **args, 
npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_tanh(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_arccosh(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_arcsinh(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_arctanh(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_fabs(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_floor(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_ceil(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_rint(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_trunc(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_cbrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -/**begin repeat1 - * Arithmetic - * # kind = add, subtract, multiply, divide# - */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -/**begin repeat1 - * Arithmetic - * # kind = equal, not_equal, less, less_equal, greater, greater_equal, - * logical_and, logical_or# - */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -NPY_NO_EXPORT void -@TYPE@_logical_xor(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_logical_not(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -/**begin repeat1 - * #kind = isnan, isinf, isfinite, signbit# - **/ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -NPY_NO_EXPORT void -@TYPE@_spacing(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - - -NPY_NO_EXPORT void -@TYPE@_copysign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_nextafter(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -/**begin repeat1 - * #kind = maximum, minimum, fmax, fmin# - **/ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -NPY_NO_EXPORT void -@TYPE@_floor_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_remainder(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_divmod(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_square(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_reciprocal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@TYPE@__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@TYPE@_conjugate(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_absolute(char **args, npy_intp 
*dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_negative(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_positive(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_modf(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_frexp(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_ldexp(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_ldexp_long(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -#define @TYPE@_true_divide @TYPE@_divide - -/**end repeat**/ - -/* - ***************************************************************************** - ** COMPLEX LOOPS ** - ***************************************************************************** - */ - -#define CGE(xr,xi,yr,yi) (xr > yr || (xr == yr && xi >= yi)); -#define CLE(xr,xi,yr,yi) (xr < yr || (xr == yr && xi <= yi)); -#define CGT(xr,xi,yr,yi) (xr > yr || (xr == yr && xi > yi)); -#define CLT(xr,xi,yr,yi) (xr < yr || (xr == yr && xi < yi)); -#define CEQ(xr,xi,yr,yi) (xr == yr && xi == yi); -#define CNE(xr,xi,yr,yi) (xr != yr || xi != yi); - -/**begin repeat - * complex types - * #TYPE = CFLOAT, CDOUBLE# - */ - -/**begin repeat1 - * arithmetic - * #kind = add, subtract# - */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -NPY_NO_EXPORT void -@TYPE@_multiply(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_floor_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - - -/**begin repeat1 - * arithmetic - * #kind = greater, greater_equal, less, less_equal, equal, - not_equal, logical_and, logical_or, logical_xor, logical_not, - isnan, isinf, isfinite# - */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -NPY_NO_EXPORT void -@TYPE@_square(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@TYPE@_reciprocal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@TYPE@__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@TYPE@_conjugate(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@TYPE@_absolute(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@TYPE@__arg(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@TYPE@_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)); - -/**begin repeat1 - * arithmetic - * #kind = maximum, minimum, fmax, fmin# - */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -#define @TYPE@_true_divide @TYPE@_divide - -/**end repeat**/ - -#undef CGE -#undef CLE -#undef CGT -#undef CLT -#undef CEQ 
-#undef CNE - -#endif diff --git a/mkl_umath/src/loops_intel.c.src b/mkl_umath/src/mkl_umath_loops.c.src similarity index 74% rename from mkl_umath/src/loops_intel.c.src rename to mkl_umath/src/mkl_umath_loops.c.src index 0a199dc..86a62c4 100644 --- a/mkl_umath/src/loops_intel.c.src +++ b/mkl_umath/src/mkl_umath_loops.c.src @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, Intel Corporation + * Copyright (c) 2019-2023, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -29,10 +29,10 @@ #include "mkl.h" #include #include -#include "mathimf.h" #include "Python.h" #define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define NP_IMPORT_ARRAY #include "numpy/npy_common.h" #include "numpy/ndarraytypes.h" @@ -40,9 +40,9 @@ #include "numpy/ufuncobject.h" #include "numpy/npy_math.h" #include "blocking_utils.h" -#include "loops_intel.h" +#include "mkl_umath_loops.h" -/* Adapated from NumPy's source code. +/* Adapated from NumPy's source code. * https://github.com/numpy/numpy/blob/main/LICENSE.txt */ /* @@ -143,17 +143,22 @@ static inline npy_double spacing(npy_double x) { if (isinf(x)) - return ((npy_double) NAN); + return ((npy_double) NAN); return copysign(nextafter(fabs(x), ((npy_double) INFINITY)), x) - x; } static inline npy_float spacingf(npy_float x) { if (isinff(x)) - return ((npy_float) NAN); + return ((npy_float) NAN); return copysignf(nextafterf(fabsf(x), INFINITY), x) - x; } +#if defined(_MSC_VER) && defined(__INTEL_COMPILER) +extern __inline float __cdecl ldexpf( float _X, int _Y) { + return (float)ldexp(_X, _Y); +} +#endif /**begin repeat * Float types @@ -223,21 +228,26 @@ divmod@c@(@type@ a, @type@ b, @type@ *modulus) * #scalarf = sqrtf, sqrt# */ -NPY_NO_EXPORT void -@TYPE@_sqrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_sqrt(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@))) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if(can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Sqrt, dimensions[0], @type@, args[0], args[1]); /* v@c@Sqrt(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) + @type@, @type@ + , + can_vectorize , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -251,21 +261,26 @@ NPY_NO_EXPORT void * #scalarf = (1.0f)/sqrtf, (1.0)/sqrt# */ -NPY_NO_EXPORT void -@TYPE@_invsqrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_invsqrt(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if(can_vectorize && dimensions[0] > 
VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@InvSqrt, dimensions[0], @type@, args[0], args[1]); /* v@c@InvSqrt(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) + @type@, @type@ + , + can_vectorize , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -280,27 +295,29 @@ NPY_NO_EXPORT void * #scalarf = expf, exp# */ -NPY_NO_EXPORT void -@TYPE@_exp(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_exp(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; int ignore_fpstatus = 0; - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@))) { + if(can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { ignore_fpstatus = 1; CHUNKED_VML_CALL2(v@c@Exp, dimensions[0], @type@, args[0], args[1]); /* v@c@Exp(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) + @type@, @type@ + , + can_vectorize , const @type@ in1 = *(@type@ *)ip1; - if(in1 == -NPY_INFINITY@A@){ - ignore_fpstatus = 1; - } + ignore_fpstatus |= ((in1 == -NPY_INFINITY@A@) ? 1 : 0); *(@type@ *)op1 = @scalarf@(in1); - ) + ) } if(ignore_fpstatus) { feclearexcept(FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW | FE_INVALID); @@ -318,11 +335,17 @@ NPY_NO_EXPORT void */ /* TODO: Use VML */ -NPY_NO_EXPORT void -@TYPE@_exp2(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_exp2(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) + @type@, @type@ + , + can_vectorize , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); @@ -339,21 +362,25 @@ NPY_NO_EXPORT void * #scalarf = expm1f, expm1# */ -NPY_NO_EXPORT void -@TYPE@_expm1(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_expm1(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if(can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) { CHUNKED_VML_CALL2(v@c@Expm1, dimensions[0], @type@, args[0], args[1]); /* v@c@Expm1(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -367,21 +394,26 @@ 
NPY_NO_EXPORT void * #scalarf = erff, erf# */ -NPY_NO_EXPORT void -@TYPE@_erf(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_erf(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if( can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Erf, dimensions[0], @type@, args[0], args[1]); /* v@c@Erf(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -395,21 +427,26 @@ NPY_NO_EXPORT void * #scalarf = logf, log# */ -NPY_NO_EXPORT void -@TYPE@_log(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_log(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Ln, dimensions[0], @type@, args[0], args[1]); /* v@c@Ln(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -424,11 +461,17 @@ NPY_NO_EXPORT void */ /* TODO: Use VML */ -NPY_NO_EXPORT void -@TYPE@_log2(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_log2(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) + @type@, @type@ + , + can_vectorize , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); @@ -445,21 +488,26 @@ NPY_NO_EXPORT void * #scalarf = log10f, log10# */ -NPY_NO_EXPORT void -@TYPE@_log10(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_log10(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if (can_vectorize && 
dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Log10, dimensions[0], @type@, args[0], args[1]); /* v@c@Log10(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -473,21 +521,26 @@ NPY_NO_EXPORT void * #scalarf = log1pf, log1p# */ -NPY_NO_EXPORT void -@TYPE@_log1p(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_log1p(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if( can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Log1p, dimensions[0], @type@, args[0], args[1]); /* v@c@Log1p(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -501,21 +554,26 @@ NPY_NO_EXPORT void * #scalarf = cosf, cos# */ -NPY_NO_EXPORT void -@TYPE@_cos(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_cos(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if( can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Cos, dimensions[0], @type@, args[0], args[1]); /* v@c@Cos(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -529,21 +587,26 @@ NPY_NO_EXPORT void * #scalarf = sinf, sin# */ -NPY_NO_EXPORT void -@TYPE@_sin(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_sin(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Sin, dimensions[0], @type@, args[0], args[1]); /* v@c@Sin(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], 
dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -557,21 +620,26 @@ NPY_NO_EXPORT void * #scalarf = tanf, tan# */ -NPY_NO_EXPORT void -@TYPE@_tan(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_tan(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if( can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Tan, dimensions[0], @type@, args[0], args[1]); /* v@c@Tan(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -585,21 +653,26 @@ NPY_NO_EXPORT void * #scalarf = acosf, acos# */ -NPY_NO_EXPORT void -@TYPE@_arccos(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_arccos(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Acos, dimensions[0], @type@, args[0], args[1]); /* v@c@Acos(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -613,21 +686,26 @@ NPY_NO_EXPORT void * #scalarf = asinf, asin# */ -NPY_NO_EXPORT void -@TYPE@_arcsin(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_arcsin(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Asin, dimensions[0], @type@, args[0], args[1]); /* v@c@Asin(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -641,21 +719,26 @@ NPY_NO_EXPORT void * #scalarf = atanf, atan# */ -NPY_NO_EXPORT void 
-@TYPE@_arctan(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_arctan(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if( can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Atan, dimensions[0], @type@, args[0], args[1]); /* v@c@Atan(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -669,21 +752,26 @@ NPY_NO_EXPORT void * #scalarf = coshf, cosh# */ -NPY_NO_EXPORT void -@TYPE@_cosh(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_cosh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Cosh, dimensions[0], @type@, args[0], args[1]); /* v@c@Cosh(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -697,21 +785,26 @@ NPY_NO_EXPORT void * #scalarf = sinhf, sinh# */ -NPY_NO_EXPORT void -@TYPE@_sinh(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_sinh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if( can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Sinh, dimensions[0], @type@, args[0], args[1]); /* v@c@Sinh(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -725,21 +818,26 @@ NPY_NO_EXPORT void * #scalarf = tanhf, tanh# */ -NPY_NO_EXPORT void -@TYPE@_tanh(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_tanh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - 
dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Tanh, dimensions[0], @type@, args[0], args[1]); /* v@c@Tanh(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -753,21 +851,26 @@ NPY_NO_EXPORT void * #scalarf = acoshf, acosh# */ -NPY_NO_EXPORT void -@TYPE@_arccosh(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_arccosh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Acosh, dimensions[0], @type@, args[0], args[1]); /* v@c@Acosh(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -781,21 +884,26 @@ NPY_NO_EXPORT void * #scalarf = asinhf, asinh# */ -NPY_NO_EXPORT void -@TYPE@_arcsinh(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_arcsinh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Asinh, dimensions[0], @type@, args[0], args[1]); /* v@c@Asinh(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -809,21 +917,26 @@ NPY_NO_EXPORT void * #scalarf = atanhf, atanh# */ -NPY_NO_EXPORT void -@TYPE@_arctanh(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_arctanh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], 
args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if( can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Atanh, dimensions[0], @type@, args[0], args[1]); /* v@c@Atanh(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -837,11 +950,17 @@ NPY_NO_EXPORT void * #scalarf = fabsf, fabs# */ -NPY_NO_EXPORT void -@TYPE@_fabs(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_fabs(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) + @type@, @type@ + , + can_vectorize , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); @@ -858,21 +977,26 @@ NPY_NO_EXPORT void * #scalarf = floorf, floor# */ -NPY_NO_EXPORT void -@TYPE@_floor(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_floor(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(steps[0] == sizeof(@type@) && steps[1] == sizeof(@type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Floor, dimensions[0], @type@, args[0], args[1]); /* v@c@Floor(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -886,21 +1010,26 @@ NPY_NO_EXPORT void * #scalarf = ceilf, ceil# */ -NPY_NO_EXPORT void -@TYPE@_ceil(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_ceil(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Ceil, dimensions[0], @type@, args[0], args[1]); /* v@c@Ceil(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -914,21 +1043,26 @@ NPY_NO_EXPORT void * #scalarf = rintf, rint# */ -NPY_NO_EXPORT 
void -@TYPE@_rint(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_rint(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(steps[0] == sizeof(@type@) && steps[1] == sizeof(@type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Rint, dimensions[0], @type@, args[0], args[1]); /* v@c@Rint(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -942,21 +1076,26 @@ NPY_NO_EXPORT void * #scalarf = truncf, trunc# */ -NPY_NO_EXPORT void -@TYPE@_trunc(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_trunc(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if( can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Trunc, dimensions[0], @type@, args[0], args[1]); /* v@c@Trunc(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -970,21 +1109,26 @@ NPY_NO_EXPORT void * #scalarf = cbrtf, cbrt# */ -NPY_NO_EXPORT void -@TYPE@_cbrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_cbrt(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Cbrt, dimensions[0], @type@, args[0], args[1]); /* v@c@Cbrt(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -1094,8 +1238,8 @@ pairwise_sum_@TYPE@(char *a, npy_intp n, npy_intp stride) * # PW = 1# * # VML = Add# */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp 
*steps, void *NPY_UNUSED(func)) { if(IS_BINARY_CONT(@type@, @type@)) { #if @SUPPORTED_BY_VML@ @@ -1127,19 +1271,19 @@ NPY_NO_EXPORT void @type@ *op1_shifted = op1 + peel; @type@ *ip2_shifted = ip2 + peel; - if( DISJOINT_OR_SAME(op1_shifted, ip1_aligned, j_max, 1) && - DISJOINT_OR_SAME(op1_shifted, ip2_shifted, j_max, 1) ) { - NPY_ASSUME_ALIGNED(ip1_aligned, 64) - NPY_PRAGMA_VECTOR - for(j = 0; j < j_max; j++) { - op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; - } - } else { - NPY_ASSUME_ALIGNED(ip1_aligned, 64) - for(j = 0; j < j_max; j++) { - op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; - } - } + if( DISJOINT_OR_SAME(op1_shifted, ip1_aligned, j_max, 1) && + DISJOINT_OR_SAME(op1_shifted, ip2_shifted, j_max, 1) ) { + NPY_ASSUME_ALIGNED(ip1_aligned, 64) + NPY_PRAGMA_VECTOR + for(j = 0; j < j_max; j++) { + op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; + } + } else { + NPY_ASSUME_ALIGNED(ip1_aligned, 64) + for(j = 0; j < j_max; j++) { + op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; + } + } i = blocked_end; } @@ -1262,8 +1406,8 @@ NPY_NO_EXPORT void * # PW = 0# * # VML = Sub# */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if(IS_BINARY_CONT(@type@, @type@)) { #if @SUPPORTED_BY_VML@ @@ -1295,19 +1439,19 @@ NPY_NO_EXPORT void @type@ *ip2_shifted = ip2 + peel; @type@ *op1_shifted = op1 + peel; - if( DISJOINT_OR_SAME(op1_shifted, ip1_aligned, j_max, 1) && - DISJOINT_OR_SAME(op1_shifted, ip2_shifted, j_max, 1) ) { - NPY_ASSUME_ALIGNED(ip1_aligned, 64) - NPY_PRAGMA_VECTOR - for(j = 0; j < j_max; j++) { - op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; - } - } else { - NPY_ASSUME_ALIGNED(ip1_aligned, 64) - for(j = 0; j < j_max; j++) { - op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; - } - } + if( DISJOINT_OR_SAME(op1_shifted, ip1_aligned, j_max, 1) && + DISJOINT_OR_SAME(op1_shifted, ip2_shifted, j_max, 1) ) { + NPY_ASSUME_ALIGNED(ip1_aligned, 64) + NPY_PRAGMA_VECTOR + for(j = 0; j < j_max; j++) { + op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; + } + } else { + NPY_ASSUME_ALIGNED(ip1_aligned, 64) + for(j = 0; j < j_max; j++) { + op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; + } + } i = blocked_end; } @@ -1430,8 +1574,8 @@ NPY_NO_EXPORT void * # PW = 0# * # VML = Mul# */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if(IS_BINARY_CONT(@type@, @type@)) { #if @SUPPORTED_BY_VML@ @@ -1463,19 +1607,19 @@ NPY_NO_EXPORT void @type@ *ip2_shifted = ip2 + peel; @type@ *op1_shifted = op1 + peel; - if( DISJOINT_OR_SAME(op1_shifted, ip1_aligned, j_max, 1) && - DISJOINT_OR_SAME(op1_shifted, ip2_shifted, j_max, 1) ) { - NPY_ASSUME_ALIGNED(ip1_aligned, 64) - NPY_PRAGMA_VECTOR - for(j = 0; j < j_max; j++) { - op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; - } - } else { - NPY_ASSUME_ALIGNED(ip1_aligned, 64) - for(j = 0; j < j_max; j++) { - op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; - } - } + if( DISJOINT_OR_SAME(op1_shifted, ip1_aligned, j_max, 1) && + DISJOINT_OR_SAME(op1_shifted, ip2_shifted, j_max, 1) ) { + NPY_ASSUME_ALIGNED(ip1_aligned, 64) + NPY_PRAGMA_VECTOR + for(j = 0; j < j_max; j++) { + op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; + } + } else { + 
NPY_ASSUME_ALIGNED(ip1_aligned, 64) + for(j = 0; j < j_max; j++) { + op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; + } + } i = blocked_end; } @@ -1598,8 +1742,8 @@ NPY_NO_EXPORT void * # PW = 0# * # VML = Div# */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if(IS_BINARY_CONT(@type@, @type@)) { #if @SUPPORTED_BY_VML@ @@ -1620,37 +1764,37 @@ NPY_NO_EXPORT void const npy_intp blocked_end = npy_blocked_end(peel, sizeof(@type@), vsize, n); npy_intp i; - NPY_PRAGMA_NOVECTOR + NPY_PRAGMA_NOVECTOR for(i = 0; i < peel; i++) { op1[i] = ip1[i] @OP@ ip2[i]; } { npy_intp j, j_max = blocked_end - peel; - j_max &= (~0xf); - const npy_intp blocked_end = j_max + peel; + j_max &= (~0xf); + const npy_intp blocked_end = j_max + peel; if (j_max > 0) { @type@ *ip1_aligned = ip1 + peel, *op1_shifted = op1 + peel, *ip2_shifted = ip2 + peel; - if( DISJOINT_OR_SAME(op1_shifted, ip1_aligned, j_max, 1) && - DISJOINT_OR_SAME(op1_shifted, ip2_shifted, j_max, 1) ) { - NPY_ASSUME_ALIGNED(ip1_aligned, 64) - NPY_PRAGMA_VECTOR - for(j = 0; j < j_max; j++) { - op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; - } - } else { - NPY_ASSUME_ALIGNED(ip1_aligned, 64) - for(j = 0; j < j_max; j++) { - op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; - } - } + if( DISJOINT_OR_SAME(op1_shifted, ip1_aligned, j_max, 1) && + DISJOINT_OR_SAME(op1_shifted, ip2_shifted, j_max, 1) ) { + NPY_ASSUME_ALIGNED(ip1_aligned, 64) + NPY_PRAGMA_VECTOR + for(j = 0; j < j_max; j++) { + op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; + } + } else { + NPY_ASSUME_ALIGNED(ip1_aligned, 64) + for(j = 0; j < j_max; j++) { + op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; + } + } i = blocked_end; } } - NPY_PRAGMA_NOVECTOR + NPY_PRAGMA_NOVECTOR for(; i < n; i++) { op1[i] = ip1[i] @OP@ ip2[i]; } @@ -1666,7 +1810,7 @@ NPY_NO_EXPORT void npy_intp i; const @type@ ip1c = ip1[0]; - NPY_PRAGMA_NOVECTOR + NPY_PRAGMA_NOVECTOR for(i = 0; i < peel; i++) { op1[i] = ip1c @OP@ ip2[i]; } @@ -1685,7 +1829,7 @@ NPY_NO_EXPORT void } } - NPY_PRAGMA_NOVECTOR + NPY_PRAGMA_NOVECTOR for(; i < n; i++) { op1[i] = ip1c @OP@ ip2[i]; } @@ -1700,7 +1844,7 @@ NPY_NO_EXPORT void npy_intp i; const @type@ ip2c = ip2[0]; - NPY_PRAGMA_NOVECTOR + NPY_PRAGMA_NOVECTOR for(i = 0; i < peel; i++) { op1[i] = ip1[i] @OP@ ip2c; } @@ -1719,7 +1863,7 @@ NPY_NO_EXPORT void } } - NPY_PRAGMA_NOVECTOR + NPY_PRAGMA_NOVECTOR for(; i < n; i++) { op1[i] = ip1[i] @OP@ ip2c; } @@ -1750,8 +1894,8 @@ NPY_NO_EXPORT void * logical_and, logical_or# * #OP = ==, !=, <, <=, >, >=, &&, ||# */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { { BINARY_LOOP { @@ -1763,8 +1907,8 @@ NPY_NO_EXPORT void } /**end repeat1**/ -NPY_NO_EXPORT void -@TYPE@_logical_xor(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_logical_xor(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { BINARY_LOOP { const int t1 = !!*(@type@ *)ip1; @@ -1773,8 +1917,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_logical_not(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_logical_not(char **args, const npy_intp *dimensions, const 
npy_intp *steps, void *NPY_UNUSED(func)) { UNARY_LOOP { const @type@ in1 = *(@type@ *)ip1; @@ -1786,8 +1930,8 @@ NPY_NO_EXPORT void * #kind = isnan, isinf, isfinite, signbit# * #func = isnan, isinf, isfinite, signbit# **/ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { { UNARY_LOOP { @@ -1799,8 +1943,8 @@ NPY_NO_EXPORT void } /**end repeat1**/ -NPY_NO_EXPORT void -@TYPE@_spacing(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_spacing(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { UNARY_LOOP { const @type@ in1 = *(@type@ *)ip1; @@ -1808,8 +1952,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_copysign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_copysign(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { BINARY_LOOP { const @type@ in1 = *(@type@ *)ip1; @@ -1818,8 +1962,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_nextafter(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_nextafter(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { BINARY_LOOP { const @type@ in1 = *(@type@ *)ip1; @@ -1832,8 +1976,8 @@ NPY_NO_EXPORT void * #kind = maximum, minimum# * #OP = >=, <=# **/ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { /* */ if (IS_BINARY_REDUCE) { @@ -1863,8 +2007,8 @@ NPY_NO_EXPORT void * #kind = fmax, fmin# * #OP = >=, <=# **/ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { /* */ if (IS_BINARY_REDUCE) { @@ -1887,19 +2031,8 @@ NPY_NO_EXPORT void } /**end repeat1**/ -NPY_NO_EXPORT void -@TYPE@_floor_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) -{ - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - @type@ mod; - *((@type@ *)op1) = divmod@c@(in1, in2, &mod); - } -} - -NPY_NO_EXPORT void -@TYPE@_remainder(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_remainder(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { BINARY_LOOP { const @type@ in1 = *(@type@ *)ip1; @@ -1908,8 +2041,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_divmod(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_divmod(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { BINARY_LOOP_TWO_OUT { const @type@ in1 = *(@type@ *)ip1; @@ -1918,8 +2051,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_square(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) +void +mkl_umath_@TYPE@_square(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)) { #if @SUPPORTED_BY_VML@ if(IS_UNARY_CONT(@type@, @type@) && @@ -1937,8 +2070,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void 
-@TYPE@_reciprocal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) +void +mkl_umath_@TYPE@_reciprocal(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)) { #if @SUPPORTED_BY_VML@ if(IS_UNARY_CONT(@type@, @type@) && @@ -1956,16 +2089,16 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) +void +mkl_umath_@TYPE@__ones_like(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)) { OUTPUT_LOOP { *((@type@ *)op1) = 1; } } -NPY_NO_EXPORT void -@TYPE@_conjugate(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_conjugate(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { UNARY_LOOP { const @type@ in1 = *(@type@ *)ip1; @@ -1973,8 +2106,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_absolute(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_absolute(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { #if @SUPPORTED_BY_VML@ if(IS_UNARY_CONT(@type@, @type@) && @@ -1995,8 +2128,8 @@ NPY_NO_EXPORT void feclearexcept(FE_ALL_EXCEPT); /* clear floatstatus */ } -NPY_NO_EXPORT void -@TYPE@_negative(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_negative(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { { UNARY_LOOP { @@ -2006,8 +2139,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_positive(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_positive(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { UNARY_LOOP { const @type@ in1 = *(@type@ *)ip1; @@ -2015,8 +2148,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_sign(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { /* Sign of nan is nan */ UNARY_LOOP { @@ -2025,8 +2158,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_modf(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_modf(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { UNARY_LOOP_TWO_OUT { const @type@ in1 = *(@type@ *)ip1; @@ -2034,8 +2167,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_frexp(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_frexp(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { UNARY_LOOP_TWO_OUT { const @type@ in1 = *(@type@ *)ip1; @@ -2043,8 +2176,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_ldexp(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_ldexp(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { BINARY_LOOP { const @type@ in1 = *(@type@ *)ip1; @@ -2053,8 +2186,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_ldexp_long(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_ldexp_long(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { /* * Additional loop to handle npy_long integer inputs (cf. 
#866, #1633). @@ -2083,7 +2216,7 @@ NPY_NO_EXPORT void } } -#define @TYPE@_true_divide @TYPE@_divide +#define mkl_umath_@TYPE@_true_divide mkl_umath_@TYPE@_divide /**end repeat**/ @@ -2159,13 +2292,13 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, char * a, npy_intp n, for (i = 8; i < n - (n % 8); i += 8) { /* small blocksizes seems to mess with hardware prefetch */ NPY_PREFETCH(a + (i + 512 /(npy_intp)sizeof(@ftype@))*stride, 0, 3); - r[0] += *((@ftype@ *)(a + (i + 0) * stride)); + r[0] += *((@ftype@ *)(a + (i + 0) * stride)); r[1] += *((@ftype@ *)(a + (i + 0) * stride + sizeof(@ftype@))); - r[2] += *((@ftype@ *)(a + (i + 2) * stride)); + r[2] += *((@ftype@ *)(a + (i + 2) * stride)); r[3] += *((@ftype@ *)(a + (i + 2) * stride + sizeof(@ftype@))); - r[4] += *((@ftype@ *)(a + (i + 4) * stride)); + r[4] += *((@ftype@ *)(a + (i + 4) * stride)); r[5] += *((@ftype@ *)(a + (i + 4) * stride + sizeof(@ftype@))); - r[6] += *((@ftype@ *)(a + (i + 6) * stride)); + r[6] += *((@ftype@ *)(a + (i + 6) * stride)); r[7] += *((@ftype@ *)(a + (i + 6) * stride + sizeof(@ftype@))); } @@ -2200,8 +2333,8 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, char * a, npy_intp n, * #OP = +, -# * #PW = 1, 0# */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if (IS_BINARY_REDUCE && @PW@) { npy_intp n = dimensions[0]; @@ -2227,8 +2360,8 @@ NPY_NO_EXPORT void } /**end repeat1**/ -NPY_NO_EXPORT void -@TYPE@_multiply(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_multiply(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { BINARY_LOOP { const @ftype@ in1r = ((@ftype@ *)ip1)[0]; @@ -2240,8 +2373,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_divide(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { BINARY_LOOP { const @ftype@ in1r = ((@ftype@ *)ip1)[0]; @@ -2272,33 +2405,12 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_floor_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) -{ - BINARY_LOOP { - const @ftype@ in1r = ((@ftype@ *)ip1)[0]; - const @ftype@ in1i = ((@ftype@ *)ip1)[1]; - const @ftype@ in2r = ((@ftype@ *)ip2)[0]; - const @ftype@ in2i = ((@ftype@ *)ip2)[1]; - if (fabs@c@(in2r) >= fabs@c@(in2i)) { - const @ftype@ rat = in2i/in2r; - ((@ftype@ *)op1)[0] = floor@c@((in1r + in1i*rat)/(in2r + in2i*rat)); - ((@ftype@ *)op1)[1] = 0; - } - else { - const @ftype@ rat = in2r/in2i; - ((@ftype@ *)op1)[0] = floor@c@((in1r*rat + in1i)/(in2i + in2r*rat)); - ((@ftype@ *)op1)[1] = 0; - } - } -} - /**begin repeat1 * #kind= greater, greater_equal, less, less_equal, equal, not_equal# * #OP = CGT, CGE, CLT, CLE, CEQ, CNE# */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { BINARY_LOOP { const @ftype@ in1r = ((@ftype@ *)ip1)[0]; @@ -2315,8 +2427,8 @@ NPY_NO_EXPORT void #OP1 = ||, ||# #OP2 = &&, ||# */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const 
npy_intp *steps, void *NPY_UNUSED(func)) { BINARY_LOOP { const @ftype@ in1r = ((@ftype@ *)ip1)[0]; @@ -2328,8 +2440,8 @@ NPY_NO_EXPORT void } /**end repeat1**/ -NPY_NO_EXPORT void -@TYPE@_logical_xor(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_logical_xor(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { BINARY_LOOP { const @ftype@ in1r = ((@ftype@ *)ip1)[0]; @@ -2342,8 +2454,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_logical_not(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_logical_not(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { UNARY_LOOP { const @ftype@ in1r = ((@ftype@ *)ip1)[0]; @@ -2357,8 +2469,8 @@ NPY_NO_EXPORT void * #func = isnan, isinf, isfinite# * #OP = ||, ||, &&# **/ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { UNARY_LOOP { const @ftype@ in1r = ((@ftype@ *)ip1)[0]; @@ -2369,8 +2481,8 @@ NPY_NO_EXPORT void } /**end repeat1**/ -NPY_NO_EXPORT void -@TYPE@_square(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) +void +mkl_umath_@TYPE@_square(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)) { UNARY_LOOP { const @ftype@ in1r = ((@ftype@ *)ip1)[0]; @@ -2380,8 +2492,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_reciprocal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) +void +mkl_umath_@TYPE@_reciprocal(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)) { UNARY_LOOP { const @ftype@ in1r = ((@ftype@ *)ip1)[0]; @@ -2400,8 +2512,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) +void +mkl_umath_@TYPE@__ones_like(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)) { OUTPUT_LOOP { ((@ftype@ *)op1)[0] = 1; @@ -2409,8 +2521,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_conjugate(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) { +void +mkl_umath_@TYPE@_conjugate(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { UNARY_LOOP { const @ftype@ in1r = ((@ftype@ *)ip1)[0]; const @ftype@ in1i = ((@ftype@ *)ip1)[1]; @@ -2419,8 +2531,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_absolute(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_absolute(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { int ignore_fpstatus = 0; @@ -2449,8 +2561,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@__arg(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@__arg(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { UNARY_LOOP { const @ftype@ in1r = ((@ftype@ *)ip1)[0]; @@ -2459,8 +2571,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_sign(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { /* fixme: sign of nan is currently 0 */ UNARY_LOOP { @@ -2478,8 
+2590,8 @@ NPY_NO_EXPORT void * #kind = maximum, minimum# * #OP = CGE, CLE# */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { BINARY_LOOP { @ftype@ in1r = ((@ftype@ *)ip1)[0]; @@ -2501,8 +2613,8 @@ NPY_NO_EXPORT void * #kind = fmax, fmin# * #OP = CGE, CLE# */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { BINARY_LOOP { const @ftype@ in1r = ((@ftype@ *)ip1)[0]; @@ -2522,7 +2634,7 @@ NPY_NO_EXPORT void } /**end repeat1**/ -#define @TYPE@_true_divide @TYPE@_divide +#define mkl_umath_@TYPE@_true_divide mkl_umath_@TYPE@_divide /**end repeat**/ diff --git a/mkl_umath/src/mkl_umath_loops.h.src b/mkl_umath/src/mkl_umath_loops.h.src new file mode 100644 index 0000000..c643c20 --- /dev/null +++ b/mkl_umath/src/mkl_umath_loops.h.src @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2019-2023, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _MKL_UMATH_LOOPS_H_ +#define _MKL_UMATH_LOOPS_H_ + +#include "numpy/ndarraytypes.h" + +#include + +#ifdef _WIN32 +#ifdef mkl_umath_loops_EXPORTS +#define MKL_UMATH_API __declspec(dllexport) +#else +#define MKL_UMATH_API __declspec(dllimport) +#endif +#else +#define MKL_UMATH_API +#endif + +/**begin repeat + * Float types + * #TYPE = FLOAT, DOUBLE# + */ + +MKL_UMATH_API +void +mkl_umath_@TYPE@_sqrt(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_invsqrt(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_exp(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_exp2(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_expm1(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_erf(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_log(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_log2(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_log10(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_log1p(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_cos(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_sin(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_tan(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_arccos(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_arcsin(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_arctan(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_cosh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_sinh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_tanh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_arccosh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_arcsinh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_arctanh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_fabs(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_floor(char **args, 
const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_ceil(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_rint(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_trunc(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_cbrt(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +/**begin repeat1 + * Arithmetic + * # kind = add, subtract, multiply, divide# + */ +MKL_UMATH_API +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**begin repeat1 + * Arithmetic + * # kind = equal, not_equal, less, less_equal, greater, greater_equal, + * logical_and, logical_or# + */ +MKL_UMATH_API +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +MKL_UMATH_API +void +mkl_umath_@TYPE@_logical_xor(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_logical_not(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +/**begin repeat1 + * #kind = isnan, isinf, isfinite, signbit# + **/ +MKL_UMATH_API +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +MKL_UMATH_API +void +mkl_umath_@TYPE@_spacing(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + + +MKL_UMATH_API +void +mkl_umath_@TYPE@_copysign(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_nextafter(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +/**begin repeat1 + * #kind = maximum, minimum, fmax, fmin# + **/ +MKL_UMATH_API +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +MKL_UMATH_API +void +mkl_umath_@TYPE@_floor_divide(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_remainder(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_divmod(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_square(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_reciprocal(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@__ones_like(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_conjugate(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_absolute(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_negative(char **args, const npy_intp *dimensions, const npy_intp *steps, void 
*NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_positive(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_sign(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_modf(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_frexp(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_ldexp(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_ldexp_long(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +#define mkl_umath_@TYPE@_true_divide mkl_umath_@TYPE@_divide + +/**end repeat**/ + +/* + ***************************************************************************** + ** COMPLEX LOOPS ** + ***************************************************************************** + */ + +#define CGE(xr,xi,yr,yi) (xr > yr || (xr == yr && xi >= yi)); +#define CLE(xr,xi,yr,yi) (xr < yr || (xr == yr && xi <= yi)); +#define CGT(xr,xi,yr,yi) (xr > yr || (xr == yr && xi > yi)); +#define CLT(xr,xi,yr,yi) (xr < yr || (xr == yr && xi < yi)); +#define CEQ(xr,xi,yr,yi) (xr == yr && xi == yi); +#define CNE(xr,xi,yr,yi) (xr != yr || xi != yi); + +/**begin repeat + * complex types + * #TYPE = CFLOAT, CDOUBLE# + */ + +/**begin repeat1 + * arithmetic + * #kind = add, subtract# + */ +MKL_UMATH_API +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +MKL_UMATH_API +void +mkl_umath_@TYPE@_multiply(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_divide(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_floor_divide(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + + +/**begin repeat1 + * arithmetic + * #kind = greater, greater_equal, less, less_equal, equal, + not_equal, logical_and, logical_or, logical_xor, logical_not, + isnan, isinf, isfinite# + */ +MKL_UMATH_API +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +MKL_UMATH_API +void +mkl_umath_@TYPE@_square(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_reciprocal(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@__ones_like(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_conjugate(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_absolute(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@__arg(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); + +MKL_UMATH_API +void +mkl_umath_@TYPE@_sign(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); + +/**begin repeat1 + * arithmetic + * #kind = maximum, 
minimum, fmax, fmin# + */ +MKL_UMATH_API +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +#define mkl_umath_@TYPE@_true_divide mkl_umath_@TYPE@_divide + +/**end repeat**/ + +#undef CGE +#undef CLE +#undef CGT +#undef CLT +#undef CEQ +#undef CNE + +#endif diff --git a/mkl_umath/src/ufuncsmodule.h b/mkl_umath/src/ufuncsmodule.h index 2526763..acb6bbd 100644 --- a/mkl_umath/src/ufuncsmodule.h +++ b/mkl_umath/src/ufuncsmodule.h @@ -25,6 +25,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "Python.h" +#define PY_ARRAY_UNIQUE_SYMBOL mkl_umath_ufunc_ext #include "numpy/arrayobject.h" #include "numpy/ndarraytypes.h" #include "numpy/ufuncobject.h" diff --git a/mkl_umath/tests/test_basic.py b/mkl_umath/tests/test_basic.py index 14e5ded..88770a9 100644 --- a/mkl_umath/tests/test_basic.py +++ b/mkl_umath/tests/test_basic.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, Intel Corporation +# Copyright (c) 2019-2023, Intel Corporation # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: @@ -23,6 +23,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import pytest import numpy as np import mkl_umath._ufuncs as mu import numpy.core.umath as nu @@ -41,19 +42,16 @@ def get_args(args_str): elif s == 'D': args.append(np.double(np.random.random_sample()) + np.double(np.random.random_sample()) * 1j) elif s == 'i': - args.append(np.int(np.random.randint(low=1, high=10))) + args.append(np.int_(np.random.randint(low=1, high=10))) elif s == 'l': - args.append(np.long(np.random.randint(low=1, high=10))) + args.append(np.dtype('long').type(np.random.randint(low=1, high=10))) else: raise ValueError("Unexpected type specified!") return tuple(args) umaths = [i for i in dir(mu) if isinstance(getattr(mu, i), np.ufunc)] - umaths.remove('arccosh') # expects input greater than 1 -# dictionary with test cases -# (umath, types) : args generated_cases = {} for umath in umaths: mkl_umath = getattr(mu, umath) @@ -64,29 +62,30 @@ def get_args(args_str): generated_cases[(umath, type)] = args additional_cases = { -('arccosh', 'f->f') : (np.single(np.random.random_sample() + 1),), -('arccosh', 'd->d') : (np.double(np.random.random_sample() + 1),), + ('arccosh', 'f->f'): (np.single(np.random.random_sample() + 1),), + ('arccosh', 'd->d'): (np.double(np.random.random_sample() + 1),), } -test_cases = {} -for d in (generated_cases, additional_cases): - test_cases.update(d) +test_cases = {**generated_cases, **additional_cases} -for case in test_cases: - umath = case[0] - type = case[1] +@pytest.mark.parametrize("case", list(test_cases.keys())) +def test_umath(case): + umath, type = case args = test_cases[case] mkl_umath = getattr(mu, umath) np_umath = getattr(nu, umath) print('*'*80) - print(umath, type) - print("args", args) + print(f"Testing {umath} with type {type}") + print("args:", args) + mkl_res = mkl_umath(*args) np_res = np_umath(*args) - print("mkl res", mkl_res) - print("npy res", np_res) - - assert np.array_equal(mkl_res, np_res) + + print("mkl res:", mkl_res) + print("npy res:", np_res) + + assert np.allclose(mkl_res, np_res), f"Results for {umath} do not match" -print("Test cases count:", len(test_cases)) -print("All looks good!") +def test_cases_count(): + print("Test cases count:", len(test_cases)) 
+ assert len(test_cases) > 0, "No test cases found" diff --git a/mkl_umath/ufunc_docstrings.py b/mkl_umath/ufunc_docstrings.py index 5abc3af..79877e2 100644 --- a/mkl_umath/ufunc_docstrings.py +++ b/mkl_umath/ufunc_docstrings.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, Intel Corporation +# Copyright (c) 2019-2023, Intel Corporation # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: diff --git a/setup.py b/setup.py index 1ab571e..0ee7fa6 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# Copyright (c) 2019-2021, Intel Corporation +# Copyright (c) 2019-2023, Intel Corporation # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: @@ -24,8 +24,23 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import importlib.machinery import io +import os import re +from distutils.dep_util import newer +from _vendored.conv_template import process_file as process_c_file +from os import (getcwd, environ, makedirs) +from os.path import join, exists, abspath, dirname +from setuptools import Extension + +import skbuild +import skbuild.setuptools_wrap +import skbuild.utils +from skbuild.command.build_py import build_py as _skbuild_build_py +from skbuild.command.install import install as _skbuild_install + +# import versioneer with io.open('mkl_umath/_version.py', 'rt', encoding='utf8') as f: version = re.search(r'__version__ = \'(.*?)\'', f.read()).group(1) @@ -54,46 +69,86 @@ Operating System :: MacOS """ -def configuration(parent_package='',top_path=None): - from numpy.distutils.misc_util import Configuration - - config = Configuration(None, parent_package, top_path) - config.set_options(ignore_setup_xxx_py=True, - assume_default_configuration=True, - delegate_options_to_subpackages=True, - quiet=True) - - config.add_subpackage('mkl_umath') - - config.version = VERSION - - return config - - -def setup_package(): - from setuptools import setup - from numpy.distutils.core import setup - metadata = dict( - name = 'mkl_umath', - maintainer = "Intel Corp.", - maintainer_email = "scripting@intel.com", - description = "MKL-based universal functions for NumPy arrays", - long_description = long_description, - long_description_content_type="text/markdown", - url = "http://github.com/IntelPython/mkl_umath", - author = "Intel Corporation", - download_url = "http://github.com/IntelPython/mkl_umath", - license = 'BSD', - classifiers = [_f for _f in CLASSIFIERS.split('\n') if _f], - platforms = ["Windows", "Linux", "Mac OS-X"], - test_suite = 'nose.collector', - python_requires = '>=3.6', - install_requires = ['numpy'], - configuration = configuration - ) - setup(**metadata) - - return None - -if __name__ == '__main__': - setup_package() + +def load_module(name, fn): + """ + Credit: numpy.compat.npy_load_module + """ + return importlib.machinery.SourceFileLoader(name, fn).load_module() + +def separator_join(sep, strs): + """ + Joins non-empty arguments strings with dot. 
+ + Credit: numpy.distutils.misc_util.dot_join + """ + assert isinstance(strs, (list, tuple)) + assert isinstance(sep, str) + return sep.join([si for si in strs if si]) + +pdir = join(dirname(__file__), 'mkl_umath') +wdir = join(pdir, 'src') + +generate_umath_py = join(pdir, 'generate_umath.py') +n = separator_join('_', ('mkl_umath', 'generate_umath')) +generate_umath = load_module(n, generate_umath_py) +del n + + +def generate_umath_c(build_dir): + target_dir = join(build_dir, 'src') + target = join(target_dir, '__umath_generated.c') + if not exists(target_dir): + print("Folder {} was expected to exist, but creating".format(target_dir)) + makedirs(target_dir) + script = generate_umath_py + if newer(script, target): + with open(target, 'w') as f: + f.write(generate_umath.make_code(generate_umath.defdict, + generate_umath.__file__)) + return [] + + +generate_umath_c(pdir) + +loops_header_templ = join(wdir, "mkl_umath_loops.h.src") +processed_loops_h_fn = join(wdir, "mkl_umath_loops.h") +loops_header_processed = process_c_file(loops_header_templ) + +with open(processed_loops_h_fn, 'w') as fid: + fid.write(loops_header_processed) + +loops_src_templ = join(wdir, "mkl_umath_loops.c.src") +processed_loops_src_fn = join(wdir, "mkl_umath_loops.c") +loops_src_processed = process_c_file(loops_src_templ) + +with open(processed_loops_src_fn, 'w') as fid: + fid.write(loops_src_processed) + + +skbuild.setup( + name="mkl_umath", + version=VERSION, + maintainer = "Intel Corp.", + maintainer_email = "scripting@intel.com", + description = "MKL-based universal functions for NumPy arrays", + long_description = long_description, + long_description_content_type="text/markdown", + license = 'BSD', + author="Intel Corporation", + url="http://github.com/IntelPython/mkl_umath", + download_url="http://github.com/IntelPython/mkl_umath", + packages=[ + "mkl_umath", + ], + package_data={"mkl_umath": ["tests/*.*"]}, + include_package_data=True, + zip_safe=False, + setup_requires=["Cython"], + install_requires=[ + "numpy", + ], + keywords="mkl_umath", + classifiers=[_f for _f in CLASSIFIERS.split("\n") if _f], + platforms=["Linux", "Windows"] +)
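Editor's note (not part of the patch): the mkl_umath_loops.h.src hunks above are conv_template sources; each /**begin repeat ... @TYPE@ / @kind@ ... **/ block is expanded by the vendored conv_template processor (invoked from setup.py as process_c_file) into one concrete declaration per listed type, and MKL_UMATH_API resolves to __declspec(dllexport/dllimport) on Windows and to nothing elsewhere. The sketch below is illustrative only: it shows roughly what one expansion of the header template looks like and how a loop with NumPy's standard inner-loop signature walks its strided arguments. The name my_DOUBLE_square is a hypothetical stand-in, and the scalar body is only an assumption for illustration; the package's real loops are expected to call into MKL's vector math where they can.

    /* Illustrative sketch -- assumes NumPy headers are on the include path. */
    #include "numpy/ndarraytypes.h"

    /* Roughly what one repeat expansion of the header template yields,
     * e.g. for TYPE=DOUBLE, kind=square (mkl_umath_@TYPE@_@kind@ pattern): */
    void
    mkl_umath_DOUBLE_square(char **args, const npy_intp *dimensions,
                            const npy_intp *steps, void *func);

    /* A minimal strided inner loop with the same contract: args[0]/args[1]
     * are the input/output base pointers, dimensions[0] is the element
     * count, and steps[] holds the byte strides of each operand. */
    static void
    my_DOUBLE_square(char **args, const npy_intp *dimensions,
                     const npy_intp *steps, void *func)
    {
        char *in = args[0], *out = args[1];
        npy_intp n = dimensions[0];
        npy_intp is = steps[0], os = steps[1];
        (void)func;  /* unused, kept to match the ufunc loop signature */

        for (npy_intp i = 0; i < n; ++i, in += is, out += os) {
            const double x = *(const double *)in;
            *(double *)out = x * x;  /* scalar stand-in for a vectorized kernel */
        }
    }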