diff --git a/.github/workflows/c-api-linux.yaml b/.github/workflows/c-api-linux.yaml new file mode 100644 index 0000000..bec68e0 --- /dev/null +++ b/.github/workflows/c-api-linux.yaml @@ -0,0 +1,51 @@ +name: Build C-API shared library with g++ for Linux + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + build-linux: + runs-on: ubuntu-latest + env: + NAME: x86simdsort + + steps: + - uses: actions/checkout@v4 + + - name: Define variables + run: | + echo "LIBNAME=${NAME}-c-api" >> $GITHUB_ENV + echo "FOLDER=${NAME}-${GITHUB_SHA::8}" >> $GITHUB_ENV + + - name: Install g++ + run: | + sudo apt-get update + sudo apt-get install -y g++ + + - name: Build (Linux) + run: | + cd make-c-api + make all -j2 + + - name: Run executable (Linux sanity check) + run: | + cd make-c-api + LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./ ./smoke.exe + + - name: Package (Linux) + run: | + mkdir -p artifacts/${NAME} + cp make-c-api/*.{so,h} artifacts/${NAME}/ + cd artifacts + tar -czvf ${NAME}-${GITHUB_SHA::8}.tgz ${NAME} + + - name: Upload artifacts (Linux) + uses: actions/upload-artifact@v4 + with: + name: c-api-linux-build + path: artifacts/*.tgz + + diff --git a/.github/workflows/c-api-win-msvc.yaml b/.github/workflows/c-api-win-msvc.yaml new file mode 100644 index 0000000..fec8925 --- /dev/null +++ b/.github/workflows/c-api-win-msvc.yaml @@ -0,0 +1,71 @@ +name: MSYS2 DLL + MSVC EXE + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + build-windows-mingw: + runs-on: windows-latest + env: + NAME: x86simdsort + + steps: + - name: Checkout sources + uses: actions/checkout@v4 + + - name: Define variables + run: | + echo "LIBNAME=${NAME}-c-api" >> $GITHUB_ENV + echo "FOLDER=${NAME}-${GITHUB_SHA::8}" >> $GITHUB_ENV + + # Build DLL with MSYS2 + - name: Install MSYS2 + uses: msys2/setup-msys2@v2 + with: + msystem: MINGW64 + install: >- + mingw-w64-x86_64-toolchain + mingw-w64-x86_64-clang + make + zip + + - name: Build DLL (MinGW/Clang) + shell: msys2 {0} + run: | + cd make-c-api + make all -j2 TARGET=DLL + + - name: Package (Windows) + shell: msys2 {0} + run: | + mkdir -p artifacts/${NAME} + cp make-c-api/*.{lib,dll,h} artifacts/${NAME}/ + cd artifacts + zip -r ${NAME}-${GITHUB_SHA::8}.zip ${NAME} + + # --- Build EXE test with MSVC --- + - name: Configure MSVC environment + uses: ilammy/msvc-dev-cmd@v1 + with: + arch: x64 # Or other architectures like x86, amd64_x86, etc. 
+ + - name: Build EXE with MSVC + run: | + cd make-c-api + cl /EHsc /O2 /arch:AVX2 /std:c++20 /GL /Gy /MD /nologo /DNDEBUG smoke.cpp x86simdsort-c-api.dll.lib /Fe:smoke.exe + + # --- Run test --- + - name: Run EXE + shell: cmd + run: | + cd make-c-api + .\smoke.exe + + - name: Upload artifacts (Windows) + uses: actions/upload-artifact@v4 + with: + name: c-api-windows-build + path: artifacts/*.zip diff --git a/.github/workflows/c-api-win.yaml b/.github/workflows/c-api-win.yaml new file mode 100644 index 0000000..d486b50 --- /dev/null +++ b/.github/workflows/c-api-win.yaml @@ -0,0 +1,45 @@ +name: Build C-API shared library with mingw-64-clang++ for Windows + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + build-windows-mingw: + runs-on: ubuntu-latest + env: + NAME: x86simdsort + + steps: + - uses: actions/checkout@v4 + + - name: Define variables + run: | + echo "LIBNAME=${NAME}-c-api" >> $GITHUB_ENV + echo "FOLDER=${NAME}-${GITHUB_SHA::8}" >> $GITHUB_ENV + + - name: Install MinGW-w64 + run: | + sudo apt-get update + sudo apt install -y mingw-w64 + sudo apt install -y clang + + - name: Build (Windows cross-compile DLL) + run: | + cd make-c-api + make all -j2 TARGET=DLL + + - name: Package (Windows) + run: | + mkdir -p artifacts/${NAME} + cp make-c-api/*.{lib,dll,h} artifacts/${NAME}/ + cd artifacts + zip -r ${NAME}-${GITHUB_SHA::8}.zip ${NAME} + + - name: Upload artifacts (Windows) + uses: actions/upload-artifact@v4 + with: + name: c-api-windows-build + path: artifacts/*.zip diff --git a/README.md b/README.md index 20a46f8..7b4baf4 100644 --- a/README.md +++ b/README.md @@ -109,6 +109,37 @@ xss_dep = xss.get_variable('x86simdsortcpp_dep') For more detailed instructions please refer to Meson [documentation](https://mesonbuild.com/Subprojects.html#using-a-subproject). +## Build with C-only API, targeting Windows and MSVC + +The folder `make-c-api` contains a `Makefile` to build a shared library exporting only C symbols. +This avoids potential conflicts from inconsistent name mangling and calling conventions across compilers. + +Only a reduced set of functions is exported (`qselect`, `qsort`, `partial_qsort`, `keyvalue_qsort`, `keyvalue_partial_qsort`, `keyvalue_qselect`), +and only for a reduced set of types (`int32`, `uint32`, `int64`, `uint64`, `float`, `double`). + +The `Makefile` supports the following combinations of operating systems, compilers and targets: + +``` ++--------------+------------------+------------+----------------------+---------------------+ +| Compilation | Compiler | Target | Files Generated | Build Command line | +| Platform | | Platform | | | ++--------------+------------------+------------+----------------------+---------------------+ +| LINUX | g++ | LINUX | libx86simdsort-c-api.so | make | ++--------------+------------------+------------+----------------------+---------------------+ +| CYGWIN | clang++ | CYGWIN | libx86simdsort-c-api.so | make | ++--------------+------------------+------------+----------------------+---------------------+ +| ANY | mingw32 clang++ | Windows | x86simdsort-c-api.dll | make TARGET=DLL | +| | | | x86simdsort-c-api.dll.lib | | ++--------------+------------------+------------+----------------------+---------------------+ +``` + +In all cases, exceptions and memory allocations never cross the shared library boundary. +In particular, if the target is a DLL, the library is self-contained, i.e. it does not depend on any other DLL. + +Compilation is tested with clang 20 and gcc 15.
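As an illustration, a minimal consumer of the generated C API could look like the following sketch (not part of the repository; it assumes only the generated `x86simdsort-c-api.h` header, the exported `c_xss_<function>_<type>` entry points and the `xss::` C++ dispatchers declared there):

```cpp
// Illustrative stand-alone consumer (a sketch, not shipped with the repository).
// Example Linux build: g++ -std=c++20 consumer.cpp -L. -lx86simdsort-c-api
#include <cstdint>
#include <cstdio>
#include <vector>

#include "x86simdsort-c-api.h"

int main()
{
    std::vector<float> a {3.5f, 1.25f, -7.0f, 2.0f};
    std::vector<int32_t> b {30, 10, -5, 20};

    // Plain C entry point: returns false if an exception was caught inside
    // the shared library, true otherwise.
    bool ok_f = c_xss_qsort_float(a.data(), a.size(), /*hasnan=*/false, /*descending=*/false);

    // C++ overloaded dispatcher from the same header (namespace xss).
    bool ok_i = xss::qsort(b.data(), b.size(), /*descending=*/true);

    std::printf("float qsort ok=%d, int32 qsort ok=%d\n", (int)ok_f, (int)ok_i);
    return (ok_f && ok_i) ? 0 : 1;
}
```

On Windows the same program would link against `x86simdsort-c-api.dll.lib` instead, as the `smoke.exe` build step in the MSVC workflow above does.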
+ +An include header `x86simdsort-c-api.h` and a `smoke.exe` executable test are also generated. + ## Example usage #### Sort an array of floats diff --git a/lib/c-api-def.h b/lib/c-api-def.h new file mode 100644 index 0000000..d8dca5f --- /dev/null +++ b/lib/c-api-def.h @@ -0,0 +1,29 @@ +#include "c-api-headers.h" + +LIBRARY x86simdsortcpp +EXPORTS + +#define XSS_XI1(n,t) XSS_C_EXP_NAME1(qsort,n) +#define XSS_XF1(n,t) XSS_C_EXP_NAME1(qsort,n) +#include "c-api-x-macro1.h" + +#define XSS_XI1(n,t) XSS_C_EXP_NAME1(qselect,n) +#define XSS_XF1(n,t) XSS_C_EXP_NAME1(qselect,n) +#include "c-api-x-macro1.h" + +#define XSS_XI1(n,t) XSS_C_EXP_NAME1(partial_qsort,n) +#define XSS_XF1(n,t) XSS_C_EXP_NAME1(partial_qsort,n) +#include "c-api-x-macro1.h" + + +#define XSS_XI2(n1,t1, n2,t2) XSS_C_EXP_NAME2(keyvalue_qsort, n1, n2) +#define XSS_XF2(n1,t1, n2,t2) XSS_C_EXP_NAME2(keyvalue_qsort, n1, n2) +#include "c-api-x-macro2.h" + +#define XSS_XI2(n1,t1, n2,t2) XSS_C_EXP_NAME2(keyvalue_qselect, n1, n2) +#define XSS_XF2(n1,t1, n2,t2) XSS_C_EXP_NAME2(keyvalue_qselect, n1, n2) +#include "c-api-x-macro2.h" + +#define XSS_XI2(n1,t1, n2,t2) XSS_C_EXP_NAME2(keyvalue_partial_qsort, n1, n2) +#define XSS_XF2(n1,t1, n2,t2) XSS_C_EXP_NAME2(keyvalue_partial_qsort, n1, n2) +#include "c-api-x-macro2.h" diff --git a/lib/c-api-export.h b/lib/c-api-export.h new file mode 100644 index 0000000..014cc96 --- /dev/null +++ b/lib/c-api-export.h @@ -0,0 +1,37 @@ +#include "c-api-headers.h" + +#ifdef XSS_EXPORTING +# if defined(__MINGW64__) +# define XSS_C_EXPORT __declspec(dllexport) +# else +# define XSS_C_EXPORT __attribute__((visibility("default"))) +# endif +# define XSS_C_BODY(body) { try { body; return true; } catch(...) { return false; } } +#else +# define XSS_C_EXPORT XSS_DLL_IMPORT +# define XSS_C_BODY(body) ; +#endif + +#define XSS_XI1(n,t) XSS_C_EXPORT XSS_QSORT_HEADER_INT(n,t) XSS_C_BODY(x86simdsort::qsort(ar, size, false, descending)) +#define XSS_XF1(n,t) XSS_C_EXPORT XSS_QSORT_HEADER_FLT(n,t) XSS_C_BODY(x86simdsort::qsort(ar, size, hasnan, descending)) +#include "c-api-x-macro1.h" + +#define XSS_XI1(n,t) XSS_C_EXPORT XSS_QSELECT_HEADER_INT(n,t) XSS_C_BODY(x86simdsort::qselect(ar, k, size, false, descending)) +#define XSS_XF1(n,t) XSS_C_EXPORT XSS_QSELECT_HEADER_FLT(n,t) XSS_C_BODY(x86simdsort::qselect(ar, k, size, hasnan, descending)) +#include "c-api-x-macro1.h" + +#define XSS_XI1(n,t) XSS_C_EXPORT XSS_QPSORT_HEADER_INT(n,t) XSS_C_BODY(x86simdsort::partial_qsort(ar, k, size, false, descending)) +#define XSS_XF1(n,t) XSS_C_EXPORT XSS_QPSORT_HEADER_FLT(n,t) XSS_C_BODY(x86simdsort::partial_qsort(ar, k, size, hasnan, descending)) +#include "c-api-x-macro1.h" + +#define XSS_XI2(n1,t1, n2,t2) XSS_C_EXPORT XSS_QKVSORT_HEADER_INT(n1,t1,n2,t2) XSS_C_BODY(x86simdsort::keyvalue_qsort(keys, vals, size, false, descending)) +#define XSS_XF2(n1,t1, n2,t2) XSS_C_EXPORT XSS_QKVSORT_HEADER_FLT(n1,t1,n2,t2) XSS_C_BODY(x86simdsort::keyvalue_qsort(keys, vals, size, hasnan, descending)) +#include "c-api-x-macro2.h" + +#define XSS_XI2(n1,t1, n2,t2) XSS_C_EXPORT XSS_QKVSEL_HEADER_INT(n1,t1,n2,t2) XSS_C_BODY(x86simdsort::keyvalue_select(keys, vals, k, size, false, descending)) +#define XSS_XF2(n1,t1, n2,t2) XSS_C_EXPORT XSS_QKVSEL_HEADER_FLT(n1,t1,n2,t2) XSS_C_BODY(x86simdsort::keyvalue_select(keys, vals, k, size, hasnan, descending)) +#include "c-api-x-macro2.h" + +#define XSS_XI2(n1,t1, n2,t2) XSS_C_EXPORT XSS_QKVPSORT_HEADER_INT(n1,t1,n2,t2) XSS_C_BODY(x86simdsort::keyvalue_partial_sort(keys, vals, k, size, false, descending)) +#define 
XSS_XF2(n1,t1, n2,t2) XSS_C_EXPORT XSS_QKVPSORT_HEADER_FLT(n1,t1,n2,t2) XSS_C_BODY(x86simdsort::keyvalue_partial_sort(keys, vals, k, size, hasnan, descending)) +#include "c-api-x-macro2.h" diff --git a/lib/c-api-header-gen.h b/lib/c-api-header-gen.h new file mode 100644 index 0000000..1a2c307 --- /dev/null +++ b/lib/c-api-header-gen.h @@ -0,0 +1,60 @@ +COMMENT This is an auto-generated file +COMMENT This header is intended to be used for shared libraries (so or dll) compiled to the shared_c_api target + +PRAGMA ifndef __X86_SIMD_SORT_C_API_H__ +PRAGMA define __X86_SIMD_SORT_C_API_H__ + +EMPTYLINE +PRAGMA include +EMPTYLINE + +PRAGMA define XSS_DLL_IMPORT + +#include "c-api-headers.h" + +EMPTYLINE +COMMENT DLL import declarations + +PRAGMA ifdef __cplusplus +extern "C" { +PRAGMA endif +#include "c-api-export.h" +PRAGMA ifdef __cplusplus +} COMMENT extern "C" +PRAGMA endif + +EMPTYLINE +PRAGMA ifdef __cplusplus +COMMENT C++ overloaded dispatchers +namespace xss +{ + +#define XSS_XI1(n,t) inline bool qsort(t* ar, uint64_t sz, bool desc) { return XSS_C_EXP_NAME1(qsort,n)( ar, sz, desc ); } +#define XSS_XF1(n,t) inline bool qsort(t* ar, uint64_t sz, bool hasnan, bool desc) { return XSS_C_EXP_NAME1(qsort,n)( ar, sz, hasnan, desc ); } +#include "c-api-x-macro1.h" + +#define XSS_XI1(n,t) inline bool qselect(t* ar, uint64_t k, uint64_t sz, bool desc) { return XSS_C_EXP_NAME1(qselect,n)( ar, k, sz, desc ); } +#define XSS_XF1(n,t) inline bool qselect(t* ar, uint64_t k, uint64_t sz, bool hasnan, bool desc) { return XSS_C_EXP_NAME1(qselect,n)( ar, k, sz, hasnan, desc ); } +#include "c-api-x-macro1.h" + +#define XSS_XI1(n,t) inline bool partial_qsort(t* ar, uint64_t k, uint64_t sz, bool desc) { return XSS_C_EXP_NAME1(partial_qsort,n)( ar, k, sz, desc ); } +#define XSS_XF1(n,t) inline bool partial_qsort(t* ar, uint64_t k, uint64_t sz, bool hasnan, bool desc) { return XSS_C_EXP_NAME1(partial_qsort,n)( ar, k, sz, hasnan, desc ); } +#include "c-api-x-macro1.h" + +#define XSS_XI2(n1,t1, n2,t2) inline bool keyvalue_qsort(t1* keys, t2* vals, uint64_t sz, bool desc) { return XSS_C_EXP_NAME2(keyvalue_qsort,n1, n2)( keys, vals, sz, desc ); } +#define XSS_XF2(n1,t1, n2,t2) inline bool keyvalue_qsort(t1* keys, t2* vals, uint64_t sz, bool hasnan, bool desc) { return XSS_C_EXP_NAME2(keyvalue_qsort,n1, n2)( keys, vals, sz, hasnan, desc ); } +#include "c-api-x-macro2.h" + +#define XSS_XI2(n1,t1, n2,t2) inline bool keyvalue_qselect(t1* keys, t2* vals, uint64_t k, uint64_t sz, bool desc) { return XSS_C_EXP_NAME2(keyvalue_qselect,n1, n2)( keys, vals, k, sz, desc ); } +#define XSS_XF2(n1,t1, n2,t2) inline bool keyvalue_qselect(t1* keys, t2* vals, uint64_t k, uint64_t sz, bool hasnan, bool desc) { return XSS_C_EXP_NAME2(keyvalue_qselect,n1, n2)( keys, vals, k, sz, hasnan, desc ); } +#include "c-api-x-macro2.h" + +#define XSS_XI2(n1,t1, n2,t2) inline bool keyvalue_partial_qsort(t1* keys, t2* vals, uint64_t k, uint64_t sz, bool desc) { return XSS_C_EXP_NAME2(keyvalue_partial_qsort,n1, n2)( keys, vals, k, sz, desc ); } +#define XSS_XF2(n1,t1, n2,t2) inline bool keyvalue_partial_qsort(t1* keys, t2* vals, uint64_t k, uint64_t sz, bool hasnan, bool desc) { return XSS_C_EXP_NAME2(keyvalue_partial_qsort,n1, n2)( keys, vals, k, sz, hasnan, desc ); } +#include "c-api-x-macro2.h" + +} COMMENT namespace xss + +PRAGMA endif COMMENT ifdef __cplusplus +EMPTYLINE +PRAGMA endif COMMENT ifndef __X86_SIMD_SORT_C_API_H__ diff --git a/lib/c-api-headers.h b/lib/c-api-headers.h new file mode 100644 index 0000000..6ad7af2 --- /dev/null +++ 
b/lib/c-api-headers.h @@ -0,0 +1,25 @@ +#pragma once + +#define XSS_C_EXP_NAME1(name, ty) c_xss_##name##_##ty +#define XSS_C_EXP_NAME2(name, ty1, ty2) c_xss_##name##_##ty1##_##ty2 + +#define XSS_QSORT_HEADER_INT(n,t) bool XSS_C_EXP_NAME1(qsort, n)(t *ar, uint64_t size, bool descending) +#define XSS_QSORT_HEADER_FLT(n,t) bool XSS_C_EXP_NAME1(qsort, n)(t *ar, uint64_t size, bool hasnan, bool descending) + +#define XSS_QSELECT_HEADER_INT(n,t) bool XSS_C_EXP_NAME1(qselect, n)(t *ar, uint64_t k, uint64_t size, bool descending) +#define XSS_QSELECT_HEADER_FLT(n,t) bool XSS_C_EXP_NAME1(qselect, n)(t *ar, uint64_t k, uint64_t size, bool hasnan, bool descending) + +#define XSS_QPSORT_HEADER_INT(n,t) bool XSS_C_EXP_NAME1(partial_qsort, n)(t *ar, uint64_t k, uint64_t size, bool descending) +#define XSS_QPSORT_HEADER_FLT(n,t) bool XSS_C_EXP_NAME1(partial_qsort, n)(t *ar, uint64_t k, uint64_t size, bool hasnan, bool descending) + +#define XSS_QKVSORT_HEADER_INT(n1,t1,n2,t2) bool XSS_C_EXP_NAME2(keyvalue_qsort, n1, n2)(t1 *keys, t2* vals, uint64_t size, bool descending) +#define XSS_QKVSORT_HEADER_FLT(n1,t1,n2,t2) bool XSS_C_EXP_NAME2(keyvalue_qsort, n1, n2)(t1 *keys, t2* vals, uint64_t size, bool hasnan, bool descending) + +#define XSS_QKVSEL_HEADER_INT(n1,t1,n2,t2) bool XSS_C_EXP_NAME2(keyvalue_qselect, n1, n2)(t1 *keys, t2* vals, uint64_t k, uint64_t size, bool descending) +#define XSS_QKVSEL_HEADER_FLT(n1,t1,n2,t2) bool XSS_C_EXP_NAME2(keyvalue_qselect, n1, n2)(t1 *keys, t2* vals, uint64_t k, uint64_t size, bool hasnan, bool descending) + +#define XSS_QKVPSORT_HEADER_INT(n1,t1,n2,t2) bool XSS_C_EXP_NAME2(keyvalue_partial_qsort, n1, n2)(t1 *keys, t2* vals, uint64_t k, uint64_t size, bool descending) +#define XSS_QKVPSORT_HEADER_FLT(n1,t1,n2,t2) bool XSS_C_EXP_NAME2(keyvalue_partial_qsort, n1, n2)(t1 *keys, t2* vals, uint64_t k, uint64_t size, bool hasnan, bool descending) + + + diff --git a/lib/c-api-ver.h b/lib/c-api-ver.h new file mode 100644 index 0000000..b87b5a1 --- /dev/null +++ b/lib/c-api-ver.h @@ -0,0 +1,33 @@ +#include "c-api-headers.h" + +{ + global: + +#define XSS_XI1(n,t) XSS_C_EXP_NAME1(qsort,n); +#define XSS_XF1(n,t) XSS_C_EXP_NAME1(qsort,n); +#include "c-api-x-macro1.h" + +#define XSS_XI1(n,t) XSS_C_EXP_NAME1(qselect,n); +#define XSS_XF1(n,t) XSS_C_EXP_NAME1(qselect,n); +#include "c-api-x-macro1.h" + +#define XSS_XI1(n,t) XSS_C_EXP_NAME1(partial_qsort,n); +#define XSS_XF1(n,t) XSS_C_EXP_NAME1(partial_qsort,n); +#include "c-api-x-macro1.h" + + +#define XSS_XI2(n1,t1, n2,t2) XSS_C_EXP_NAME2(keyvalue_qsort, n1, n2); +#define XSS_XF2(n1,t1, n2,t2) XSS_C_EXP_NAME2(keyvalue_qsort, n1, n2); +#include "c-api-x-macro2.h" + +#define XSS_XI2(n1,t1, n2,t2) XSS_C_EXP_NAME2(keyvalue_qselect, n1, n2); +#define XSS_XF2(n1,t1, n2,t2) XSS_C_EXP_NAME2(keyvalue_qselect, n1, n2); +#include "c-api-x-macro2.h" + +#define XSS_XI2(n1,t1, n2,t2) XSS_C_EXP_NAME2(keyvalue_partial_qsort, n1, n2); +#define XSS_XF2(n1,t1, n2,t2) XSS_C_EXP_NAME2(keyvalue_partial_qsort, n1, n2); +#include "c-api-x-macro2.h" + + local: + *; +}; diff --git a/lib/c-api-x-macro1.h b/lib/c-api-x-macro1.h new file mode 100644 index 0000000..ccc2d80 --- /dev/null +++ b/lib/c-api-x-macro1.h @@ -0,0 +1,18 @@ + +#ifndef XSS_XI1 +# error XSS_XI1 must be defined +#endif + +#ifndef XSS_XF1 +# error XSS_XF1 must be defined +#endif + +XSS_XI1(uint32, uint32_t) +XSS_XI1(uint64, uint64_t) +XSS_XI1(int64, int64_t) +XSS_XI1(int32, int32_t) +XSS_XF1(float, float) +XSS_XF1(double, double) + +#undef XSS_XI1 +#undef XSS_XF1 diff --git 
a/lib/c-api-x-macro2.h b/lib/c-api-x-macro2.h new file mode 100644 index 0000000..ad10608 --- /dev/null +++ b/lib/c-api-x-macro2.h @@ -0,0 +1,37 @@ +#ifndef XSS_XI2 +# error XSS_XI3 must be defined +#endif + +#ifndef XSS_XF2 +# error XSS_XF2 must be defined +#endif + +#define XSS_XI1(n,t) XSS_XI2(int32, int32_t, n, t) +#define XSS_XF1(n,t) XSS_XI1(n, t) +#include "c-api-x-macro1.h" + +#define XSS_XI1(n,t) XSS_XI2(uint32, uint32_t, n, t) +#define XSS_XF1(n,t) XSS_XI1(n, t) +#include "c-api-x-macro1.h" + +#define XSS_XI1(n,t) XSS_XI2(int64, int64_t, n, t) +#define XSS_XF1(n,t) XSS_XI1(n, t) +#include "c-api-x-macro1.h" + +#define XSS_XI1(n,t) XSS_XI2(uint64, uint64_t, n, t) +#define XSS_XF1(n,t) XSS_XI1(n, t) +#include "c-api-x-macro1.h" + +#define XSS_XI1(n,t) XSS_XF2(float, float, n, t) +#define XSS_XF1(n,t) XSS_XI1(n, t) +#include "c-api-x-macro1.h" + +#define XSS_XI1(n,t) XSS_XF2(double, double, n, t) +#define XSS_XF1(n,t) XSS_XI1(n, t) +#include "c-api-x-macro1.h" + +#undef XSS_XI2 +#undef XSS_XF2 + + + diff --git a/lib/c-api.sed b/lib/c-api.sed new file mode 100644 index 0000000..e5e06ce --- /dev/null +++ b/lib/c-api.sed @@ -0,0 +1,7 @@ +#!/bin/sh + +s|PRAGMA \(.*\)|#\1|g +s|COMMENT\(.*\)|//\1|g +s/EMPTYLINE//g +s/^XSS_DLL_IMPORT/ XSS_DLL_IMPORT/g +s/^inline/ inline/g diff --git a/lib/x86simdsort.cpp b/lib/x86simdsort.cpp index 8ef9aad..b835b6b 100644 --- a/lib/x86simdsort.cpp +++ b/lib/x86simdsort.cpp @@ -296,6 +296,9 @@ DISPATCH_KEYVALUE_SORT_FORTYPE(float) // extern "C" { + +#ifndef C_API_ONLY + XSS_EXPORT_SYMBOL void keyvalue_qsort_float_uint32(float *key, uint32_t *val, size_t size) { @@ -336,4 +339,12 @@ void keyvalue_qsort_uint32_uint64(uint32_t *key, uint64_t *val, size_t size) { x86simdsort::keyvalue_qsort(key, val, size, true); } -} + +#else // C_API_ONLY is defined + +#define XSS_EXPORTING +#include "c-api-export.h" + +#endif // C_API_ONLY + +} // extern "C" diff --git a/lib/x86simdsort.h b/lib/x86simdsort.h index 2e47b6a..7f5e687 100644 --- a/lib/x86simdsort.h +++ b/lib/x86simdsort.h @@ -6,7 +6,11 @@ #include #include -#define XSS_EXPORT_SYMBOL __attribute__((visibility("default"))) +#ifndef C_API_ONLY +# define XSS_EXPORT_SYMBOL __attribute__((visibility("default"))) +#else +# define XSS_EXPORT_SYMBOL __attribute__((visibility("hidden"))) +#endif #define XSS_HIDE_SYMBOL __attribute__((visibility("hidden"))) #define UNUSED(x) (void)(x) diff --git a/make-c-api/.gitignore b/make-c-api/.gitignore new file mode 100644 index 0000000..dd39c0b --- /dev/null +++ b/make-c-api/.gitignore @@ -0,0 +1,5 @@ +smoke.sln +smoke.vcxproj.* +smoke/ +x64/ +x86simdsort-c-api. 
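The C-API headers above (`c-api-def.h`, `c-api-ver.h`, `c-api-export.h`, `c-api-header-gen.h`) all reuse the same X-macro lists from `c-api-x-macro1.h` and `c-api-x-macro2.h`: each client defines `XSS_XI1`/`XSS_XF1` (or the two-type variants `XSS_XI2`/`XSS_XF2`), includes the list header, and the list expands the macro once per supported type before `#undef`-ing it. The self-contained sketch below mimics that pattern with hypothetical `DEMO_*` names; it is not code from the repository.

```cpp
// Stand-alone illustration of the include-style X-macro pattern (hypothetical
// DEMO_* names). In the repository the (name, type) list lives in
// c-api-x-macro1.h and is pulled in with #include after defining XSS_XI1/XSS_XF1.
#include <cstdint>
#include <cstdio>

// Step 1: the client defines what to emit for each (name, type) pair,
// here a C-style prototype similar to the exported c_xss_* functions.
#define DEMO_X(name, type) bool demo_qsort_##name(type *ar, uint64_t size, bool descending);

// Step 2: the list expands the macro once per supported type and then removes
// it so the next client can redefine it (this block plays the role of
// #include "c-api-x-macro1.h").
DEMO_X(int32, int32_t)
DEMO_X(uint32, uint32_t)
DEMO_X(float, float)
#undef DEMO_X

int main()
{
    // demo_qsort_int32, demo_qsort_uint32 and demo_qsort_float are now declared;
    // the real headers repeat the same expansion to emit export directives,
    // version-script entries and the public prototypes from a single list.
    std::puts("X-macro expansion generated the demo_qsort_* prototypes");
    return 0;
}
```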
diff --git a/make-c-api/Makefile b/make-c-api/Makefile new file mode 100644 index 0000000..8f83bd1 --- /dev/null +++ b/make-c-api/Makefile @@ -0,0 +1,155 @@ +# Arguments +# TARGET: SO | DLL (default: SO) +# DEBUG: 0 | 1 (default: 0) + +TARGET ?= LINUX + +LDFLAGS := +CXXFLAGS := -Wall -std=c++20 -fdiagnostics-color=always + +# Debug/release selection +ifeq ($(DEBUG),1) + CXXFLAGS += -g -O0 -DDEBUG -D_GLIBCXX_ASSERTIONS=1 -D_GLIBCXX_DEBUG +else + CXXFLAGS += -O3 +endif + +CXXLIBFLAGS := -I../lib -I../src -I../utils -fvisibility=hidden -fvisibility-inlines-hidden + +NAME := x86simdsort + +UNAME := $(shell uname) +$(info UNAME: $(UNAME)) + +# Detect host system +ifneq (,$(filter CYGWIN%,$(UNAME))) + HOST := CYGWIN +else ifneq (,$(filter MINGW64%,$(UNAME))) + HOST := MINGW +else ifneq (,$(filter Linux%,$(UNAME))) + HOST := LINUX +else + $(error Unknown host system: $(UNAME)) +endif +$(info Host: $(HOST)) + +ifneq (,$(findstring DLL,$(TARGET))) + SHARED_EXT := dll + LIBNAME := $(NAME)-c-api + DLLLINK := $(LIBNAME).$(SHARED_EXT).lib + + # We prefer CLANG, because there ia a bug in gcc when the target is the Windows ABI + # A function call passing by value AVX registers segfaults + # https://github.com/mingw-w64/mingw-w64/issues/115 + CXX = clang++ + LDFLAGS += -shared -Wl,--out-implib,$(DLLLINK) + + ifeq ($(TARGET),DLL) + $(info Building a DLL with MinGW) + CXX += --target=x86_64-w64-mingw32 + STATIC = -static-libstdc++ -static-libgcc + ifeq ($(HOST),CYGWIN) + LDFLAGS += -static -pthread + else ifeq ($(HOST),MINGW) + LDFLAGS += -static -lpthread + else ifeq ($(HOST),LINUX) + LDFLAGS += -static -pthread + else + $(error DLL cross compilation supported on CYGWIN (with x86_64-w64-mingw32-clang++), MINGW64 (with clang++ and mingw) and Linux (with clang++ and mingw)) + endif + else + $(error Invalid target for DLL compilation: $(TARGET)) + endif +else + ifeq ($(HOST),CYGWIN) + # We prefer CLANG, because there ia a bug in gcc when the target is the Windows ABI + # A function call passing by value AVX registers segfaults + # https://github.com/mingw-w64/mingw-w64/issues/115 + CXX := clang++ + else ifeq ($(HOST),LINUX) + CXX := g++ + else + $(error Invalid host system for SO compilation: $(HOST)) + endif + SHARED_EXT := so + LIBNAME := lib$(NAME)-c-api + CXXLIBFLAGS += -fPIC + VERDEP := $(LIBNAME).ver + LDFLAGS += -Wl,--as-needed -Wl,--no-undefined -Wl,-O1 -shared -fPIC \ + -Wl,-soname,$(LIBNAME).so -Wl,--version-script=$(LIBNAME).ver + SONAME_FLAG := -Wl,-soname,$(LIBNAME).so + DLLLINK := $(LIBNAME).$(SHARED_EXT) +endif + +$(info CXX: $(CXX)) +$(info Compiler: $(shell $(CXX) --version | head -n1)) + +# Check if configuration changed, if so, clean +CONFIGFILE := config.cfg +CONFIG := $(CXX)-$(TARGET)-$(if $(DEBUG),DEBUG,RELEASE) +$(info CONFIG: $(CONFIG)) +# Configuration file generation +$(shell echo "$(CONFIG)" | cmp -s - $(CONFIGFILE) 2>/dev/null || echo "$(CONFIG)" > $(CONFIGFILE)) + + +# Source and header files +SRC_HEADERS := $(wildcard ../src/*.h) $(wildcard ../src/*.hpp) +UTIL_HEADERS := $(wildcard ../utils/*.h) $(wildcard ../utils/*.hpp) +LIB_HEADERS := $(wildcard ../lib/*.h) $(wildcard ../lib/*.hpp) +ALL_HEADERS := $(UTIL_HEADERS) $(SRC_HEADERS) $(LIB_HEADERS) + +LIB_SOURCES := $(wildcard ../lib/$(NAME)-*.cpp) +LIB_OBJECTS := $(LIB_SOURCES:../lib/$(NAME)-%.cpp=$(NAME)-%.cpp.o) + +EXTRADEPS := Makefile $(CONFIGFILE) + +.PHONY: clean default all lib api smoke + +default : lib api + +lib : $(LIBNAME).$(SHARED_EXT) +api : $(NAME)-c-api.h +smoke : smoke.exe + +all: lib smoke + +# Specializations of 
object file compilation for various architectures +$(NAME).cpp.o: CXXFLAGS += $(CXXLIBFLAGS) -DC_API_ONLY +$(NAME)-avx2.cpp.o: CXXFLAGS += $(CXXLIBFLAGS) -march=haswell +$(NAME)-skx.cpp.o: CXXFLAGS += $(CXXLIBFLAGS) -march=skylake-avx512 +$(NAME)-icl.cpp.o: CXXFLAGS += $(CXXLIBFLAGS) -march=icelake-client +$(NAME)-spr.cpp.o: CXXFLAGS += $(CXXLIBFLAGS) -march=sapphirerapids +smoke.cpp.o: CXXFLAGS += -I. + +ifdef BOOST_ROOT +smoke.cpp.o: CXXFLAGS += -I$(BOOST_ROOT) +endif + +# add a dependency to smoke +smoke.cpp.o : $(NAME)-c-api.h + +VPATH=../lib:. + +# Object files (note that CXXFLAGS set above is customized for each target) +%.cpp.o: %.cpp $(ALL_HEADERS) $(EXTRADEPS) + $(CXX) $(CXXFLAGS) -o $@ -c $< + +# smoke link +smoke.exe: smoke.cpp.o $(LIBNAME).$(SHARED_EXT) $(NAME)-c-api.h $(EXTRADEPS) + $(CXX) $< -o $@ $(DLLLINK) + +# Auto generated header +$(NAME)-c-api.h: ../lib/c-api-header-gen.h ../lib/c-api.sed $(EXTRADEPS) + cpp -P $< | sed -f ../lib/c-api.sed > $@ + +# SO shared library export file +$(LIBNAME).ver: ../lib/c-api-ver.h $(CONFIGFILE) + cpp -P $< | sed "/^$$/d; s/^c_xss/ c_xss/" > $@ + +# Shared library (SO or DLL) link +$(LIBNAME).$(SHARED_EXT): $(NAME).cpp.o $(LIB_OBJECTS) $(VERDEP) $(EXTRADEPS) + $(CXX) -o $@ $(NAME).cpp.o $(LIB_OBJECTS) $(LDFLAGS) + + +clean: + rm -rf *.o *.so *.dll *.lib *.h *.ver *.exe *.cfg diff --git a/make-c-api/smoke.cpp b/make-c-api/smoke.cpp new file mode 100644 index 0000000..831b6de --- /dev/null +++ b/make-c-api/smoke.cpp @@ -0,0 +1,627 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "x86simdsort-c-api.h" + +// if you want to test boost pdq_sort, you need to have boost installed and provide the include folder path to the compiler +#if __has_include() +#define TEST_BOOST +#include +constexpr bool hasBoost = true; +#else +constexpr bool hasBoost = false; +#endif + +template +void printVec(const std::vector& v) +{ + for (const auto& elem : v) + std::cout << elem << " "; + std::cout << std::endl; +} + +// name traits +template struct Name; +template <> struct Name { static constexpr const char* value = "int32_t"; }; +template <> struct Name { static constexpr const char* value = "uint32_t"; }; +template <> struct Name { static constexpr const char* value = "int64_t"; }; +template <> struct Name { static constexpr const char* value = "uint64_t"; }; +template <> struct Name { static constexpr const char* value = "float"; }; +template <> struct Name { static constexpr const char* value = "double"; }; + +enum SortAlgo { STL, PDQ, SIMD, ALGO_COUNT }; +const char *algoNames[ALGO_COUNT] = {"stl", "pdq", "simd"}; + +enum SortFun { SORT, NTH, PARTIAL, KVSORT, KVNTH, KVPARTIAL, FUN_COUNT }; +const char *funNames[FUN_COUNT] = {"sort", "nth", "partial", "kvsort", "kvnth", "kvpartial"}; + +template +using Algo = std::integral_constant; + +template +struct AlgoFun {}; + +template +std::string sortMsg(SortFun f, SortAlgo a, bool desc) +{ + std::ostringstream os; + os << funNames[f] << '-' << algoNames[a] << ", Descend: " << desc + << ", type: " << Name::value; + return os.str(); +} + +template +std::string violation(const std::vector& X, size_t i, size_t j, bool desc) +{ + std::ostringstream os; + auto show = [&](auto k) { + return (std::ostringstream{} << "X" << "[" << k << "](=" << X[k] << ")").str(); + }; + os << "\nordering violation: " << show(i) << (desc ? 
" >= " : " <= ") << show(j); + return os.str(); +} + +template +std::string violation(const std::vector &X, const std::vector &J, size_t i, size_t j, bool desc) +{ + std::ostringstream os; + auto show = [&](auto k) { + return (std::ostringstream{} << "X" << "[J[" << k << "](= " << J[k] << ")]=(" << X[J[k]] << ")").str(); + }; + os << "\nordering violation: " << show(i) << (desc ? " >= " : " <= ") + << show(j); + return os.str(); +} + +template +class LocalCopy +{ + std::vector m_ws; + T *m_data; + +public: + LocalCopy(const std::vector &src, T *ws) + : m_data(nullptr) + { + if (!ws) { + m_ws.assign(src.begin(), src.end()); + m_data = m_ws.data(); + } + else { + m_data = ws; + std::copy(src.begin(), src.end(), m_data); + } + } + + T *data() { return m_data; } +}; + + +template +struct Sorter; + +template +struct is_sorter_defined : std::false_type {}; + +// SFINAE-friendly detection if a specialization of Sorter is defined +template +struct is_sorter_defined, std::void_t::sort)>> : std::true_type {}; + +template +inline constexpr bool is_sorter_defined_v = is_sorter_defined>::value; + +template +struct Sorter> +{ + static void check(const std::vector& X) + { + auto it = std::adjacent_find(X.begin(), X.end(), [](auto a, auto b) { + return Descend ? a < b : a > b; + }); + if (it != X.end()) { + auto i = std::distance(X.begin(), it); + std::cout << sortMsg(SORT, A, Descend) + << violation(X, i, i + 1, Descend) + << std::endl; + //printVec(X); + std::exit(-1); + } + } + + static void sort(std::vector& X) + { + const size_t n = X.size(); + + if constexpr (A == STL) { + std::sort(X.begin(), X.end(), [](auto a, auto b) { + return Descend ? a > b : a < b; + }); + } +#ifdef TEST_BOOST + else if constexpr (hasBoost && A == PDQ) { + boost::sort::pdqsort_branchless(X.begin(), X.end(), [](auto a, auto b) { + return Descend ? a > b : a < b; + }); + } +#endif + else if constexpr (A == SIMD) { + if constexpr (std::is_floating_point_v) + xss::qsort(X.data(), n, false, Descend); + else + xss::qsort(X.data(), n, Descend); + } + else { + static_assert(false); + } + } +}; + + +template +struct Sorter> +{ + static void check(const std::vector &X, size_t k) + { + auto pivot = X[k]; + size_t i = 0, j = 0; + bool error = false; + + auto itLeft = std::find_if(X.begin(), X.begin() + k, [pivot](auto a) { + return Descend ? a < pivot : a > pivot; + }); + if (itLeft != X.begin() + k) { + error = true; + i = std::distance(X.begin(), itLeft); + j = k; + } + else { + auto itRight + = std::find_if(X.begin() + k + 1, X.end(), [pivot](auto a) { + return Descend ? a > pivot : a < pivot; + }); + if (itRight != X.end()) { + error = true; + i = k; + j = std::distance(X.begin(), itRight); + } + } + + if (error) { + std::cout << sortMsg(NTH, A, Descend) + << violation(X, i, j, Descend) + << std::endl; + //printVec(X); + std::exit(-1); + } + } + + static void sort(std::vector &X, size_t k) + { + const size_t n = X.size(); + + if constexpr (A == STL) { + std::nth_element(X.begin(), X.begin() + k, X.end(), [](auto a, auto b) { + return Descend ? a > b : a < b; + }); + } + else if constexpr (A == SIMD) { + if constexpr (std::is_floating_point_v) + xss::qselect(X.data(), k, n, false, Descend); + else + xss::qselect(X.data(), k, n, Descend); + } + else { + static_assert(false); + } + } +}; + +template +struct Sorter> { + static void check(const std::vector &X, size_t k) + { + size_t i = 0, j = 0; + bool error = false; + + auto itLeft = std::adjacent_find(X.begin(), X.begin() + k, [](auto a, auto b) { + return Descend ? 
a < b : a > b; + }); + if (itLeft != X.begin() + k) { + error = true; + i = std::distance(X.begin(), itLeft); + j = i + 1; + } + else { + auto pivot = X[k - 1]; + auto itRight = std::find_if(X.begin() + k, X.end(), [pivot](auto a) { + return Descend ? a > pivot : a < pivot; + }); + if (itRight != X.end()) { + error = true; + i = k - 1; + j = std::distance(X.begin(), itRight); + } + } + + if (error) { + std::cout << sortMsg(PARTIAL, A, Descend) + << violation(X, i, j, Descend) << std::endl; + //printVec(X); + std::exit(-1); + } + } + + static void sort(std::vector &X, size_t k) + { + const size_t n = X.size(); + + if constexpr (A == STL) { + std::partial_sort( + X.begin(), X.begin() + k, X.end(), [](auto a, auto b) { + return Descend ? a > b : a < b; + }); + } + else if constexpr (A == SIMD) { + if constexpr (std::is_floating_point_v) + xss::partial_qsort(X.data(), k, n, false, Descend); + else + xss::partial_qsort(X.data(), k, n, Descend); + } + else { + static_assert(false); + } + } +}; + +template +struct Sorter> +{ + static void check(const std::vector& X, const std::vector& J, T *) + { + auto it = std::adjacent_find(J.begin(), J.end(), [&X](auto i, auto j) { + return Descend ? X[i] < X[j] : X[i] > X[j]; + }); + if (it != J.end()) { + auto i = std::distance(J.begin(), it); + std::cout << sortMsg(KVSORT, A, Descend) + << violation(X, J, i, i + 1, Descend) + << std::endl; + //printVec(X); + //printVec(J); + std::exit(-1); + } + } + + static void sort(const std::vector &X, std::vector& J, T *ws) + { + std::iota(J.begin(), J.end(), 0); + + if constexpr (A == STL) { + std::sort(J.begin(), J.end(), [&X](auto i, auto j) { + return Descend ? X[i] > X[j] : X[i] < X[j]; + }); + } +#ifdef TEST_BOOST + else if constexpr (hasBoost && A == PDQ) { + boost::sort::pdqsort(J.begin(), J.end(), [&X](auto i, auto j) { + return Descend ? X[i] > X[j] : X[i] < X[j]; + }); + } +#endif + else if constexpr (A == SIMD) { + const size_t n = X.size(); + + LocalCopy keys(X, ws); + + if constexpr (std::is_floating_point_v) + xss::keyvalue_qsort(keys.data(), J.data(), n, false, false); + else + xss::keyvalue_qsort(keys.data(), J.data(), n, false); + + if (Descend) + std::ranges::reverse(J); + } + else { + static_assert(false); + } + } +}; + +template +struct Sorter> { + static void check(const std::vector &X, + const std::vector &J, + size_t k, + T *) + { + bool error = false; + size_t i = 0, j = 0; + auto pivot = X[J[k]]; + + auto itLeft = std::find_if(J.begin(), J.begin() + k, [&](auto j) { + return Descend ? X[j] < pivot : X[j] > pivot; + }); + if (itLeft != J.begin() + k) { + error = true; + i = std::distance(J.begin(), itLeft); + j = k; + } + else { + auto itRight + = std::find_if(J.begin() + k + 1, J.end(), [&](auto j) { + return Descend ? X[j] > pivot : X[j] < pivot; + }); + if (itRight != J.end()) { + error = true; + i = k; + j = std::distance(J.begin(), itRight); + } + } + + if (error) { + std::cout << sortMsg(KVNTH, A, Descend) + << violation(X, J, i, j, Descend) + << std::endl; + //printVec(X); + std::exit(-1); + } + } + + static void sort(std::vector &X, std::vector &J, size_t k, T* ws) + { + std::iota(J.begin(), J.end(), 0); + + if constexpr (A == STL) { + std::nth_element( + X.begin(), X.begin() + k, X.end(), [](auto a, auto b) { + return Descend ? 
a > b : a < b; + }); + } + else if constexpr (A == SIMD) { + const size_t n = X.size(); + + LocalCopy keys(X, ws); + + if constexpr (std::is_floating_point_v) + xss::keyvalue_qselect(keys.data(), J.data(), k, n, false, Descend); + else + xss::keyvalue_qselect(keys.data(), J.data(), k, n, Descend); + } + else { + static_assert(false); + } + } +}; + +template +struct Sorter> { + static void check(const std::vector &X, + const std::vector &J, + size_t k, + T *) + { + bool error = false; + size_t i = 0, j = 0; + + auto itLeft = std::adjacent_find(J.begin(), J.begin() + k, [&X](auto j, auto p) { + return Descend ? X[j] < X[p] : X[j] > X[p]; + }); + if (itLeft != J.begin() + k) { + error = true; + i = std::distance(J.begin(), itLeft); + j = i + 1; + } + else { + auto pivot = X[J[k - 1]]; + auto itRight + = std::find_if(J.begin() + k, J.end(), [&](auto j) { + return Descend ? X[j] > pivot : X[j] < pivot; + }); + if (itRight != J.end()) { + error = true; + i = k - 1; + j = std::distance(J.begin(), itRight); + } + } + + if (error) { + std::cout << sortMsg(KVPARTIAL, A, Descend) + << violation(X, J, i, j, Descend) << std::endl; + //printVec(X); + std::exit(-1); + } + } + + static void sort(std::vector &X, std::vector &J, size_t k, T *ws) + { + std::iota(J.begin(), J.end(), 0); + + if constexpr (A == STL) { + std::partial_sort(X.begin(), X.begin() + k, X.end(), [](auto a, auto b) { + return Descend ? a > b : a < b; + }); + } + else if constexpr (A == SIMD) { + const size_t n = X.size(); + + LocalCopy keys(X, ws); + + if constexpr (std::is_floating_point_v) + xss::keyvalue_partial_qsort(keys.data(), J.data(), k, n, false, Descend); + else + xss::keyvalue_partial_qsort(keys.data(), J.data(), k, n, Descend); + } + else { + static_assert(false); + } + } +}; + +// compare performance of sort_simd vs sort_classic for keys of type T +// use vectors of size 4, 8, 16, 32, ... +// report results in tabular format +template +void benchmark() +{ + using namespace std::chrono; + + struct Key { + SortFun f; + SortAlgo a; + bool ws; + size_t n; + bool operator<(const Key& o) const { + return std::tie(f,a,ws,n) < std::tie(o.f,o.a,o.ws,o.n); + } + }; + + std::map times; + + auto runTest = [&](AlgoFun&&, std::vector& x, Ts&&...args) { + if constexpr (is_sorter_defined_v) { + auto tStart = high_resolution_clock::now(); + Sorter::sort(x, std::forward(args)...); + auto tEnd = high_resolution_clock::now(); + auto [it, _] = times.insert({{F, A, UseWS, x.size()}, 0.0}); + it->second += duration_cast(tEnd - tStart).count(); + Sorter::check(x, std::forward(args)...); + } + }; + + std::cout << "TESTING: type: " << Name::value + << ", Order: " << (Descend ? 
"descending" : "ascending") << "\n"; \ + + constexpr size_t max_exp = 20; + for (size_t _n = 4; _n <= 1<<20; _n *= 2) { + size_t m = std::min(std::max((1 << max_exp) / _n, 2), 200); + for (auto n : {_n, 3*_n/2}) { + + std::vector X(n), Xcopy(n); + std::vector J(n); + std::vector ws(n); + + auto sortTest = [&](Algo&&) { + std::ranges::copy(X, Xcopy.begin()); + runTest(AlgoFun {}, Xcopy); + }; + + auto nthTest = [&](Algo &&) { + std::ranges::copy(X, Xcopy.begin()); + runTest(AlgoFun {}, Xcopy, n/2); + }; + + auto partialTest = [&](Algo &&) { + std::ranges::copy(X, Xcopy.begin()); + runTest(AlgoFun {}, Xcopy, n / 2); + }; + + auto kvTest = [&](Algo&&) { + runTest(AlgoFun {}, X, J, nullptr); + if constexpr (A == SIMD) + runTest(AlgoFun {}, X, J, ws.data()); + }; + + auto kvnthTest = [&](Algo &&) { + runTest(AlgoFun {}, X, J, n/2, nullptr); + if constexpr (A == SIMD) + runTest(AlgoFun {}, X, J, n/2, ws.data()); + }; + + auto kvpartialTest = [&](Algo &&) { + runTest(AlgoFun {}, X, J, n / 2, nullptr); + if constexpr (A == SIMD) + runTest(AlgoFun {}, X, J, n / 2, ws.data()); + }; + + auto allAlgoTests = [&](std::index_sequence&&) { + (sortTest(Algo<(SortAlgo)As>{}), ...); + (nthTest(Algo<(SortAlgo)As> {}), ...); + (partialTest(Algo<(SortAlgo)As> {}), ...); + (kvTest(Algo<(SortAlgo)As>{}), ...); + (kvnthTest(Algo<(SortAlgo)As> {}), ...); + (kvpartialTest(Algo<(SortAlgo)As> {}), ...); + }; + + for (size_t rep = 0; rep < m; ++rep) { + // Fill with random data for each repetition + for (auto &v : X) + v = static_cast(std::rand()); + + allAlgoTests(std::make_index_sequence{}); + } + + // take the average time over m repetitions + for (auto &[k, t] : times) + if (k.n == n) + t /= m; + } + } + + // print timing results in tabular format + + // extracts unique labels and sizes from timing results + std::set> labels; + std::set sizes; + for (auto [f, a, ws, n] : times | std::views::keys) { + labels.insert({f, a, ws}); + sizes.insert(n); + } + + // print labels + const int colSpacing = 18; + std::cout << std::setw(8) << "Size"; + for (auto [f, a, ws] : labels) { + std::ostringstream label; + label << funNames[f] << '-' << algoNames[a] << (ws ? 
"-WS" : ""); + std::cout << std::setw(colSpacing) << label.str(); + } + std::cout << std::endl; + + // print times + for (auto n : sizes) { + std::cout << std::setw(8) << n; + for (auto [f, a, ws] : labels) { + auto it = times.find({f, a, ws, n}); + if (it != times.end()) + std::cout << std::setw(colSpacing) << std::fixed << std::setprecision(2) << it->second; + else + std::cout << std::setw(colSpacing) << "N/A"; + } + std::cout << std::endl; + } +} + +template +void benchmarks() +{ + benchmark(); + benchmark(); + benchmark(); + benchmark(); + benchmark(); + benchmark(); +} + + +int main() +{ + benchmarks(); + benchmarks(); + + return 0; +} + diff --git a/make-c-api/smoke.sln b/make-c-api/smoke.sln new file mode 100644 index 0000000..37367f9 --- /dev/null +++ b/make-c-api/smoke.sln @@ -0,0 +1,31 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.14.36301.6 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "smoke", "smoke.vcxproj", "{7F3E858B-7848-4F2A-AFC8-B382BD6B087E}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Debug|x86 = Debug|x86 + Release|x64 = Release|x64 + Release|x86 = Release|x86 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {7F3E858B-7848-4F2A-AFC8-B382BD6B087E}.Debug|x64.ActiveCfg = Debug|x64 + {7F3E858B-7848-4F2A-AFC8-B382BD6B087E}.Debug|x64.Build.0 = Debug|x64 + {7F3E858B-7848-4F2A-AFC8-B382BD6B087E}.Debug|x86.ActiveCfg = Debug|Win32 + {7F3E858B-7848-4F2A-AFC8-B382BD6B087E}.Debug|x86.Build.0 = Debug|Win32 + {7F3E858B-7848-4F2A-AFC8-B382BD6B087E}.Release|x64.ActiveCfg = Release|x64 + {7F3E858B-7848-4F2A-AFC8-B382BD6B087E}.Release|x64.Build.0 = Release|x64 + {7F3E858B-7848-4F2A-AFC8-B382BD6B087E}.Release|x86.ActiveCfg = Release|Win32 + {7F3E858B-7848-4F2A-AFC8-B382BD6B087E}.Release|x86.Build.0 = Release|Win32 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {4A00F1C2-ABF8-4BB9-9509-9D3BAB64EFB0} + EndGlobalSection +EndGlobal diff --git a/make-c-api/smoke.vcxproj b/make-c-api/smoke.vcxproj new file mode 100644 index 0000000..d956d71 --- /dev/null +++ b/make-c-api/smoke.vcxproj @@ -0,0 +1,139 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + 17.0 + Win32Proj + {7f3e858b-7848-4f2a-afc8-b382bd6b087e} + smoke + 10.0 + + + + Application + true + v143 + Unicode + + + Application + false + v143 + true + Unicode + + + Application + true + v143 + Unicode + + + Application + false + v143 + true + Unicode + + + + + + + + + + + + + + + + + + + + + + Level3 + true + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + + + + + Level3 + true + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + + + + + Level3 + true + _DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + $(ProjectDir);c:/workspace/sdk/boost_1_87_0;%(AdditionalIncludeDirectories) + stdcpp20 + + + Console + true + $(ProjectDir) + x86simdsort-c-api.dll.lib;%(AdditionalDependencies) + + + + + Level3 + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + $(ProjectDir);c:/workspace/sdk/boost_1_87_0;%(AdditionalIncludeDirectories) + stdcpp20 + + + Console + true + $(ProjectDir) + x86simdsort-c-api.dll.lib;%(AdditionalDependencies) + + + + + + + + + \ No 
newline at end of file diff --git a/make-c-api/smoke.vcxproj.user b/make-c-api/smoke.vcxproj.user new file mode 100644 index 0000000..0f14913 --- /dev/null +++ b/make-c-api/smoke.vcxproj.user @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/meson.build b/meson.build index 0b826f0..4373f28 100644 --- a/meson.build +++ b/meson.build @@ -1,7 +1,7 @@ project('x86-simd-sort', 'cpp', version : '7.0.x', license : 'BSD 3-clause', - default_options : ['cpp_std=c++17']) + default_options : ['cpp_std=c++20']) fs = import('fs') cpp = meson.get_compiler('cpp') src = include_directories('src') diff --git a/src/avx2-32bit-half.hpp b/src/avx2-32bit-half.hpp index 3646979..eaee8fd 100644 --- a/src/avx2-32bit-half.hpp +++ b/src/avx2-32bit-half.hpp @@ -30,146 +30,146 @@ struct avx2_half_vector { { return X86_SIMD_SORT_MIN_INT32; } - static reg_t zmm_max() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_max() { return _mm_set1_epi32(type_max()); } // TODO: this should broadcast bits as is? - static opmask_t knot_opmask(opmask_t x) + static X86_SIMD_SORT_FORCE_INLINE opmask_t knot_opmask(opmask_t x) { auto allOnes = seti(-1, -1, -1, -1); return _mm_xor_si128(x, allOnes); } - static opmask_t get_partial_loadmask(uint64_t num_to_read) + static X86_SIMD_SORT_FORCE_INLINE opmask_t get_partial_loadmask(uint64_t num_to_read) { auto mask = ((0x1ull << num_to_read) - 0x1ull); return convert_int_to_avx2_mask_half(mask); } - static opmask_t convert_int_to_mask(uint64_t intMask) + static X86_SIMD_SORT_FORCE_INLINE opmask_t convert_int_to_mask(uint64_t intMask) { return convert_int_to_avx2_mask_half(intMask); } - static regi_t seti(int v1, int v2, int v3, int v4) + static X86_SIMD_SORT_FORCE_INLINE regi_t seti(int v1, int v2, int v3, int v4) { return _mm_set_epi32(v1, v2, v3, v4); } - static reg_t set(int v1, int v2, int v3, int v4) + static X86_SIMD_SORT_FORCE_INLINE reg_t set(int v1, int v2, int v3, int v4) { return _mm_set_epi32(v1, v2, v3, v4); } - static opmask_t kxor_opmask(opmask_t x, opmask_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t kxor_opmask(opmask_t x, opmask_t y) { return _mm_xor_si128(x, y); } - static opmask_t ge(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t ge(reg_t x, reg_t y) { opmask_t equal = eq(x, y); opmask_t greater = _mm_cmpgt_epi32(x, y); return _mm_or_si128(equal, greater); } - static opmask_t eq(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t eq(reg_t x, reg_t y) { return _mm_cmpeq_epi32(x, y); } template - static reg_t + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_i64gather(reg_t src, opmask_t mask, __m256i index, void const *base) { return _mm256_mask_i64gather_epi32( src, (const int *)base, index, mask, scale); } template - static reg_t + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_i64gather(reg_t src, opmask_t mask, __m128i index, void const *base) { return _mm_mask_i32gather_epi32( src, (const int *)base, index, mask, scale); } - static reg_t i64gather(type_t *arr, arrsize_t *ind) + static X86_SIMD_SORT_FORCE_INLINE reg_t i64gather(type_t *arr, arrsize_t *ind) { return set(arr[ind[3]], arr[ind[2]], arr[ind[1]], arr[ind[0]]); } - static reg_t loadu(void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t loadu(void const *mem) { return _mm_loadu_si128((reg_t const *)mem); } - static reg_t max(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t max(reg_t x, reg_t y) { return _mm_max_epi32(x, y); } - static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void 
mask_compressstoreu(void *mem, opmask_t mask, reg_t x) { return avx2_emu_mask_compressstoreu32_half(mem, mask, x); } - static reg_t maskz_loadu(opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t maskz_loadu(opmask_t mask, void const *mem) { return _mm_maskload_epi32((const int *)mem, mask); } - static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) { reg_t dst = _mm_maskload_epi32((type_t *)mem, mask); return mask_mov(x, mask, dst); } - static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) { return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(x), _mm_castsi128_ps(y), _mm_castsi128_ps(mask))); } - static void mask_storeu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_storeu(void *mem, opmask_t mask, reg_t x) { return _mm_maskstore_epi32((type_t *)mem, mask, x); } - static reg_t min(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t min(reg_t x, reg_t y) { return _mm_min_epi32(x, y); } - static reg_t permutexvar(__m128i idx, reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t permutexvar(__m128i idx, reg_t ymm) { return _mm_castps_si128(_mm_permutevar_ps(_mm_castsi128_ps(ymm), idx)); } - static reg_t reverse(reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t reverse(reg_t ymm) { const __m128i rev_index = _mm_set_epi32(NETWORK_REVERSE_4LANES); return permutexvar(rev_index, ymm); } - static type_t reducemax(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemax(reg_t v) { return avx2_emu_reduce_max32_half(v); } - static type_t reducemin(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemin(reg_t v) { return avx2_emu_reduce_min32_half(v); } - static reg_t set1(type_t v) + static X86_SIMD_SORT_FORCE_INLINE reg_t set1(type_t v) { return _mm_set1_epi32(v); } template - static reg_t shuffle(reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t shuffle(reg_t ymm) { return _mm_shuffle_epi32(ymm, mask); } - static void storeu(void *mem, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void storeu(void *mem, reg_t x) { _mm_storeu_si128((__m128i *)mem, x); } - static reg_t sort_vec(reg_t x) + static X86_SIMD_SORT_FORCE_INLINE reg_t sort_vec(reg_t x) { return sort_reg_4lanes>(x); } - static reg_t cast_from(__m128i v) + static X86_SIMD_SORT_FORCE_INLINE reg_t cast_from(__m128i v) { return v; } - static __m128i cast_to(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE __m128i cast_to(reg_t v) { return v; } - static bool all_false(opmask_t k) + static X86_SIMD_SORT_FORCE_INLINE bool all_false(opmask_t k) { return _mm_movemask_ps(_mm_castsi128_ps(k)) == 0; } - static int double_compressstore(type_t *left_addr, + static X86_SIMD_SORT_FORCE_INLINE int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, reg_t reg) @@ -197,137 +197,137 @@ struct avx2_half_vector { { return 0; } - static reg_t zmm_max() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_max() { return _mm_set1_epi32(type_max()); } - static opmask_t knot_opmask(opmask_t x) + static X86_SIMD_SORT_FORCE_INLINE opmask_t knot_opmask(opmask_t x) { auto allOnes = seti(-1, -1, -1, -1); return _mm_xor_si128(x, allOnes); } - static opmask_t get_partial_loadmask(uint64_t num_to_read) + static X86_SIMD_SORT_FORCE_INLINE opmask_t get_partial_loadmask(uint64_t num_to_read) { auto mask = ((0x1ull << num_to_read) - 0x1ull); return convert_int_to_avx2_mask_half(mask); } - static opmask_t 
convert_int_to_mask(uint64_t intMask) + static X86_SIMD_SORT_FORCE_INLINE opmask_t convert_int_to_mask(uint64_t intMask) { return convert_int_to_avx2_mask_half(intMask); } - static regi_t seti(int v1, int v2, int v3, int v4) + static X86_SIMD_SORT_FORCE_INLINE regi_t seti(int v1, int v2, int v3, int v4) { return _mm_set_epi32(v1, v2, v3, v4); } - static reg_t set(int v1, int v2, int v3, int v4) + static X86_SIMD_SORT_FORCE_INLINE reg_t set(int v1, int v2, int v3, int v4) { return _mm_set_epi32(v1, v2, v3, v4); } template - static reg_t + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_i64gather(reg_t src, opmask_t mask, __m256i index, void const *base) { return _mm256_mask_i64gather_epi32( src, (const int *)base, index, mask, scale); } template - static reg_t + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_i64gather(reg_t src, opmask_t mask, __m128i index, void const *base) { return _mm_mask_i32gather_epi32( src, (const int *)base, index, mask, scale); } - static reg_t i64gather(type_t *arr, arrsize_t *ind) + static X86_SIMD_SORT_FORCE_INLINE reg_t i64gather(type_t *arr, arrsize_t *ind) { return set(arr[ind[3]], arr[ind[2]], arr[ind[1]], arr[ind[0]]); } - static opmask_t ge(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t ge(reg_t x, reg_t y) { reg_t maxi = max(x, y); return eq(maxi, x); } - static opmask_t eq(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t eq(reg_t x, reg_t y) { return _mm_cmpeq_epi32(x, y); } - static reg_t loadu(void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t loadu(void const *mem) { return _mm_loadu_si128((reg_t const *)mem); } - static reg_t max(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t max(reg_t x, reg_t y) { return _mm_max_epu32(x, y); } - static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) { return avx2_emu_mask_compressstoreu32_half(mem, mask, x); } - static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) { reg_t dst = _mm_maskload_epi32((const int *)mem, mask); return mask_mov(x, mask, dst); } - static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) { return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(x), _mm_castsi128_ps(y), _mm_castsi128_ps(mask))); } - static void mask_storeu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_storeu(void *mem, opmask_t mask, reg_t x) { return _mm_maskstore_epi32((int *)mem, mask, x); } - static reg_t min(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t min(reg_t x, reg_t y) { return _mm_min_epu32(x, y); } - static reg_t permutexvar(__m128i idx, reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t permutexvar(__m128i idx, reg_t ymm) { return _mm_castps_si128(_mm_permutevar_ps(_mm_castsi128_ps(ymm), idx)); } - static reg_t reverse(reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t reverse(reg_t ymm) { const __m128i rev_index = _mm_set_epi32(NETWORK_REVERSE_4LANES); return permutexvar(rev_index, ymm); } - static type_t reducemax(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemax(reg_t v) { return avx2_emu_reduce_max32_half(v); } - static type_t reducemin(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemin(reg_t v) { return avx2_emu_reduce_min32_half(v); } - static reg_t set1(type_t v) + static 
X86_SIMD_SORT_FORCE_INLINE reg_t set1(type_t v) { return _mm_set1_epi32(v); } template - static reg_t shuffle(reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t shuffle(reg_t ymm) { return _mm_shuffle_epi32(ymm, mask); } - static void storeu(void *mem, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void storeu(void *mem, reg_t x) { _mm_storeu_si128((__m128i *)mem, x); } - static reg_t sort_vec(reg_t x) + static X86_SIMD_SORT_FORCE_INLINE reg_t sort_vec(reg_t x) { return sort_reg_4lanes>(x); } - static reg_t cast_from(__m128i v) + static X86_SIMD_SORT_FORCE_INLINE reg_t cast_from(__m128i v) { return v; } - static __m128i cast_to(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE __m128i cast_to(reg_t v) { return v; } - static bool all_false(opmask_t k) + static X86_SIMD_SORT_FORCE_INLINE bool all_false(opmask_t k) { return _mm_movemask_ps(_mm_castsi128_ps(k)) == 0; } - static int double_compressstore(type_t *left_addr, + static X86_SIMD_SORT_FORCE_INLINE int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, reg_t reg) @@ -355,50 +355,50 @@ struct avx2_half_vector { { return -X86_SIMD_SORT_INFINITYF; } - static reg_t zmm_max() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_max() { return _mm_set1_ps(type_max()); } - static opmask_t knot_opmask(opmask_t x) + static X86_SIMD_SORT_FORCE_INLINE opmask_t knot_opmask(opmask_t x) { auto allOnes = seti(-1, -1, -1, -1); return _mm_xor_si128(x, allOnes); } - static regi_t seti(int v1, int v2, int v3, int v4) + static X86_SIMD_SORT_FORCE_INLINE regi_t seti(int v1, int v2, int v3, int v4) { return _mm_set_epi32(v1, v2, v3, v4); } - static reg_t set(float v1, float v2, float v3, float v4) + static X86_SIMD_SORT_FORCE_INLINE reg_t set(float v1, float v2, float v3, float v4) { return _mm_set_ps(v1, v2, v3, v4); } - static reg_t maskz_loadu(opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t maskz_loadu(opmask_t mask, void const *mem) { return _mm_maskload_ps((const float *)mem, mask); } - static opmask_t ge(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t ge(reg_t x, reg_t y) { return _mm_castps_si128(_mm_cmp_ps(x, y, _CMP_GE_OQ)); } - static opmask_t eq(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t eq(reg_t x, reg_t y) { return _mm_castps_si128(_mm_cmp_ps(x, y, _CMP_EQ_OQ)); } - static opmask_t get_partial_loadmask(uint64_t num_to_read) + static X86_SIMD_SORT_FORCE_INLINE opmask_t get_partial_loadmask(uint64_t num_to_read) { auto mask = ((0x1ull << num_to_read) - 0x1ull); return convert_int_to_avx2_mask_half(mask); } - static opmask_t convert_int_to_mask(uint64_t intMask) + static X86_SIMD_SORT_FORCE_INLINE opmask_t convert_int_to_mask(uint64_t intMask) { return convert_int_to_avx2_mask_half(intMask); } - static int32_t convert_mask_to_int(opmask_t mask) + static X86_SIMD_SORT_FORCE_INLINE int32_t convert_mask_to_int(opmask_t mask) { return convert_avx2_mask_to_int_half(mask); } template - static opmask_t fpclass(reg_t x) + static X86_SIMD_SORT_FORCE_INLINE opmask_t fpclass(reg_t x) { if constexpr (type == (0x01 | 0x80)) { return _mm_castps_si128(_mm_cmp_ps(x, x, _CMP_UNORD_Q)); @@ -408,99 +408,99 @@ struct avx2_half_vector { } } template - static reg_t + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_i64gather(reg_t src, opmask_t mask, __m256i index, void const *base) { return _mm256_mask_i64gather_ps( src, (const float *)base, index, _mm_castsi128_ps(mask), scale); } template - static reg_t + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_i64gather(reg_t src, opmask_t mask, __m128i index, 
void const *base) { return _mm_mask_i32gather_ps( src, (const float *)base, index, _mm_castsi128_ps(mask), scale); } - static reg_t i64gather(type_t *arr, arrsize_t *ind) + static X86_SIMD_SORT_FORCE_INLINE reg_t i64gather(type_t *arr, arrsize_t *ind) { return set(arr[ind[3]], arr[ind[2]], arr[ind[1]], arr[ind[0]]); } - static reg_t loadu(void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t loadu(void const *mem) { return _mm_loadu_ps((float const *)mem); } - static reg_t max(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t max(reg_t x, reg_t y) { return _mm_max_ps(x, y); } - static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) { return avx2_emu_mask_compressstoreu32_half(mem, mask, x); } - static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) { reg_t dst = _mm_maskload_ps((type_t *)mem, mask); return mask_mov(x, mask, dst); } - static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) { return _mm_blendv_ps(x, y, _mm_castsi128_ps(mask)); } - static void mask_storeu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_storeu(void *mem, opmask_t mask, reg_t x) { return _mm_maskstore_ps((type_t *)mem, mask, x); } - static reg_t min(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t min(reg_t x, reg_t y) { return _mm_min_ps(x, y); } - static reg_t permutexvar(__m128i idx, reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t permutexvar(__m128i idx, reg_t ymm) { return _mm_permutevar_ps(ymm, idx); } - static reg_t reverse(reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t reverse(reg_t ymm) { const __m128i rev_index = _mm_set_epi32(NETWORK_REVERSE_4LANES); return permutexvar(rev_index, ymm); } - static type_t reducemax(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemax(reg_t v) { return avx2_emu_reduce_max32_half(v); } - static type_t reducemin(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemin(reg_t v) { return avx2_emu_reduce_min32_half(v); } - static reg_t set1(type_t v) + static X86_SIMD_SORT_FORCE_INLINE reg_t set1(type_t v) { return _mm_set1_ps(v); } template - static reg_t shuffle(reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t shuffle(reg_t ymm) { return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(ymm), mask)); } - static void storeu(void *mem, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void storeu(void *mem, reg_t x) { _mm_storeu_ps((float *)mem, x); } - static reg_t sort_vec(reg_t x) + static X86_SIMD_SORT_FORCE_INLINE reg_t sort_vec(reg_t x) { return sort_reg_4lanes>(x); } - static reg_t cast_from(__m128i v) + static X86_SIMD_SORT_FORCE_INLINE reg_t cast_from(__m128i v) { return _mm_castsi128_ps(v); } - static __m128i cast_to(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE __m128i cast_to(reg_t v) { return _mm_castps_si128(v); } - static bool all_false(opmask_t k) + static X86_SIMD_SORT_FORCE_INLINE bool all_false(opmask_t k) { return _mm_movemask_ps(_mm_castsi128_ps(k)) == 0; } - static int double_compressstore(type_t *left_addr, + static X86_SIMD_SORT_FORCE_INLINE int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, reg_t reg) @@ -512,7 +512,7 @@ struct avx2_half_vector { struct avx2_32bit_half_swizzle_ops { template - X86_SIMD_SORT_INLINE typename vtype::reg_t swap_n(typename 
vtype::reg_t reg) + static X86_SIMD_SORT_FORCE_INLINE typename vtype::reg_t swap_n(typename vtype::reg_t reg) { if constexpr (scale == 2) { return vtype::template shuffle<0b10110001>(reg); @@ -526,7 +526,7 @@ struct avx2_32bit_half_swizzle_ops { } template - X86_SIMD_SORT_INLINE typename vtype::reg_t + static X86_SIMD_SORT_FORCE_INLINE typename vtype::reg_t reverse_n(typename vtype::reg_t reg) { __m128i v = vtype::cast_to(reg); @@ -543,7 +543,7 @@ struct avx2_32bit_half_swizzle_ops { } template - X86_SIMD_SORT_INLINE typename vtype::reg_t + static X86_SIMD_SORT_FORCE_INLINE typename vtype::reg_t merge_n(typename vtype::reg_t reg, typename vtype::reg_t other) { __m128i v1 = vtype::cast_to(reg); diff --git a/src/avx2-32bit-qsort.hpp b/src/avx2-32bit-qsort.hpp index 7c7218e..09da5a3 100644 --- a/src/avx2-32bit-qsort.hpp +++ b/src/avx2-32bit-qsort.hpp @@ -36,146 +36,146 @@ struct avx2_vector { { return X86_SIMD_SORT_MIN_INT32; } - static reg_t zmm_max() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_max() { return _mm256_set1_epi32(type_max()); } - static reg_t zmm_min() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_min() { return _mm256_set1_epi32(type_min()); } - static opmask_t knot_opmask(opmask_t x) + static X86_SIMD_SORT_FORCE_INLINE opmask_t knot_opmask(opmask_t x) { auto allOnes = seti(-1, -1, -1, -1, -1, -1, -1, -1); return _mm256_xor_si256(x, allOnes); } - static opmask_t get_partial_loadmask(uint64_t num_to_read) + static X86_SIMD_SORT_FORCE_INLINE opmask_t get_partial_loadmask(uint64_t num_to_read) { auto mask = ((0x1ull << num_to_read) - 0x1ull); return convert_int_to_avx2_mask(mask); } - static opmask_t convert_int_to_mask(uint64_t intMask) + static X86_SIMD_SORT_FORCE_INLINE opmask_t convert_int_to_mask(uint64_t intMask) { return convert_int_to_avx2_mask(intMask); } - static ymmi_t + static X86_SIMD_SORT_FORCE_INLINE ymmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7, int v8) { return _mm256_set_epi32(v1, v2, v3, v4, v5, v6, v7, v8); } - static opmask_t kxor_opmask(opmask_t x, opmask_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t kxor_opmask(opmask_t x, opmask_t y) { return _mm256_xor_si256(x, y); } - static opmask_t ge(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t ge(reg_t x, reg_t y) { opmask_t equal = eq(x, y); opmask_t greater = _mm256_cmpgt_epi32(x, y); return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(equal), _mm256_castsi256_ps(greater))); } - static opmask_t eq(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t eq(reg_t x, reg_t y) { return _mm256_cmpeq_epi32(x, y); } template - static reg_t + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_i64gather(reg_t src, opmask_t mask, __m256i index, void const *base) { return _mm256_mask_i32gather_epi32(src, base, index, mask, scale); } template - static reg_t i64gather(__m256i index, void const *base) + static X86_SIMD_SORT_FORCE_INLINE reg_t i64gather(__m256i index, void const *base) { return _mm256_i32gather_epi32((int const *)base, index, scale); } - static reg_t loadu(void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t loadu(void const *mem) { return _mm256_loadu_si256((reg_t const *)mem); } - static reg_t max(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t max(reg_t x, reg_t y) { return _mm256_max_epi32(x, y); } - static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) { return avx2_emu_mask_compressstoreu32(mem, mask, x); } - static reg_t 
maskz_loadu(opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t maskz_loadu(opmask_t mask, void const *mem) { return _mm256_maskload_epi32((const int *)mem, mask); } - static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) { reg_t dst = _mm256_maskload_epi32((type_t *)mem, mask); return mask_mov(x, mask, dst); } - static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) { return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y), _mm256_castsi256_ps(mask))); } - static void mask_storeu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_storeu(void *mem, opmask_t mask, reg_t x) { return _mm256_maskstore_epi32((type_t *)mem, mask, x); } - static reg_t min(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t min(reg_t x, reg_t y) { return _mm256_min_epi32(x, y); } - static reg_t permutexvar(__m256i idx, reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t permutexvar(__m256i idx, reg_t ymm) { return _mm256_permutevar8x32_epi32(ymm, idx); //return avx2_emu_permutexvar_epi32(idx, ymm); } - static reg_t permutevar(reg_t ymm, __m256i idx) + static X86_SIMD_SORT_FORCE_INLINE reg_t permutevar(reg_t ymm, __m256i idx) { return _mm256_permutevar8x32_epi32(ymm, idx); } - static reg_t reverse(reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t reverse(reg_t ymm) { const __m256i rev_index = _mm256_set_epi32(NETWORK_REVERSE_8LANES); return permutexvar(rev_index, ymm); } - static type_t reducemax(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemax(reg_t v) { return avx2_emu_reduce_max32(v); } - static type_t reducemin(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemin(reg_t v) { return avx2_emu_reduce_min32(v); } - static reg_t set1(type_t v) + static X86_SIMD_SORT_FORCE_INLINE reg_t set1(type_t v) { return _mm256_set1_epi32(v); } template - static reg_t shuffle(reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t shuffle(reg_t ymm) { return _mm256_shuffle_epi32(ymm, mask); } - static void storeu(void *mem, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void storeu(void *mem, reg_t x) { _mm256_storeu_si256((__m256i *)mem, x); } - static reg_t sort_vec(reg_t x) + static X86_SIMD_SORT_FORCE_INLINE reg_t sort_vec(reg_t x) { return sort_reg_8lanes>(x); } - static reg_t cast_from(__m256i v) + static X86_SIMD_SORT_FORCE_INLINE reg_t cast_from(__m256i v) { return v; } - static __m256i cast_to(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE __m256i cast_to(reg_t v) { return v; } - static bool all_false(opmask_t k) + static X86_SIMD_SORT_FORCE_INLINE bool all_false(opmask_t k) { return _mm256_movemask_ps(_mm256_castsi256_ps(k)) == 0; } - static int double_compressstore(type_t *left_addr, + static X86_SIMD_SORT_FORCE_INLINE int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, reg_t reg) @@ -209,135 +209,135 @@ struct avx2_vector { { return 0; } - static reg_t zmm_max() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_max() { return _mm256_set1_epi32(type_max()); } - static reg_t zmm_min() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_min() { return _mm256_set1_epi32(type_min()); } - static opmask_t knot_opmask(opmask_t x) + static X86_SIMD_SORT_FORCE_INLINE opmask_t knot_opmask(opmask_t x) { auto allOnes = seti(-1, -1, -1, -1, -1, -1, -1, -1); return _mm256_xor_si256(x, allOnes); } - static opmask_t 
get_partial_loadmask(uint64_t num_to_read) + static X86_SIMD_SORT_FORCE_INLINE opmask_t get_partial_loadmask(uint64_t num_to_read) { auto mask = ((0x1ull << num_to_read) - 0x1ull); return convert_int_to_avx2_mask(mask); } - static opmask_t convert_int_to_mask(uint64_t intMask) + static X86_SIMD_SORT_FORCE_INLINE opmask_t convert_int_to_mask(uint64_t intMask) { return convert_int_to_avx2_mask(intMask); } - static ymmi_t + static X86_SIMD_SORT_FORCE_INLINE ymmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7, int v8) { return _mm256_set_epi32(v1, v2, v3, v4, v5, v6, v7, v8); } template - static reg_t + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_i64gather(reg_t src, opmask_t mask, __m256i index, void const *base) { return _mm256_mask_i32gather_epi32(src, base, index, mask, scale); } template - static reg_t i64gather(__m256i index, void const *base) + static X86_SIMD_SORT_FORCE_INLINE reg_t i64gather(__m256i index, void const *base) { return _mm256_i32gather_epi32((int const *)base, index, scale); } - static opmask_t ge(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t ge(reg_t x, reg_t y) { reg_t maxi = max(x, y); return eq(maxi, x); } - static opmask_t eq(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t eq(reg_t x, reg_t y) { return _mm256_cmpeq_epi32(x, y); } - static reg_t loadu(void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t loadu(void const *mem) { return _mm256_loadu_si256((reg_t const *)mem); } - static reg_t max(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t max(reg_t x, reg_t y) { return _mm256_max_epu32(x, y); } - static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) { return avx2_emu_mask_compressstoreu32(mem, mask, x); } - static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) { reg_t dst = _mm256_maskload_epi32((const int *)mem, mask); return mask_mov(x, mask, dst); } - static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) { return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y), _mm256_castsi256_ps(mask))); } - static void mask_storeu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_storeu(void *mem, opmask_t mask, reg_t x) { return _mm256_maskstore_epi32((int *)mem, mask, x); } - static reg_t min(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t min(reg_t x, reg_t y) { return _mm256_min_epu32(x, y); } - static reg_t permutexvar(__m256i idx, reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t permutexvar(__m256i idx, reg_t ymm) { return _mm256_permutevar8x32_epi32(ymm, idx); } - static reg_t permutevar(reg_t ymm, __m256i idx) + static X86_SIMD_SORT_FORCE_INLINE reg_t permutevar(reg_t ymm, __m256i idx) { return _mm256_permutevar8x32_epi32(ymm, idx); } - static reg_t reverse(reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t reverse(reg_t ymm) { const __m256i rev_index = _mm256_set_epi32(NETWORK_REVERSE_8LANES); return permutexvar(rev_index, ymm); } - static type_t reducemax(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemax(reg_t v) { return avx2_emu_reduce_max32(v); } - static type_t reducemin(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemin(reg_t v) { return avx2_emu_reduce_min32(v); } - static reg_t set1(type_t v) + 
static X86_SIMD_SORT_FORCE_INLINE reg_t set1(type_t v) { return _mm256_set1_epi32(v); } template - static reg_t shuffle(reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t shuffle(reg_t ymm) { return _mm256_shuffle_epi32(ymm, mask); } - static void storeu(void *mem, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void storeu(void *mem, reg_t x) { _mm256_storeu_si256((__m256i *)mem, x); } - static reg_t sort_vec(reg_t x) + static X86_SIMD_SORT_FORCE_INLINE reg_t sort_vec(reg_t x) { return sort_reg_8lanes>(x); } - static reg_t cast_from(__m256i v) + static X86_SIMD_SORT_FORCE_INLINE reg_t cast_from(__m256i v) { return v; } - static __m256i cast_to(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE __m256i cast_to(reg_t v) { return v; } - static bool all_false(opmask_t k) + static X86_SIMD_SORT_FORCE_INLINE bool all_false(opmask_t k) { return _mm256_movemask_ps(_mm256_castsi256_ps(k)) == 0; } - static int double_compressstore(type_t *left_addr, + static X86_SIMD_SORT_FORCE_INLINE int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, reg_t reg) @@ -371,52 +371,52 @@ struct avx2_vector { { return -X86_SIMD_SORT_INFINITYF; } - static reg_t zmm_max() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_max() { return _mm256_set1_ps(type_max()); } - static reg_t zmm_min() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_min() { return _mm256_set1_ps(type_min()); } - static opmask_t knot_opmask(opmask_t x) + static X86_SIMD_SORT_FORCE_INLINE opmask_t knot_opmask(opmask_t x) { auto allOnes = seti(-1, -1, -1, -1, -1, -1, -1, -1); return _mm256_xor_si256(x, allOnes); } - static ymmi_t + static X86_SIMD_SORT_FORCE_INLINE ymmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7, int v8) { return _mm256_set_epi32(v1, v2, v3, v4, v5, v6, v7, v8); } - static reg_t maskz_loadu(opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t maskz_loadu(opmask_t mask, void const *mem) { return _mm256_maskload_ps((const float *)mem, mask); } - static opmask_t ge(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t ge(reg_t x, reg_t y) { return _mm256_castps_si256(_mm256_cmp_ps(x, y, _CMP_GE_OQ)); } - static opmask_t eq(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t eq(reg_t x, reg_t y) { return _mm256_castps_si256(_mm256_cmp_ps(x, y, _CMP_EQ_OQ)); } - static opmask_t get_partial_loadmask(uint64_t num_to_read) + static X86_SIMD_SORT_FORCE_INLINE opmask_t get_partial_loadmask(uint64_t num_to_read) { auto mask = ((0x1ull << num_to_read) - 0x1ull); return convert_int_to_avx2_mask(mask); } - static opmask_t convert_int_to_mask(uint64_t intMask) + static X86_SIMD_SORT_FORCE_INLINE opmask_t convert_int_to_mask(uint64_t intMask) { return convert_int_to_avx2_mask(intMask); } - static int32_t convert_mask_to_int(opmask_t mask) + static X86_SIMD_SORT_FORCE_INLINE int32_t convert_mask_to_int(opmask_t mask) { return convert_avx2_mask_to_int(mask); } template - static opmask_t fpclass(reg_t x) + static X86_SIMD_SORT_FORCE_INLINE opmask_t fpclass(reg_t x) { if constexpr (type != (0x01 | 0x80)) { static_assert(type == (0x01 | 0x80), "should not reach here"); @@ -424,7 +424,7 @@ struct avx2_vector { return _mm256_castps_si256(_mm256_cmp_ps(x, x, _CMP_UNORD_Q)); } template - static reg_t + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_i64gather(reg_t src, opmask_t mask, __m256i index, void const *base) { return _mm256_mask_i32gather_ps( @@ -432,91 +432,91 @@ struct avx2_vector { ; } template - static reg_t i64gather(__m256i index, void const *base) + static 
X86_SIMD_SORT_FORCE_INLINE reg_t i64gather(__m256i index, void const *base) { return _mm256_i32gather_ps((float *)base, index, scale); } - static reg_t loadu(void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t loadu(void const *mem) { return _mm256_loadu_ps((float const *)mem); } - static reg_t max(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t max(reg_t x, reg_t y) { return _mm256_max_ps(x, y); } - static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) { return avx2_emu_mask_compressstoreu32(mem, mask, x); } - static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) { reg_t dst = _mm256_maskload_ps((type_t *)mem, mask); return mask_mov(x, mask, dst); } - static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) { return _mm256_blendv_ps(x, y, _mm256_castsi256_ps(mask)); } - static void mask_storeu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_storeu(void *mem, opmask_t mask, reg_t x) { return _mm256_maskstore_ps((type_t *)mem, mask, x); } - static reg_t min(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t min(reg_t x, reg_t y) { return _mm256_min_ps(x, y); } - static reg_t permutexvar(__m256i idx, reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t permutexvar(__m256i idx, reg_t ymm) { return _mm256_permutevar8x32_ps(ymm, idx); } - static reg_t permutevar(reg_t ymm, __m256i idx) + static X86_SIMD_SORT_FORCE_INLINE reg_t permutevar(reg_t ymm, __m256i idx) { return _mm256_permutevar8x32_ps(ymm, idx); } - static reg_t reverse(reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t reverse(reg_t ymm) { const __m256i rev_index = _mm256_set_epi32(NETWORK_REVERSE_8LANES); return permutexvar(rev_index, ymm); } - static type_t reducemax(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemax(reg_t v) { return avx2_emu_reduce_max32(v); } - static type_t reducemin(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemin(reg_t v) { return avx2_emu_reduce_min32(v); } - static reg_t set1(type_t v) + static X86_SIMD_SORT_FORCE_INLINE reg_t set1(type_t v) { return _mm256_set1_ps(v); } template - static reg_t shuffle(reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t shuffle(reg_t ymm) { return _mm256_castsi256_ps( _mm256_shuffle_epi32(_mm256_castps_si256(ymm), mask)); } - static void storeu(void *mem, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void storeu(void *mem, reg_t x) { _mm256_storeu_ps((float *)mem, x); } - static reg_t sort_vec(reg_t x) + static X86_SIMD_SORT_FORCE_INLINE reg_t sort_vec(reg_t x) { return sort_reg_8lanes>(x); } - static reg_t cast_from(__m256i v) + static X86_SIMD_SORT_FORCE_INLINE reg_t cast_from(__m256i v) { return _mm256_castsi256_ps(v); } - static __m256i cast_to(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE __m256i cast_to(reg_t v) { return _mm256_castps_si256(v); } - static bool all_false(opmask_t k) + static X86_SIMD_SORT_FORCE_INLINE bool all_false(opmask_t k) { return _mm256_movemask_ps(_mm256_castsi256_ps(k)) == 0; } - static int double_compressstore(type_t *left_addr, + static X86_SIMD_SORT_FORCE_INLINE int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, reg_t reg) @@ -528,7 +528,7 @@ struct avx2_vector { struct avx2_32bit_swizzle_ops { template - X86_SIMD_SORT_INLINE typename 
vtype::reg_t swap_n(typename vtype::reg_t reg) + static X86_SIMD_SORT_FORCE_INLINE typename vtype::reg_t swap_n(typename vtype::reg_t reg) { __m256i v = vtype::cast_to(reg); @@ -553,7 +553,7 @@ struct avx2_32bit_swizzle_ops { } template - X86_SIMD_SORT_INLINE typename vtype::reg_t + static X86_SIMD_SORT_FORCE_INLINE typename vtype::reg_t reverse_n(typename vtype::reg_t reg) { __m256i v = vtype::cast_to(reg); @@ -576,7 +576,7 @@ struct avx2_32bit_swizzle_ops { } template - X86_SIMD_SORT_INLINE typename vtype::reg_t + static X86_SIMD_SORT_FORCE_INLINE typename vtype::reg_t merge_n(typename vtype::reg_t reg, typename vtype::reg_t other) { __m256i v1 = vtype::cast_to(reg); diff --git a/src/avx2-64bit-qsort.hpp b/src/avx2-64bit-qsort.hpp index 2af97d6..4d66b9c 100644 --- a/src/avx2-64bit-qsort.hpp +++ b/src/avx2-64bit-qsort.hpp @@ -37,85 +37,85 @@ struct avx2_vector { { return X86_SIMD_SORT_MIN_INT64; } - static reg_t zmm_max() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_max() { return _mm256_set1_epi64x(type_max()); } - static reg_t zmm_min() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_min() { return _mm256_set1_epi64x(type_min()); } - static opmask_t knot_opmask(opmask_t x) + static X86_SIMD_SORT_FORCE_INLINE opmask_t knot_opmask(opmask_t x) { auto allTrue = _mm256_set1_epi64x(0xFFFF'FFFF'FFFF'FFFF); return _mm256_xor_si256(x, allTrue); } - static opmask_t get_partial_loadmask(uint64_t num_to_read) + static X86_SIMD_SORT_FORCE_INLINE opmask_t get_partial_loadmask(uint64_t num_to_read) { auto mask = ((0x1ull << num_to_read) - 0x1ull); return convert_int_to_avx2_mask_64bit(mask); } - static opmask_t convert_int_to_mask(uint64_t intMask) + static X86_SIMD_SORT_FORCE_INLINE opmask_t convert_int_to_mask(uint64_t intMask) { return convert_int_to_avx2_mask_64bit(intMask); } - static ymmi_t seti(int64_t v1, int64_t v2, int64_t v3, int64_t v4) + static X86_SIMD_SORT_FORCE_INLINE ymmi_t seti(int64_t v1, int64_t v2, int64_t v3, int64_t v4) { return _mm256_set_epi64x(v1, v2, v3, v4); } - static reg_t set(type_t v1, type_t v2, type_t v3, type_t v4) + static X86_SIMD_SORT_FORCE_INLINE reg_t set(type_t v1, type_t v2, type_t v3, type_t v4) { return _mm256_set_epi64x(v1, v2, v3, v4); } - static opmask_t kxor_opmask(opmask_t x, opmask_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t kxor_opmask(opmask_t x, opmask_t y) { return _mm256_xor_si256(x, y); } - static opmask_t gt(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t gt(reg_t x, reg_t y) { return _mm256_cmpgt_epi64(x, y); } - static opmask_t ge(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t ge(reg_t x, reg_t y) { opmask_t equal = eq(x, y); opmask_t greater = _mm256_cmpgt_epi64(x, y); return _mm256_or_si256(equal, greater); } - static opmask_t eq(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t eq(reg_t x, reg_t y) { return _mm256_cmpeq_epi64(x, y); } template - static reg_t + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_i64gather(reg_t src, opmask_t mask, __m256i index, void const *base) { return _mm256_mask_i64gather_epi64( src, (const long long int *)base, index, mask, scale); } template - static reg_t + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_i64gather(reg_t src, opmask_t mask, __m128i index, void const *base) { return _mm256_mask_i32gather_epi64( src, (const long long int *)base, index, mask, scale); } - static reg_t i64gather(type_t *arr, arrsize_t *ind) + static X86_SIMD_SORT_FORCE_INLINE reg_t i64gather(type_t *arr, arrsize_t *ind) { return set(arr[ind[3]], arr[ind[2]], arr[ind[1]], arr[ind[0]]); 
} - static reg_t loadu(void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t loadu(void const *mem) { return _mm256_loadu_si256((reg_t const *)mem); } - static reg_t max(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t max(reg_t x, reg_t y) { return avx2_emu_max(x, y); } - static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) { return avx2_emu_mask_compressstoreu64(mem, mask, x); } - static int32_t double_compressstore(void *left_addr, + static X86_SIMD_SORT_FORCE_INLINE int32_t double_compressstore(void *left_addr, void *right_addr, opmask_t k, reg_t reg) @@ -123,79 +123,79 @@ struct avx2_vector { return avx2_double_compressstore64( left_addr, right_addr, k, reg); } - static reg_t maskz_loadu(opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t maskz_loadu(opmask_t mask, void const *mem) { return _mm256_maskload_epi64((const long long int *)mem, mask); } - static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) { reg_t dst = _mm256_maskload_epi64((long long int *)mem, mask); return mask_mov(x, mask, dst); } - static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) { return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(x), _mm256_castsi256_pd(y), _mm256_castsi256_pd(mask))); } - static void mask_storeu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_storeu(void *mem, opmask_t mask, reg_t x) { return _mm256_maskstore_epi64((long long int *)mem, mask, x); } - static reg_t min(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t min(reg_t x, reg_t y) { return avx2_emu_min(x, y); } template - static reg_t permutexvar(reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t permutexvar(reg_t ymm) { return _mm256_permute4x64_epi64(ymm, idx); } template - static reg_t permutevar(reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t permutevar(reg_t ymm) { return _mm256_permute4x64_epi64(ymm, idx); } - static reg_t reverse(reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t reverse(reg_t ymm) { const int32_t rev_index = SHUFFLE_MASK(0, 1, 2, 3); return permutexvar(ymm); } - static type_t reducemax(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemax(reg_t v) { return avx2_emu_reduce_max64(v); } - static type_t reducemin(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemin(reg_t v) { return avx2_emu_reduce_min64(v); } - static reg_t set1(type_t v) + static X86_SIMD_SORT_FORCE_INLINE reg_t set1(type_t v) { return _mm256_set1_epi64x(v); } template - static reg_t shuffle(reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t shuffle(reg_t ymm) { return _mm256_castpd_si256( _mm256_permute_pd(_mm256_castsi256_pd(ymm), mask)); } - static void storeu(void *mem, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void storeu(void *mem, reg_t x) { _mm256_storeu_si256((__m256i *)mem, x); } - static reg_t sort_vec(reg_t x) + static X86_SIMD_SORT_FORCE_INLINE reg_t sort_vec(reg_t x) { return sort_reg_4lanes>(x); } - static reg_t cast_from(__m256i v) + static X86_SIMD_SORT_FORCE_INLINE reg_t cast_from(__m256i v) { return v; } - static __m256i cast_to(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE __m256i cast_to(reg_t v) { return v; } - static bool all_false(opmask_t k) + static X86_SIMD_SORT_FORCE_INLINE bool all_false(opmask_t k) 
{ return _mm256_movemask_pd(_mm256_castsi256_pd(k)) == 0; } @@ -229,58 +229,58 @@ struct avx2_vector { { return _mm256_set1_epi64x(type_max()); } - static reg_t zmm_min() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_min() { return _mm256_set1_epi64x(type_min()); } - static opmask_t knot_opmask(opmask_t x) + static X86_SIMD_SORT_FORCE_INLINE opmask_t knot_opmask(opmask_t x) { auto allTrue = _mm256_set1_epi64x(0xFFFF'FFFF'FFFF'FFFF); return _mm256_xor_si256(x, allTrue); } - static opmask_t get_partial_loadmask(uint64_t num_to_read) + static X86_SIMD_SORT_FORCE_INLINE opmask_t get_partial_loadmask(uint64_t num_to_read) { auto mask = ((0x1ull << num_to_read) - 0x1ull); return convert_int_to_avx2_mask_64bit(mask); } - static opmask_t convert_int_to_mask(uint64_t intMask) + static X86_SIMD_SORT_FORCE_INLINE opmask_t convert_int_to_mask(uint64_t intMask) { return convert_int_to_avx2_mask_64bit(intMask); } - static ymmi_t seti(int64_t v1, int64_t v2, int64_t v3, int64_t v4) + static X86_SIMD_SORT_FORCE_INLINE ymmi_t seti(int64_t v1, int64_t v2, int64_t v3, int64_t v4) { return _mm256_set_epi64x(v1, v2, v3, v4); } - static reg_t set(type_t v1, type_t v2, type_t v3, type_t v4) + static X86_SIMD_SORT_FORCE_INLINE reg_t set(type_t v1, type_t v2, type_t v3, type_t v4) { return _mm256_set_epi64x(v1, v2, v3, v4); } template - static reg_t + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_i64gather(reg_t src, opmask_t mask, __m256i index, void const *base) { return _mm256_mask_i64gather_epi64( src, (const long long int *)base, index, mask, scale); } template - static reg_t + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_i64gather(reg_t src, opmask_t mask, __m128i index, void const *base) { return _mm256_mask_i32gather_epi64( src, (const long long int *)base, index, mask, scale); } - static reg_t i64gather(type_t *arr, arrsize_t *ind) + static X86_SIMD_SORT_FORCE_INLINE reg_t i64gather(type_t *arr, arrsize_t *ind) { return set(arr[ind[3]], arr[ind[2]], arr[ind[1]], arr[ind[0]]); } - static opmask_t gt(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t gt(reg_t x, reg_t y) { const __m256i offset = _mm256_set1_epi64x(0x8000000000000000); x = _mm256_xor_si256(x, offset); y = _mm256_xor_si256(y, offset); return _mm256_cmpgt_epi64(x, y); } - static opmask_t ge(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t ge(reg_t x, reg_t y) { opmask_t equal = eq(x, y); const __m256i offset = _mm256_set1_epi64x(0x8000000000000000); @@ -289,23 +289,23 @@ struct avx2_vector { opmask_t greater = _mm256_cmpgt_epi64(x, y); return _mm256_or_si256(equal, greater); } - static opmask_t eq(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t eq(reg_t x, reg_t y) { return _mm256_cmpeq_epi64(x, y); } - static reg_t loadu(void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t loadu(void const *mem) { return _mm256_loadu_si256((reg_t const *)mem); } - static reg_t max(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t max(reg_t x, reg_t y) { return avx2_emu_max(x, y); } - static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) { return avx2_emu_mask_compressstoreu64(mem, mask, x); } - static int32_t double_compressstore(void *left_addr, + static X86_SIMD_SORT_FORCE_INLINE int32_t double_compressstore(void *left_addr, void *right_addr, opmask_t k, reg_t reg) @@ -313,75 +313,75 @@ struct avx2_vector { return avx2_double_compressstore64( left_addr, right_addr, k, reg); } - static reg_t 
mask_loadu(reg_t x, opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) { reg_t dst = _mm256_maskload_epi64((const long long int *)mem, mask); return mask_mov(x, mask, dst); } - static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) { return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(x), _mm256_castsi256_pd(y), _mm256_castsi256_pd(mask))); } - static void mask_storeu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_storeu(void *mem, opmask_t mask, reg_t x) { return _mm256_maskstore_epi64((long long int *)mem, mask, x); } - static reg_t min(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t min(reg_t x, reg_t y) { return avx2_emu_min(x, y); } template - static reg_t permutexvar(reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t permutexvar(reg_t ymm) { return _mm256_permute4x64_epi64(ymm, idx); } template - static reg_t permutevar(reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t permutevar(reg_t ymm) { return _mm256_permute4x64_epi64(ymm, idx); } - static reg_t reverse(reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t reverse(reg_t ymm) { const int32_t rev_index = SHUFFLE_MASK(0, 1, 2, 3); return permutexvar(ymm); } - static type_t reducemax(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemax(reg_t v) { return avx2_emu_reduce_max64(v); } - static type_t reducemin(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemin(reg_t v) { return avx2_emu_reduce_min64(v); } - static reg_t set1(type_t v) + static X86_SIMD_SORT_FORCE_INLINE reg_t set1(type_t v) { return _mm256_set1_epi64x(v); } template - static reg_t shuffle(reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t shuffle(reg_t ymm) { return _mm256_castpd_si256( _mm256_permute_pd(_mm256_castsi256_pd(ymm), mask)); } - static void storeu(void *mem, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void storeu(void *mem, reg_t x) { _mm256_storeu_si256((__m256i *)mem, x); } - static reg_t sort_vec(reg_t x) + static X86_SIMD_SORT_FORCE_INLINE reg_t sort_vec(reg_t x) { return sort_reg_4lanes>(x); } - static reg_t cast_from(__m256i v) + static X86_SIMD_SORT_FORCE_INLINE reg_t cast_from(__m256i v) { return v; } - static __m256i cast_to(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE __m256i cast_to(reg_t v) { return v; } - static bool all_false(opmask_t k) + static X86_SIMD_SORT_FORCE_INLINE bool all_false(opmask_t k) { return _mm256_movemask_pd(_mm256_castsi256_pd(k)) == 0; } @@ -423,62 +423,62 @@ struct avx2_vector { { return -X86_SIMD_SORT_INFINITY; } - static reg_t zmm_max() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_max() { return _mm256_set1_pd(type_max()); } - static reg_t zmm_min() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_min() { return _mm256_set1_pd(type_min()); } - static opmask_t knot_opmask(opmask_t x) + static X86_SIMD_SORT_FORCE_INLINE opmask_t knot_opmask(opmask_t x) { auto allTrue = _mm256_set1_epi64x(0xFFFF'FFFF'FFFF'FFFF); return _mm256_xor_si256(x, allTrue); } - static opmask_t get_partial_loadmask(uint64_t num_to_read) + static X86_SIMD_SORT_FORCE_INLINE opmask_t get_partial_loadmask(uint64_t num_to_read) { auto mask = ((0x1ull << num_to_read) - 0x1ull); return convert_int_to_avx2_mask_64bit(mask); } - static opmask_t convert_int_to_mask(uint64_t intMask) + static X86_SIMD_SORT_FORCE_INLINE opmask_t convert_int_to_mask(uint64_t intMask) { return convert_int_to_avx2_mask_64bit(intMask); } - 
static int32_t convert_mask_to_int(opmask_t mask) + static X86_SIMD_SORT_FORCE_INLINE int32_t convert_mask_to_int(opmask_t mask) { return convert_avx2_mask_to_int_64bit(mask); } template - static opmask_t fpclass(reg_t x) + static X86_SIMD_SORT_FORCE_INLINE opmask_t fpclass(reg_t x) { if constexpr (type != (0x01 | 0x80)) { static_assert(type == (0x01 | 0x80), "should not reach here"); } return _mm256_castpd_si256(_mm256_cmp_pd(x, x, _CMP_UNORD_Q)); } - static ymmi_t seti(int64_t v1, int64_t v2, int64_t v3, int64_t v4) + static X86_SIMD_SORT_FORCE_INLINE ymmi_t seti(int64_t v1, int64_t v2, int64_t v3, int64_t v4) { return _mm256_set_epi64x(v1, v2, v3, v4); } - static reg_t set(type_t v1, type_t v2, type_t v3, type_t v4) + static X86_SIMD_SORT_FORCE_INLINE reg_t set(type_t v1, type_t v2, type_t v3, type_t v4) { return _mm256_set_pd(v1, v2, v3, v4); } - static reg_t maskz_loadu(opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t maskz_loadu(opmask_t mask, void const *mem) { return _mm256_maskload_pd((const double *)mem, mask); } - static opmask_t ge(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t ge(reg_t x, reg_t y) { return _mm256_castpd_si256(_mm256_cmp_pd(x, y, _CMP_GE_OQ)); } - static opmask_t eq(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t eq(reg_t x, reg_t y) { return _mm256_castpd_si256(_mm256_cmp_pd(x, y, _CMP_EQ_OQ)); } template - static reg_t + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_i64gather(reg_t src, opmask_t mask, __m256i index, void const *base) { return _mm256_mask_i64gather_pd(src, @@ -489,7 +489,7 @@ struct avx2_vector { ; } template - static reg_t + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_i64gather(reg_t src, opmask_t mask, __m128i index, void const *base) { return _mm256_mask_i32gather_pd(src, @@ -499,23 +499,23 @@ struct avx2_vector { scale); ; } - static reg_t i64gather(type_t *arr, arrsize_t *ind) + static X86_SIMD_SORT_FORCE_INLINE reg_t i64gather(type_t *arr, arrsize_t *ind) { return set(arr[ind[3]], arr[ind[2]], arr[ind[1]], arr[ind[0]]); } - static reg_t loadu(void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t loadu(void const *mem) { return _mm256_loadu_pd((double const *)mem); } - static reg_t max(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t max(reg_t x, reg_t y) { return _mm256_max_pd(x, y); } - static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) { return avx2_emu_mask_compressstoreu64(mem, mask, x); } - static int32_t double_compressstore(void *left_addr, + static X86_SIMD_SORT_FORCE_INLINE int32_t double_compressstore(void *left_addr, void *right_addr, opmask_t k, reg_t reg) @@ -523,72 +523,72 @@ struct avx2_vector { return avx2_double_compressstore64( left_addr, right_addr, k, reg); } - static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) { reg_t dst = _mm256_maskload_pd((type_t *)mem, mask); return mask_mov(x, mask, dst); } - static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) { return _mm256_blendv_pd(x, y, _mm256_castsi256_pd(mask)); } - static void mask_storeu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_storeu(void *mem, opmask_t mask, reg_t x) { return _mm256_maskstore_pd((type_t *)mem, mask, x); } - static reg_t min(reg_t x, reg_t 
y) + static X86_SIMD_SORT_FORCE_INLINE reg_t min(reg_t x, reg_t y) { return _mm256_min_pd(x, y); } template - static reg_t permutexvar(reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t permutexvar(reg_t ymm) { return _mm256_permute4x64_pd(ymm, idx); } template - static reg_t permutevar(reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t permutevar(reg_t ymm) { return _mm256_permute4x64_pd(ymm, idx); } - static reg_t reverse(reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t reverse(reg_t ymm) { const int32_t rev_index = SHUFFLE_MASK(0, 1, 2, 3); return permutexvar(ymm); } - static type_t reducemax(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemax(reg_t v) { return avx2_emu_reduce_max64(v); } - static type_t reducemin(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemin(reg_t v) { return avx2_emu_reduce_min64(v); } - static reg_t set1(type_t v) + static X86_SIMD_SORT_FORCE_INLINE reg_t set1(type_t v) { return _mm256_set1_pd(v); } template - static reg_t shuffle(reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t shuffle(reg_t ymm) { return _mm256_permute_pd(ymm, mask); } - static void storeu(void *mem, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void storeu(void *mem, reg_t x) { _mm256_storeu_pd((double *)mem, x); } - static reg_t sort_vec(reg_t x) + static X86_SIMD_SORT_FORCE_INLINE reg_t sort_vec(reg_t x) { return sort_reg_4lanes>(x); } - static reg_t cast_from(__m256i v) + static X86_SIMD_SORT_FORCE_INLINE reg_t cast_from(__m256i v) { return _mm256_castsi256_pd(v); } - static __m256i cast_to(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE __m256i cast_to(reg_t v) { return _mm256_castpd_si256(v); } - static bool all_false(opmask_t k) + static X86_SIMD_SORT_FORCE_INLINE bool all_false(opmask_t k) { return _mm256_movemask_pd(_mm256_castsi256_pd(k)) == 0; } @@ -596,7 +596,7 @@ struct avx2_vector { struct avx2_64bit_swizzle_ops { template - X86_SIMD_SORT_INLINE typename vtype::reg_t swap_n(typename vtype::reg_t reg) + static X86_SIMD_SORT_FORCE_INLINE typename vtype::reg_t swap_n(typename vtype::reg_t reg) { __m256i v = vtype::cast_to(reg); @@ -614,7 +614,7 @@ struct avx2_64bit_swizzle_ops { } template - X86_SIMD_SORT_INLINE typename vtype::reg_t + static X86_SIMD_SORT_FORCE_INLINE typename vtype::reg_t reverse_n(typename vtype::reg_t reg) { __m256i v = vtype::cast_to(reg); @@ -631,7 +631,7 @@ struct avx2_64bit_swizzle_ops { } template - X86_SIMD_SORT_INLINE typename vtype::reg_t + static X86_SIMD_SORT_FORCE_INLINE typename vtype::reg_t merge_n(typename vtype::reg_t reg, typename vtype::reg_t other) { __m256d v1 = _mm256_castsi256_pd(vtype::cast_to(reg)); diff --git a/src/avx2-emu-funcs.hpp b/src/avx2-emu-funcs.hpp index a30da7c..9baa126 100644 --- a/src/avx2-emu-funcs.hpp +++ b/src/avx2-emu-funcs.hpp @@ -143,40 +143,40 @@ constexpr auto avx2_compressstore_lut64_perm constexpr auto avx2_compressstore_lut64_left = avx2_compressstore_lut64_gen.second; -X86_SIMD_SORT_INLINE +X86_SIMD_SORT_FORCE_INLINE __m256i convert_int_to_avx2_mask(int32_t m) { return _mm256_loadu_si256( (const __m256i *)avx2_mask_helper_lut32[m].data()); } -X86_SIMD_SORT_INLINE +X86_SIMD_SORT_FORCE_INLINE int32_t convert_avx2_mask_to_int(__m256i m) { return _mm256_movemask_ps(_mm256_castsi256_ps(m)); } -X86_SIMD_SORT_INLINE +X86_SIMD_SORT_FORCE_INLINE __m256i convert_int_to_avx2_mask_64bit(int32_t m) { return _mm256_loadu_si256( (const __m256i *)avx2_mask_helper_lut64[m].data()); } -X86_SIMD_SORT_INLINE +X86_SIMD_SORT_FORCE_INLINE int32_t convert_avx2_mask_to_int_64bit(__m256i m) { return 
_mm256_movemask_pd(_mm256_castsi256_pd(m));
 }
 
-X86_SIMD_SORT_INLINE
+X86_SIMD_SORT_FORCE_INLINE
 __m128i convert_int_to_avx2_mask_half(int32_t m)
 {
     return _mm_loadu_si128(
             (const __m128i *)avx2_mask_helper_lut32_half[m].data());
 }
 
-X86_SIMD_SORT_INLINE
+X86_SIMD_SORT_FORCE_INLINE
 int32_t convert_avx2_mask_to_int_half(__m128i m)
 {
     return _mm_movemask_ps(_mm_castsi128_ps(m));
@@ -184,7 +184,7 @@ int32_t convert_avx2_mask_to_int_half(__m128i m)
 
 // Emulators for intrinsics missing from AVX2 compared to AVX512
 template <typename T>
-T avx2_emu_reduce_max32(typename avx2_vector<T>::reg_t x)
+X86_SIMD_SORT_FORCE_INLINE T avx2_emu_reduce_max32(typename avx2_vector<T>::reg_t x)
 {
     using vtype = avx2_vector<T>;
     using reg_t = typename vtype::reg_t;
@@ -199,7 +199,7 @@ T avx2_emu_reduce_max32(typename avx2_vector<T>::reg_t x)
 }
 
 template <typename T>
-T avx2_emu_reduce_max32_half(typename avx2_half_vector<T>::reg_t x)
+X86_SIMD_SORT_FORCE_INLINE T avx2_emu_reduce_max32_half(typename avx2_half_vector<T>::reg_t x)
 {
     using vtype = avx2_half_vector<T>;
     using reg_t = typename vtype::reg_t;
@@ -212,7 +212,7 @@ T avx2_emu_reduce_max32_half(typename avx2_half_vector<T>::reg_t x)
 }
 
 template <typename T>
-T avx2_emu_reduce_min32(typename avx2_vector<T>::reg_t x)
+X86_SIMD_SORT_FORCE_INLINE T avx2_emu_reduce_min32(typename avx2_vector<T>::reg_t x)
 {
     using vtype = avx2_vector<T>;
     using reg_t = typename vtype::reg_t;
@@ -227,7 +227,7 @@ T avx2_emu_reduce_min32(typename avx2_vector<T>::reg_t x)
 }
 
 template <typename T>
-T avx2_emu_reduce_min32_half(typename avx2_half_vector<T>::reg_t x)
+X86_SIMD_SORT_FORCE_INLINE T avx2_emu_reduce_min32_half(typename avx2_half_vector<T>::reg_t x)
 {
     using vtype = avx2_half_vector<T>;
     using reg_t = typename vtype::reg_t;
@@ -240,7 +240,7 @@ T avx2_emu_reduce_min32_half(typename avx2_half_vector<T>::reg_t x)
 }
 
 template <typename T>
-T avx2_emu_reduce_max64(typename avx2_vector<T>::reg_t x)
+X86_SIMD_SORT_FORCE_INLINE T avx2_emu_reduce_max64(typename avx2_vector<T>::reg_t x)
 {
     using vtype = avx2_vector<T>;
     typename vtype::reg_t inter1 = vtype::max(
@@ -251,7 +251,7 @@ T avx2_emu_reduce_max64(typename avx2_vector<T>::reg_t x)
 }
 
 template <typename T>
-T avx2_emu_reduce_min64(typename avx2_vector<T>::reg_t x)
+X86_SIMD_SORT_FORCE_INLINE T avx2_emu_reduce_min64(typename avx2_vector<T>::reg_t x)
 {
     using vtype = avx2_vector<T>;
     typename vtype::reg_t inter1 = vtype::min(
@@ -262,7 +262,7 @@ T avx2_emu_reduce_min64(typename avx2_vector<T>::reg_t x)
 }
 
 template <typename T>
-void avx2_emu_mask_compressstoreu32(void *base_addr,
+X86_SIMD_SORT_FORCE_INLINE void avx2_emu_mask_compressstoreu32(void *base_addr,
                                     typename avx2_vector<T>::opmask_t k,
                                     typename avx2_vector<T>::reg_t reg)
 {
@@ -282,7 +282,7 @@ void avx2_emu_mask_compressstoreu32(void *base_addr,
 }
 
 template <typename T>
-void avx2_emu_mask_compressstoreu32_half(
+X86_SIMD_SORT_FORCE_INLINE void avx2_emu_mask_compressstoreu32_half(
         void *base_addr,
         typename avx2_half_vector<T>::opmask_t k,
         typename avx2_half_vector<T>::reg_t reg)
@@ -305,7 +305,7 @@ void avx2_emu_mask_compressstoreu32_half(
 }
 
 template <typename T>
-void avx2_emu_mask_compressstoreu64(void *base_addr,
+X86_SIMD_SORT_FORCE_INLINE void avx2_emu_mask_compressstoreu64(void *base_addr,
                                     typename avx2_vector<T>::opmask_t k,
                                     typename avx2_vector<T>::reg_t reg)
 {
@@ -326,7 +326,7 @@ void avx2_emu_mask_compressstoreu64(void *base_addr,
 }
 
 template <typename T>
-int avx2_double_compressstore32(void *left_addr,
+X86_SIMD_SORT_FORCE_INLINE int avx2_double_compressstore32(void *left_addr,
                                 void *right_addr,
                                 typename avx2_vector<T>::opmask_t k,
                                 typename avx2_vector<T>::reg_t reg)
@@ -349,7 +349,7 @@ int avx2_double_compressstore32(void *left_addr,
 }
 
 template <typename T>
-int avx2_double_compressstore32_half(void *left_addr,
+X86_SIMD_SORT_FORCE_INLINE int avx2_double_compressstore32_half(void *left_addr, void *right_addr, typename avx2_half_vector::opmask_t k, typename avx2_half_vector::reg_t reg) @@ -373,7 +373,7 @@ int avx2_double_compressstore32_half(void *left_addr, } template -int32_t avx2_double_compressstore64(void *left_addr, +X86_SIMD_SORT_FORCE_INLINE int32_t avx2_double_compressstore64(void *left_addr, void *right_addr, typename avx2_vector::opmask_t k, typename avx2_vector::reg_t reg) @@ -397,7 +397,7 @@ int32_t avx2_double_compressstore64(void *left_addr, } template -typename avx2_vector::reg_t avx2_emu_max(typename avx2_vector::reg_t x, +X86_SIMD_SORT_FORCE_INLINE typename avx2_vector::reg_t avx2_emu_max(typename avx2_vector::reg_t x, typename avx2_vector::reg_t y) { using vtype = avx2_vector; @@ -408,7 +408,7 @@ typename avx2_vector::reg_t avx2_emu_max(typename avx2_vector::reg_t x, } template -typename avx2_vector::reg_t avx2_emu_min(typename avx2_vector::reg_t x, +X86_SIMD_SORT_FORCE_INLINE typename avx2_vector::reg_t avx2_emu_min(typename avx2_vector::reg_t x, typename avx2_vector::reg_t y) { using vtype = avx2_vector; diff --git a/src/avx512-16bit-common.h b/src/avx512-16bit-common.h index e1a76d3..18d6dd0 100644 --- a/src/avx512-16bit-common.h +++ b/src/avx512-16bit-common.h @@ -9,7 +9,7 @@ struct avx512_16bit_swizzle_ops { template - X86_SIMD_SORT_INLINE typename vtype::reg_t swap_n(typename vtype::reg_t reg) + static X86_SIMD_SORT_FORCE_INLINE typename vtype::reg_t swap_n(typename vtype::reg_t reg) { __m512i v = vtype::cast_to(reg); @@ -41,7 +41,7 @@ struct avx512_16bit_swizzle_ops { } template - X86_SIMD_SORT_INLINE typename vtype::reg_t + static X86_SIMD_SORT_FORCE_INLINE typename vtype::reg_t reverse_n(typename vtype::reg_t reg) { __m512i v = vtype::cast_to(reg); @@ -82,7 +82,7 @@ struct avx512_16bit_swizzle_ops { } template - X86_SIMD_SORT_INLINE typename vtype::reg_t + static X86_SIMD_SORT_FORCE_INLINE typename vtype::reg_t merge_n(typename vtype::reg_t reg, typename vtype::reg_t other) { __m512i v1 = vtype::cast_to(reg); diff --git a/src/avx512-16bit-qsort.hpp b/src/avx512-16bit-qsort.hpp index fbe1856..715e202 100644 --- a/src/avx512-16bit-qsort.hpp +++ b/src/avx512-16bit-qsort.hpp @@ -34,20 +34,20 @@ struct zmm_vector { { return X86_SIMD_SORT_NEGINFINITYH; } - static reg_t zmm_max() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_max() { return _mm512_set1_epi16(type_max()); } - static reg_t zmm_min() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_min() { return _mm512_set1_epi16(type_min()); } - static opmask_t knot_opmask(opmask_t x) + static X86_SIMD_SORT_FORCE_INLINE opmask_t knot_opmask(opmask_t x) { return _knot_mask32(x); } - static opmask_t ge(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t ge(reg_t x, reg_t y) { reg_t sign_x = _mm512_and_si512(x, _mm512_set1_epi16(0x8000)); reg_t sign_y = _mm512_and_si512(y, _mm512_set1_epi16(0x8000)); @@ -77,49 +77,49 @@ struct zmm_vector { exp_eq, mant_x, mant_y, _MM_CMPINT_NLT); return _kxor_mask32(mask_ge, neg); } - static opmask_t eq(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t eq(reg_t x, reg_t y) { return _mm512_cmpeq_epu16_mask(x, y); } - static opmask_t get_partial_loadmask(uint64_t num_to_read) + static X86_SIMD_SORT_FORCE_INLINE opmask_t get_partial_loadmask(uint64_t num_to_read) { return ((0x1ull << num_to_read) - 0x1ull); } - static int32_t convert_mask_to_int(opmask_t mask) + static X86_SIMD_SORT_FORCE_INLINE int32_t convert_mask_to_int(opmask_t mask) { return mask; } - static reg_t loadu(void 
const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t loadu(void const *mem) { return _mm512_loadu_si512(mem); } - static reg_t max(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t max(reg_t x, reg_t y) { return _mm512_mask_mov_epi16(y, ge(x, y), x); } - static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) { // AVX512_VBMI2 return _mm512_mask_compressstoreu_epi16(mem, mask, x); } - static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) { // AVX512BW return _mm512_mask_loadu_epi16(x, mask, mem); } - static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) { return _mm512_mask_mov_epi16(x, mask, y); } - static void mask_storeu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_storeu(void *mem, opmask_t mask, reg_t x) { return _mm512_mask_storeu_epi16(mem, mask, x); } - static reg_t min(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t min(reg_t x, reg_t y) { return _mm512_mask_mov_epi16(x, ge(x, y), y); } - static reg_t permutexvar(__m512i idx, reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t permutexvar(__m512i idx, reg_t zmm) { return _mm512_permutexvar_epi16(idx, zmm); } @@ -133,13 +133,13 @@ struct zmm_vector { // __m128 xmm2 = _mm_cvtph_ps(xmm); // return _mm_cvtss_f32(xmm2); //} - static type_t float_to_uint16(float val) + static X86_SIMD_SORT_FORCE_INLINE type_t float_to_uint16(float val) { __m128 xmm = _mm_load_ss(&val); __m128i xmm2 = _mm_cvtps_ph(xmm, _MM_FROUND_NO_EXC); return _mm_extract_epi16(xmm2, 0); } - static type_t reducemax(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemax(reg_t v) { __m512 lo = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(v, 0)); __m512 hi = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(v, 1)); @@ -147,7 +147,7 @@ struct zmm_vector { float hi_max = _mm512_reduce_max_ps(hi); return float_to_uint16(std::max(lo_max, hi_max)); } - static type_t reducemin(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemin(reg_t v) { __m512 lo = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(v, 0)); __m512 hi = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(v, 1)); @@ -155,43 +155,43 @@ struct zmm_vector { float hi_max = _mm512_reduce_min_ps(hi); return float_to_uint16(std::min(lo_max, hi_max)); } - static reg_t set1(type_t v) + static X86_SIMD_SORT_FORCE_INLINE reg_t set1(type_t v) { return _mm512_set1_epi16(v); } template - static reg_t shuffle(reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t shuffle(reg_t zmm) { zmm = _mm512_shufflehi_epi16(zmm, (_MM_PERM_ENUM)mask); return _mm512_shufflelo_epi16(zmm, (_MM_PERM_ENUM)mask); } - static void storeu(void *mem, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void storeu(void *mem, reg_t x) { return _mm512_storeu_si512(mem, x); } - static reg_t reverse(reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t reverse(reg_t zmm) { constexpr static uint16_t arr[] = {NETWORK_REVERSE_32LANES}; const auto rev_index = _mm512_loadu_si512(arr); return permutexvar(rev_index, zmm); } - static reg_t sort_vec(reg_t x) + static X86_SIMD_SORT_FORCE_INLINE reg_t sort_vec(reg_t x) { return sort_reg_32lanes>(x); } - static reg_t cast_from(__m512i v) + static X86_SIMD_SORT_FORCE_INLINE reg_t cast_from(__m512i v) { return v; } - static __m512i cast_to(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE 
__m512i cast_to(reg_t v) { return v; } - static bool all_false(opmask_t k) + static X86_SIMD_SORT_FORCE_INLINE bool all_false(opmask_t k) { return k == 0; } - static int double_compressstore(type_t *left_addr, + static X86_SIMD_SORT_FORCE_INLINE int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, reg_t reg) @@ -226,66 +226,66 @@ struct zmm_vector { { return X86_SIMD_SORT_MIN_INT16; } - static reg_t zmm_max() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_max() { return _mm512_set1_epi16(type_max()); } - static reg_t zmm_min() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_min() { return _mm512_set1_epi16(type_min()); } - static opmask_t knot_opmask(opmask_t x) + static X86_SIMD_SORT_FORCE_INLINE opmask_t knot_opmask(opmask_t x) { return _knot_mask32(x); } - static opmask_t ge(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t ge(reg_t x, reg_t y) { return _mm512_cmp_epi16_mask(x, y, _MM_CMPINT_NLT); } - static opmask_t eq(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t eq(reg_t x, reg_t y) { return _mm512_cmpeq_epi16_mask(x, y); } - static opmask_t get_partial_loadmask(uint64_t num_to_read) + static X86_SIMD_SORT_FORCE_INLINE opmask_t get_partial_loadmask(uint64_t num_to_read) { return ((0x1ull << num_to_read) - 0x1ull); } - static reg_t loadu(void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t loadu(void const *mem) { return _mm512_loadu_si512(mem); } - static reg_t max(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t max(reg_t x, reg_t y) { return _mm512_max_epi16(x, y); } - static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) { // AVX512_VBMI2 return _mm512_mask_compressstoreu_epi16(mem, mask, x); } - static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) { // AVX512BW return _mm512_mask_loadu_epi16(x, mask, mem); } - static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) { return _mm512_mask_mov_epi16(x, mask, y); } - static void mask_storeu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_storeu(void *mem, opmask_t mask, reg_t x) { return _mm512_mask_storeu_epi16(mem, mask, x); } - static reg_t min(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t min(reg_t x, reg_t y) { return _mm512_min_epi16(x, y); } - static reg_t permutexvar(__m512i idx, reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t permutexvar(__m512i idx, reg_t zmm) { return _mm512_permutexvar_epi16(idx, zmm); } - static type_t reducemax(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemax(reg_t v) { reg_t lo = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 0)); reg_t hi = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 1)); @@ -293,7 +293,7 @@ struct zmm_vector { type_t hi_max = (type_t)_mm512_reduce_max_epi32(hi); return std::max(lo_max, hi_max); } - static type_t reducemin(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemin(reg_t v) { reg_t lo = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 0)); reg_t hi = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 1)); @@ -301,43 +301,43 @@ struct zmm_vector { type_t hi_min = (type_t)_mm512_reduce_min_epi32(hi); return std::min(lo_min, hi_min); } - static reg_t set1(type_t v) + static X86_SIMD_SORT_FORCE_INLINE reg_t set1(type_t v) { return 
_mm512_set1_epi16(v); } template - static reg_t shuffle(reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t shuffle(reg_t zmm) { zmm = _mm512_shufflehi_epi16(zmm, (_MM_PERM_ENUM)mask); return _mm512_shufflelo_epi16(zmm, (_MM_PERM_ENUM)mask); } - static void storeu(void *mem, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void storeu(void *mem, reg_t x) { return _mm512_storeu_si512(mem, x); } - static reg_t reverse(reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t reverse(reg_t zmm) { constexpr static uint16_t arr[] = {NETWORK_REVERSE_32LANES}; const auto rev_index = _mm512_loadu_si512(arr); return permutexvar(rev_index, zmm); } - static reg_t sort_vec(reg_t x) + static X86_SIMD_SORT_FORCE_INLINE reg_t sort_vec(reg_t x) { return sort_reg_32lanes>(x); } - static reg_t cast_from(__m512i v) + static X86_SIMD_SORT_FORCE_INLINE reg_t cast_from(__m512i v) { return v; } - static __m512i cast_to(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE __m512i cast_to(reg_t v) { return v; } - static bool all_false(opmask_t k) + static X86_SIMD_SORT_FORCE_INLINE bool all_false(opmask_t k) { return k == 0; } - static int double_compressstore(type_t *left_addr, + static X86_SIMD_SORT_FORCE_INLINE int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, reg_t reg) @@ -371,64 +371,64 @@ struct zmm_vector { { return 0; } - static reg_t zmm_max() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_max() { return _mm512_set1_epi16(type_max()); } - static reg_t zmm_min() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_min() { return _mm512_set1_epi16(type_min()); } - static opmask_t knot_opmask(opmask_t x) + static X86_SIMD_SORT_FORCE_INLINE opmask_t knot_opmask(opmask_t x) { return _knot_mask32(x); } - static opmask_t ge(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t ge(reg_t x, reg_t y) { return _mm512_cmp_epu16_mask(x, y, _MM_CMPINT_NLT); } - static opmask_t eq(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t eq(reg_t x, reg_t y) { return _mm512_cmpeq_epu16_mask(x, y); } - static opmask_t get_partial_loadmask(uint64_t num_to_read) + static X86_SIMD_SORT_FORCE_INLINE opmask_t get_partial_loadmask(uint64_t num_to_read) { return ((0x1ull << num_to_read) - 0x1ull); } - static reg_t loadu(void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t loadu(void const *mem) { return _mm512_loadu_si512(mem); } - static reg_t max(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t max(reg_t x, reg_t y) { return _mm512_max_epu16(x, y); } - static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) { return _mm512_mask_compressstoreu_epi16(mem, mask, x); } - static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) { return _mm512_mask_loadu_epi16(x, mask, mem); } - static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) { return _mm512_mask_mov_epi16(x, mask, y); } - static void mask_storeu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_storeu(void *mem, opmask_t mask, reg_t x) { return _mm512_mask_storeu_epi16(mem, mask, x); } - static reg_t min(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t min(reg_t x, reg_t y) { return _mm512_min_epu16(x, y); } - static reg_t permutexvar(__m512i idx, reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t 
permutexvar(__m512i idx, reg_t zmm) { return _mm512_permutexvar_epi16(idx, zmm); } - static type_t reducemax(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemax(reg_t v) { reg_t lo = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 0)); reg_t hi = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 1)); @@ -436,7 +436,7 @@ struct zmm_vector { type_t hi_max = (type_t)_mm512_reduce_max_epi32(hi); return std::max(lo_max, hi_max); } - static type_t reducemin(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemin(reg_t v) { reg_t lo = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 0)); reg_t hi = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 1)); @@ -444,43 +444,43 @@ struct zmm_vector { type_t hi_min = (type_t)_mm512_reduce_min_epi32(hi); return std::min(lo_min, hi_min); } - static reg_t set1(type_t v) + static X86_SIMD_SORT_FORCE_INLINE reg_t set1(type_t v) { return _mm512_set1_epi16(v); } template - static reg_t shuffle(reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t shuffle(reg_t zmm) { zmm = _mm512_shufflehi_epi16(zmm, (_MM_PERM_ENUM)mask); return _mm512_shufflelo_epi16(zmm, (_MM_PERM_ENUM)mask); } - static void storeu(void *mem, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void storeu(void *mem, reg_t x) { return _mm512_storeu_si512(mem, x); } - static reg_t reverse(reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t reverse(reg_t zmm) { constexpr static uint16_t arr[] = {NETWORK_REVERSE_32LANES}; const auto rev_index = _mm512_loadu_si512(arr); return permutexvar(rev_index, zmm); } - static reg_t sort_vec(reg_t x) + static X86_SIMD_SORT_FORCE_INLINE reg_t sort_vec(reg_t x) { return sort_reg_32lanes>(x); } - static reg_t cast_from(__m512i v) + static X86_SIMD_SORT_FORCE_INLINE reg_t cast_from(__m512i v) { return v; } - static __m512i cast_to(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE __m512i cast_to(reg_t v) { return v; } - static bool all_false(opmask_t k) + static X86_SIMD_SORT_FORCE_INLINE bool all_false(opmask_t k) { return k == 0; } - static int double_compressstore(type_t *left_addr, + static X86_SIMD_SORT_FORCE_INLINE int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, reg_t reg) @@ -590,7 +590,7 @@ avx512_qsort_fp16(uint16_t *arr, if (arrsize > 1) { arrsize_t nan_count = 0; - if (UNLIKELY(hasnan)) { + if (hasnan) [[unlikely]] { nan_count = replace_nan_with_inf(arr, arrsize); } if (descending) { @@ -623,7 +623,7 @@ avx512_qselect_fp16(uint16_t *arr, arrsize_t index_first_elem = 0; arrsize_t index_last_elem = arrsize - 1; - if (UNLIKELY(hasnan)) { + if (hasnan) [[unlikely]] { if (descending) { index_first_elem = move_nans_to_start_of_array(arr, arrsize); } diff --git a/src/avx512-32bit-qsort.hpp b/src/avx512-32bit-qsort.hpp index ffcd85a..ecc2d6c 100644 --- a/src/avx512-32bit-qsort.hpp +++ b/src/avx512-32bit-qsort.hpp @@ -36,86 +36,86 @@ struct zmm_vector { { return X86_SIMD_SORT_MIN_INT32; } - static reg_t zmm_max() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_max() { return _mm512_set1_epi32(type_max()); } - static reg_t zmm_min() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_min() { return _mm512_set1_epi32(type_min()); } - static opmask_t knot_opmask(opmask_t x) + static X86_SIMD_SORT_FORCE_INLINE opmask_t knot_opmask(opmask_t x) { return _mm512_knot(x); } - static opmask_t ge(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t ge(reg_t x, reg_t y) { return _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_NLT); } - static opmask_t eq(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t eq(reg_t x, reg_t y) { 
return _mm512_cmpeq_epi32_mask(x, y); } - static opmask_t get_partial_loadmask(uint64_t num_to_read) + static X86_SIMD_SORT_FORCE_INLINE opmask_t get_partial_loadmask(uint64_t num_to_read) { return ((0x1ull << num_to_read) - 0x1ull); } template - static halfreg_t i64gather(__m512i index, void const *base) + static X86_SIMD_SORT_FORCE_INLINE halfreg_t i64gather(__m512i index, void const *base) { return _mm512_i64gather_epi32(index, base, scale); } - static reg_t merge(halfreg_t y1, halfreg_t y2) + static X86_SIMD_SORT_FORCE_INLINE reg_t merge(halfreg_t y1, halfreg_t y2) { reg_t z1 = _mm512_castsi256_si512(y1); return _mm512_inserti32x8(z1, y2, 1); } - static reg_t loadu(void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t loadu(void const *mem) { return _mm512_loadu_si512(mem); } - static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) { return _mm512_mask_compressstoreu_epi32(mem, mask, x); } - static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) { return _mm512_mask_loadu_epi32(x, mask, mem); } - static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) { return _mm512_mask_mov_epi32(x, mask, y); } - static void mask_storeu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_storeu(void *mem, opmask_t mask, reg_t x) { return _mm512_mask_storeu_epi32(mem, mask, x); } - static reg_t min(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t min(reg_t x, reg_t y) { return _mm512_min_epi32(x, y); } - static reg_t max(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t max(reg_t x, reg_t y) { return _mm512_max_epi32(x, y); } - static reg_t permutexvar(__m512i idx, reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t permutexvar(__m512i idx, reg_t zmm) { return _mm512_permutexvar_epi32(idx, zmm); } - static type_t reducemax(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemax(reg_t v) { return _mm512_reduce_max_epi32(v); } - static type_t reducemin(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemin(reg_t v) { return _mm512_reduce_min_epi32(v); } - static reg_t set1(type_t v) + static X86_SIMD_SORT_FORCE_INLINE reg_t set1(type_t v) { return _mm512_set1_epi32(v); } - static regi_t seti(int v1, + static X86_SIMD_SORT_FORCE_INLINE regi_t seti(int v1, int v2, int v3, int v4, @@ -150,45 +150,45 @@ struct zmm_vector { v16); } template - static reg_t shuffle(reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t shuffle(reg_t zmm) { return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask); } - static void storeu(void *mem, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void storeu(void *mem, reg_t x) { return _mm512_storeu_si512(mem, x); } - static halfreg_t max(halfreg_t x, halfreg_t y) + static X86_SIMD_SORT_FORCE_INLINE halfreg_t max(halfreg_t x, halfreg_t y) { return _mm256_max_epi32(x, y); } - static halfreg_t min(halfreg_t x, halfreg_t y) + static X86_SIMD_SORT_FORCE_INLINE halfreg_t min(halfreg_t x, halfreg_t y) { return _mm256_min_epi32(x, y); } - static reg_t reverse(reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t reverse(reg_t zmm) { const auto rev_index = _mm512_set_epi32(NETWORK_REVERSE_16LANES); return permutexvar(rev_index, zmm); } - static reg_t sort_vec(reg_t x) + static X86_SIMD_SORT_FORCE_INLINE reg_t sort_vec(reg_t x) { return 
sort_reg_16lanes>(x); } - static reg_t cast_from(__m512i v) + static X86_SIMD_SORT_FORCE_INLINE reg_t cast_from(__m512i v) { return v; } - static __m512i cast_to(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE __m512i cast_to(reg_t v) { return v; } - static bool all_false(opmask_t k) + static X86_SIMD_SORT_FORCE_INLINE bool all_false(opmask_t k) { return k == 0; } - static int double_compressstore(type_t *left_addr, + static X86_SIMD_SORT_FORCE_INLINE int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, reg_t reg) @@ -223,86 +223,86 @@ struct zmm_vector { { return 0; } - static reg_t zmm_max() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_max() { return _mm512_set1_epi32(type_max()); } - static reg_t zmm_min() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_min() { return _mm512_set1_epi32(type_min()); } template - static halfreg_t i64gather(__m512i index, void const *base) + static X86_SIMD_SORT_FORCE_INLINE halfreg_t i64gather(__m512i index, void const *base) { return _mm512_i64gather_epi32(index, base, scale); } - static reg_t merge(halfreg_t y1, halfreg_t y2) + static X86_SIMD_SORT_FORCE_INLINE reg_t merge(halfreg_t y1, halfreg_t y2) { reg_t z1 = _mm512_castsi256_si512(y1); return _mm512_inserti32x8(z1, y2, 1); } - static opmask_t knot_opmask(opmask_t x) + static X86_SIMD_SORT_FORCE_INLINE opmask_t knot_opmask(opmask_t x) { return _mm512_knot(x); } - static opmask_t ge(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t ge(reg_t x, reg_t y) { return _mm512_cmp_epu32_mask(x, y, _MM_CMPINT_NLT); } - static opmask_t eq(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t eq(reg_t x, reg_t y) { return _mm512_cmpeq_epu32_mask(x, y); } - static opmask_t get_partial_loadmask(uint64_t num_to_read) + static X86_SIMD_SORT_FORCE_INLINE opmask_t get_partial_loadmask(uint64_t num_to_read) { return ((0x1ull << num_to_read) - 0x1ull); } - static reg_t loadu(void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t loadu(void const *mem) { return _mm512_loadu_si512(mem); } - static reg_t max(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t max(reg_t x, reg_t y) { return _mm512_max_epu32(x, y); } - static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) { return _mm512_mask_compressstoreu_epi32(mem, mask, x); } - static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) { return _mm512_mask_loadu_epi32(x, mask, mem); } - static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) { return _mm512_mask_mov_epi32(x, mask, y); } - static void mask_storeu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_storeu(void *mem, opmask_t mask, reg_t x) { return _mm512_mask_storeu_epi32(mem, mask, x); } - static reg_t min(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t min(reg_t x, reg_t y) { return _mm512_min_epu32(x, y); } - static reg_t permutexvar(__m512i idx, reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t permutexvar(__m512i idx, reg_t zmm) { return _mm512_permutexvar_epi32(idx, zmm); } - static type_t reducemax(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemax(reg_t v) { return _mm512_reduce_max_epu32(v); } - static type_t reducemin(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemin(reg_t v) { return 
_mm512_reduce_min_epu32(v); } - static reg_t set1(type_t v) + static X86_SIMD_SORT_FORCE_INLINE reg_t set1(type_t v) { return _mm512_set1_epi32(v); } - static regi_t seti(int v1, + static X86_SIMD_SORT_FORCE_INLINE regi_t seti(int v1, int v2, int v3, int v4, @@ -337,45 +337,45 @@ struct zmm_vector { v16); } template - static reg_t shuffle(reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t shuffle(reg_t zmm) { return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask); } - static void storeu(void *mem, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void storeu(void *mem, reg_t x) { return _mm512_storeu_si512(mem, x); } - static halfreg_t max(halfreg_t x, halfreg_t y) + static X86_SIMD_SORT_FORCE_INLINE halfreg_t max(halfreg_t x, halfreg_t y) { return _mm256_max_epu32(x, y); } - static halfreg_t min(halfreg_t x, halfreg_t y) + static X86_SIMD_SORT_FORCE_INLINE halfreg_t min(halfreg_t x, halfreg_t y) { return _mm256_min_epu32(x, y); } - static reg_t reverse(reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t reverse(reg_t zmm) { const auto rev_index = _mm512_set_epi32(NETWORK_REVERSE_16LANES); return permutexvar(rev_index, zmm); } - static reg_t sort_vec(reg_t x) + static X86_SIMD_SORT_FORCE_INLINE reg_t sort_vec(reg_t x) { return sort_reg_16lanes>(x); } - static reg_t cast_from(__m512i v) + static X86_SIMD_SORT_FORCE_INLINE reg_t cast_from(__m512i v) { return v; } - static __m512i cast_to(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE __m512i cast_to(reg_t v) { return v; } - static bool all_false(opmask_t k) + static X86_SIMD_SORT_FORCE_INLINE bool all_false(opmask_t k) { return k == 0; } - static int double_compressstore(type_t *left_addr, + static X86_SIMD_SORT_FORCE_INLINE int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, reg_t reg) @@ -410,100 +410,100 @@ struct zmm_vector { { return -X86_SIMD_SORT_INFINITYF; } - static reg_t zmm_max() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_max() { return _mm512_set1_ps(type_max()); } - static reg_t zmm_min() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_min() { return _mm512_set1_ps(type_min()); } - static opmask_t knot_opmask(opmask_t x) + static X86_SIMD_SORT_FORCE_INLINE opmask_t knot_opmask(opmask_t x) { return _mm512_knot(x); } - static opmask_t ge(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t ge(reg_t x, reg_t y) { return _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ); } - static opmask_t eq(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t eq(reg_t x, reg_t y) { return _mm512_cmpeq_ps_mask(x, y); } - static opmask_t get_partial_loadmask(uint64_t num_to_read) + static X86_SIMD_SORT_FORCE_INLINE opmask_t get_partial_loadmask(uint64_t num_to_read) { return ((0x1ull << num_to_read) - 0x1ull); } - static int32_t convert_mask_to_int(opmask_t mask) + static X86_SIMD_SORT_FORCE_INLINE int32_t convert_mask_to_int(opmask_t mask) { return mask; } template - static opmask_t fpclass(reg_t x) + static X86_SIMD_SORT_FORCE_INLINE opmask_t fpclass(reg_t x) { return _mm512_fpclass_ps_mask(x, type); } template - static halfreg_t i64gather(__m512i index, void const *base) + static X86_SIMD_SORT_FORCE_INLINE halfreg_t i64gather(__m512i index, void const *base) { return _mm512_i64gather_ps(index, base, scale); } - static reg_t merge(halfreg_t y1, halfreg_t y2) + static X86_SIMD_SORT_FORCE_INLINE reg_t merge(halfreg_t y1, halfreg_t y2) { reg_t z1 = _mm512_castsi512_ps( _mm512_castsi256_si512(_mm256_castps_si256(y1))); return _mm512_insertf32x8(z1, y2, 1); } - static reg_t loadu(void const *mem) + static 
X86_SIMD_SORT_FORCE_INLINE reg_t loadu(void const *mem) { return _mm512_loadu_ps(mem); } - static reg_t max(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t max(reg_t x, reg_t y) { return _mm512_max_ps(x, y); } - static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) { return _mm512_mask_compressstoreu_ps(mem, mask, x); } - static reg_t maskz_loadu(opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t maskz_loadu(opmask_t mask, void const *mem) { return _mm512_maskz_loadu_ps(mask, mem); } - static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) { return _mm512_mask_loadu_ps(x, mask, mem); } - static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) { return _mm512_mask_mov_ps(x, mask, y); } - static void mask_storeu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_storeu(void *mem, opmask_t mask, reg_t x) { return _mm512_mask_storeu_ps(mem, mask, x); } - static reg_t min(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t min(reg_t x, reg_t y) { return _mm512_min_ps(x, y); } - static reg_t permutexvar(__m512i idx, reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t permutexvar(__m512i idx, reg_t zmm) { return _mm512_permutexvar_ps(idx, zmm); } - static type_t reducemax(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemax(reg_t v) { return _mm512_reduce_max_ps(v); } - static type_t reducemin(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemin(reg_t v) { return _mm512_reduce_min_ps(v); } - static reg_t set1(type_t v) + static X86_SIMD_SORT_FORCE_INLINE reg_t set1(type_t v) { return _mm512_set1_ps(v); } - static regi_t seti(int v1, + static X86_SIMD_SORT_FORCE_INLINE regi_t seti(int v1, int v2, int v3, int v4, @@ -538,45 +538,45 @@ struct zmm_vector { v16); } template - static reg_t shuffle(reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t shuffle(reg_t zmm) { return _mm512_shuffle_ps(zmm, zmm, (_MM_PERM_ENUM)mask); } - static void storeu(void *mem, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void storeu(void *mem, reg_t x) { return _mm512_storeu_ps(mem, x); } - static halfreg_t max(halfreg_t x, halfreg_t y) + static X86_SIMD_SORT_FORCE_INLINE halfreg_t max(halfreg_t x, halfreg_t y) { return _mm256_max_ps(x, y); } - static halfreg_t min(halfreg_t x, halfreg_t y) + static X86_SIMD_SORT_FORCE_INLINE halfreg_t min(halfreg_t x, halfreg_t y) { return _mm256_min_ps(x, y); } - static reg_t reverse(reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t reverse(reg_t zmm) { const auto rev_index = _mm512_set_epi32(NETWORK_REVERSE_16LANES); return permutexvar(rev_index, zmm); } - static reg_t sort_vec(reg_t x) + static X86_SIMD_SORT_FORCE_INLINE reg_t sort_vec(reg_t x) { return sort_reg_16lanes>(x); } - static reg_t cast_from(__m512i v) + static X86_SIMD_SORT_FORCE_INLINE reg_t cast_from(__m512i v) { return _mm512_castsi512_ps(v); } - static __m512i cast_to(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE __m512i cast_to(reg_t v) { return _mm512_castps_si512(v); } - static bool all_false(opmask_t k) + static X86_SIMD_SORT_FORCE_INLINE bool all_false(opmask_t k) { return k == 0; } - static int double_compressstore(type_t *left_addr, + static X86_SIMD_SORT_FORCE_INLINE int double_compressstore(type_t *left_addr, type_t 
*right_addr, opmask_t k, reg_t reg) @@ -588,7 +588,8 @@ struct zmm_vector { struct avx512_32bit_swizzle_ops { template - X86_SIMD_SORT_INLINE typename vtype::reg_t swap_n(typename vtype::reg_t reg) + static X86_SIMD_SORT_FORCE_INLINE typename vtype::reg_t + swap_n(typename vtype::reg_t reg) { __m512i v = vtype::cast_to(reg); @@ -612,7 +613,7 @@ struct avx512_32bit_swizzle_ops { } template - X86_SIMD_SORT_INLINE typename vtype::reg_t + static X86_SIMD_SORT_FORCE_INLINE typename vtype::reg_t reverse_n(typename vtype::reg_t reg) { __m512i v = vtype::cast_to(reg); @@ -639,7 +640,7 @@ struct avx512_32bit_swizzle_ops { } template - X86_SIMD_SORT_INLINE typename vtype::reg_t + static X86_SIMD_SORT_FORCE_INLINE typename vtype::reg_t merge_n(typename vtype::reg_t reg, typename vtype::reg_t other) { __m512i v1 = vtype::cast_to(reg); diff --git a/src/avx512-64bit-common.h b/src/avx512-64bit-common.h index f27a31f..4c8f33a 100644 --- a/src/avx512-64bit-common.h +++ b/src/avx512-64bit-common.h @@ -10,7 +10,7 @@ #include "avx2-32bit-qsort.hpp" template -X86_SIMD_SORT_INLINE reg_t sort_reg_8lanes(reg_t zmm); +static X86_SIMD_SORT_FORCE_INLINE reg_t sort_reg_8lanes(reg_t zmm); struct avx512_64bit_swizzle_ops; struct avx512_ymm_64bit_swizzle_ops; @@ -34,16 +34,16 @@ struct ymm_vector { { return -X86_SIMD_SORT_INFINITYF; } - static reg_t zmm_max() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_max() { return _mm256_set1_ps(type_max()); } - static regi_t + static X86_SIMD_SORT_FORCE_INLINE regi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7, int v8) { return _mm256_set_epi32(v1, v2, v3, v4, v5, v6, v7, v8); } - static reg_t set(type_t v1, + static X86_SIMD_SORT_FORCE_INLINE reg_t set(type_t v1, type_t v2, type_t v3, type_t v4, @@ -54,52 +54,52 @@ struct ymm_vector { { return _mm256_set_ps(v1, v2, v3, v4, v5, v6, v7, v8); } - static opmask_t kxor_opmask(opmask_t x, opmask_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t kxor_opmask(opmask_t x, opmask_t y) { return _kxor_mask8(x, y); } - static opmask_t knot_opmask(opmask_t x) + static X86_SIMD_SORT_FORCE_INLINE opmask_t knot_opmask(opmask_t x) { return _knot_mask8(x); } - static opmask_t le(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t le(reg_t x, reg_t y) { return _mm256_cmp_ps_mask(x, y, _CMP_LE_OQ); } - static opmask_t ge(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t ge(reg_t x, reg_t y) { return _mm256_cmp_ps_mask(x, y, _CMP_GE_OQ); } - static opmask_t eq(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t eq(reg_t x, reg_t y) { return _mm256_cmp_ps_mask(x, y, _CMP_EQ_OQ); } - static opmask_t get_partial_loadmask(uint64_t num_to_read) + static X86_SIMD_SORT_FORCE_INLINE opmask_t get_partial_loadmask(uint64_t num_to_read) { return ((0x1ull << num_to_read) - 0x1ull); } - static int32_t convert_mask_to_int(opmask_t mask) + static X86_SIMD_SORT_FORCE_INLINE int32_t convert_mask_to_int(opmask_t mask) { return mask; } template - static opmask_t fpclass(reg_t x) + static X86_SIMD_SORT_FORCE_INLINE opmask_t fpclass(reg_t x) { return _mm256_fpclass_ps_mask(x, type); } template - static reg_t + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_i64gather(reg_t src, opmask_t mask, __m512i index, void const *base) { return _mm512_mask_i64gather_ps(src, mask, index, base, scale); } template - static reg_t + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_i64gather(reg_t src, opmask_t mask, __m256i index, void const *base) { return _mm256_mmask_i32gather_ps(src, mask, index, base, scale); } - static reg_t 
i64gather(type_t *arr, arrsize_t *ind) + static X86_SIMD_SORT_FORCE_INLINE reg_t i64gather(type_t *arr, arrsize_t *ind) { return set(arr[ind[7]], arr[ind[6]], @@ -110,43 +110,43 @@ struct ymm_vector { arr[ind[1]], arr[ind[0]]); } - static reg_t loadu(void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t loadu(void const *mem) { return _mm256_loadu_ps((float *)mem); } - static reg_t max(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t max(reg_t x, reg_t y) { return _mm256_max_ps(x, y); } - static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) { return _mm256_mask_compressstoreu_ps(mem, mask, x); } - static reg_t maskz_loadu(opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t maskz_loadu(opmask_t mask, void const *mem) { return _mm256_maskz_loadu_ps(mask, mem); } - static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) { return _mm256_mask_loadu_ps(x, mask, mem); } - static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) { return _mm256_mask_mov_ps(x, mask, y); } - static void mask_storeu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_storeu(void *mem, opmask_t mask, reg_t x) { return _mm256_mask_storeu_ps(mem, mask, x); } - static reg_t min(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t min(reg_t x, reg_t y) { return _mm256_min_ps(x, y); } - static reg_t permutexvar(__m256i idx, reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t permutexvar(__m256i idx, reg_t zmm) { return _mm256_permutexvar_ps(idx, zmm); } - static type_t reducemax(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemax(reg_t v) { __m128 v128 = _mm_max_ps(_mm256_castps256_ps128(v), _mm256_extractf32x4_ps(v, 1)); @@ -156,7 +156,7 @@ struct ymm_vector { v64, _mm_shuffle_ps(v64, v64, _MM_SHUFFLE(0, 0, 0, 1))); return _mm_cvtss_f32(v32); } - static type_t reducemin(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemin(reg_t v) { __m128 v128 = _mm_min_ps(_mm256_castps256_ps128(v), _mm256_extractf32x4_ps(v, 1)); @@ -166,12 +166,12 @@ struct ymm_vector { v64, _mm_shuffle_ps(v64, v64, _MM_SHUFFLE(0, 0, 0, 1))); return _mm_cvtss_f32(v32); } - static reg_t set1(type_t v) + static X86_SIMD_SORT_FORCE_INLINE reg_t set1(type_t v) { return _mm256_set1_ps(v); } template - static reg_t shuffle(reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t shuffle(reg_t zmm) { /* Hack!: have to make shuffles within 128-bit lanes work for both * 32-bit and 64-bit */ @@ -183,32 +183,32 @@ struct ymm_vector { // return _mm256_shuffle_ps(zmm, zmm, mask); //} } - static reg_t sort_vec(reg_t x) + static X86_SIMD_SORT_FORCE_INLINE reg_t sort_vec(reg_t x) { return sort_reg_8lanes>(x); } - static void storeu(void *mem, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void storeu(void *mem, reg_t x) { _mm256_storeu_ps((float *)mem, x); } - static reg_t cast_from(__m256i v) + static X86_SIMD_SORT_FORCE_INLINE reg_t cast_from(__m256i v) { return _mm256_castsi256_ps(v); } - static __m256i cast_to(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE __m256i cast_to(reg_t v) { return _mm256_castps_si256(v); } - static bool all_false(opmask_t k) + static X86_SIMD_SORT_FORCE_INLINE bool all_false(opmask_t k) { return k == 0; } - static reg_t reverse(reg_t ymm) + static 
X86_SIMD_SORT_FORCE_INLINE reg_t reverse(reg_t ymm) { const __m256i rev_index = _mm256_set_epi32(NETWORK_REVERSE_8LANES); return permutexvar(rev_index, ymm); } - static int double_compressstore(type_t *left_addr, + static X86_SIMD_SORT_FORCE_INLINE int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, reg_t reg) @@ -236,17 +236,17 @@ struct ymm_vector { { return 0; } - static reg_t zmm_max() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_max() { return _mm256_set1_epi32(type_max()); } - static regi_t + static X86_SIMD_SORT_FORCE_INLINE regi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7, int v8) { return _mm256_set_epi32(v1, v2, v3, v4, v5, v6, v7, v8); } - static reg_t set(type_t v1, + static X86_SIMD_SORT_FORCE_INLINE reg_t set(type_t v1, type_t v2, type_t v3, type_t v4, @@ -257,43 +257,43 @@ struct ymm_vector { { return _mm256_set_epi32(v1, v2, v3, v4, v5, v6, v7, v8); } - static opmask_t kxor_opmask(opmask_t x, opmask_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t kxor_opmask(opmask_t x, opmask_t y) { return _kxor_mask8(x, y); } - static opmask_t knot_opmask(opmask_t x) + static X86_SIMD_SORT_FORCE_INLINE opmask_t knot_opmask(opmask_t x) { return _knot_mask8(x); } - static opmask_t le(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t le(reg_t x, reg_t y) { return _mm256_cmp_epu32_mask(x, y, _MM_CMPINT_LE); } - static opmask_t ge(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t ge(reg_t x, reg_t y) { return _mm256_cmp_epu32_mask(x, y, _MM_CMPINT_NLT); } - static opmask_t get_partial_loadmask(uint64_t num_to_read) + static X86_SIMD_SORT_FORCE_INLINE opmask_t get_partial_loadmask(uint64_t num_to_read) { return ((0x1ull << num_to_read) - 0x1ull); } - static opmask_t eq(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t eq(reg_t x, reg_t y) { return _mm256_cmp_epu32_mask(x, y, _MM_CMPINT_EQ); } template - static reg_t + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_i64gather(reg_t src, opmask_t mask, __m512i index, void const *base) { return _mm512_mask_i64gather_epi32(src, mask, index, base, scale); } template - static reg_t + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_i64gather(reg_t src, opmask_t mask, __m256i index, void const *base) { return _mm256_mmask_i32gather_epi32(src, mask, index, base, scale); } - static reg_t i64gather(type_t *arr, arrsize_t *ind) + static X86_SIMD_SORT_FORCE_INLINE reg_t i64gather(type_t *arr, arrsize_t *ind) { return set(arr[ind[7]], arr[ind[6]], @@ -304,43 +304,43 @@ struct ymm_vector { arr[ind[1]], arr[ind[0]]); } - static reg_t loadu(void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t loadu(void const *mem) { return _mm256_loadu_si256((__m256i *)mem); } - static reg_t max(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t max(reg_t x, reg_t y) { return _mm256_max_epu32(x, y); } - static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) { return _mm256_mask_compressstoreu_epi32(mem, mask, x); } - static reg_t maskz_loadu(opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t maskz_loadu(opmask_t mask, void const *mem) { return _mm256_maskz_loadu_epi32(mask, mem); } - static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) { return _mm256_mask_loadu_epi32(x, mask, mem); } - static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) + static 
X86_SIMD_SORT_FORCE_INLINE reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) { return _mm256_mask_mov_epi32(x, mask, y); } - static void mask_storeu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_storeu(void *mem, opmask_t mask, reg_t x) { return _mm256_mask_storeu_epi32(mem, mask, x); } - static reg_t min(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t min(reg_t x, reg_t y) { return _mm256_min_epu32(x, y); } - static reg_t permutexvar(__m256i idx, reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t permutexvar(__m256i idx, reg_t zmm) { return _mm256_permutexvar_epi32(idx, zmm); } - static type_t reducemax(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemax(reg_t v) { __m128i v128 = _mm_max_epu32(_mm256_castsi256_si128(v), _mm256_extracti128_si256(v, 1)); @@ -350,7 +350,7 @@ struct ymm_vector { v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1))); return (type_t)_mm_cvtsi128_si32(v32); } - static type_t reducemin(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemin(reg_t v) { __m128i v128 = _mm_min_epu32(_mm256_castsi256_si128(v), _mm256_extracti128_si256(v, 1)); @@ -360,43 +360,43 @@ struct ymm_vector { v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1))); return (type_t)_mm_cvtsi128_si32(v32); } - static reg_t set1(type_t v) + static X86_SIMD_SORT_FORCE_INLINE reg_t set1(type_t v) { return _mm256_set1_epi32(v); } template - static reg_t shuffle(reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t shuffle(reg_t zmm) { /* Hack!: have to make shuffles within 128-bit lanes work for both * 32-bit and 64-bit */ return _mm256_shuffle_epi32(zmm, 0b10110001); } - static reg_t sort_vec(reg_t x) + static X86_SIMD_SORT_FORCE_INLINE reg_t sort_vec(reg_t x) { return sort_reg_8lanes>(x); } - static void storeu(void *mem, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void storeu(void *mem, reg_t x) { _mm256_storeu_si256((__m256i *)mem, x); } - static reg_t cast_from(__m256i v) + static X86_SIMD_SORT_FORCE_INLINE reg_t cast_from(__m256i v) { return v; } - static __m256i cast_to(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE __m256i cast_to(reg_t v) { return v; } - static bool all_false(opmask_t k) + static X86_SIMD_SORT_FORCE_INLINE bool all_false(opmask_t k) { return k == 0; } - static reg_t reverse(reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t reverse(reg_t ymm) { const __m256i rev_index = _mm256_set_epi32(NETWORK_REVERSE_8LANES); return permutexvar(rev_index, ymm); } - static int double_compressstore(type_t *left_addr, + static X86_SIMD_SORT_FORCE_INLINE int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, reg_t reg) @@ -424,17 +424,17 @@ struct ymm_vector { { return X86_SIMD_SORT_MIN_INT32; } - static reg_t zmm_max() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_max() { return _mm256_set1_epi32(type_max()); } // TODO: this should broadcast bits as is? 
- static regi_t + static X86_SIMD_SORT_FORCE_INLINE regi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7, int v8) { return _mm256_set_epi32(v1, v2, v3, v4, v5, v6, v7, v8); } - static reg_t set(type_t v1, + static X86_SIMD_SORT_FORCE_INLINE reg_t set(type_t v1, type_t v2, type_t v3, type_t v4, @@ -445,43 +445,43 @@ struct ymm_vector { { return _mm256_set_epi32(v1, v2, v3, v4, v5, v6, v7, v8); } - static opmask_t kxor_opmask(opmask_t x, opmask_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t kxor_opmask(opmask_t x, opmask_t y) { return _kxor_mask8(x, y); } - static opmask_t knot_opmask(opmask_t x) + static X86_SIMD_SORT_FORCE_INLINE opmask_t knot_opmask(opmask_t x) { return _knot_mask8(x); } - static opmask_t le(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t le(reg_t x, reg_t y) { return _mm256_cmp_epi32_mask(x, y, _MM_CMPINT_LE); } - static opmask_t ge(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t ge(reg_t x, reg_t y) { return _mm256_cmp_epi32_mask(x, y, _MM_CMPINT_NLT); } - static opmask_t get_partial_loadmask(uint64_t num_to_read) + static X86_SIMD_SORT_FORCE_INLINE opmask_t get_partial_loadmask(uint64_t num_to_read) { return ((0x1ull << num_to_read) - 0x1ull); } - static opmask_t eq(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t eq(reg_t x, reg_t y) { return _mm256_cmp_epi32_mask(x, y, _MM_CMPINT_EQ); } template - static reg_t + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_i64gather(reg_t src, opmask_t mask, __m512i index, void const *base) { return _mm512_mask_i64gather_epi32(src, mask, index, base, scale); } template - static reg_t + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_i64gather(reg_t src, opmask_t mask, __m256i index, void const *base) { return _mm256_mmask_i32gather_epi32(src, mask, index, base, scale); } - static reg_t i64gather(type_t *arr, arrsize_t *ind) + static X86_SIMD_SORT_FORCE_INLINE reg_t i64gather(type_t *arr, arrsize_t *ind) { return set(arr[ind[7]], arr[ind[6]], @@ -492,43 +492,43 @@ struct ymm_vector { arr[ind[1]], arr[ind[0]]); } - static reg_t loadu(void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t loadu(void const *mem) { return _mm256_loadu_si256((__m256i *)mem); } - static reg_t max(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t max(reg_t x, reg_t y) { return _mm256_max_epi32(x, y); } - static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) { return _mm256_mask_compressstoreu_epi32(mem, mask, x); } - static reg_t maskz_loadu(opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t maskz_loadu(opmask_t mask, void const *mem) { return _mm256_maskz_loadu_epi32(mask, mem); } - static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) { return _mm256_mask_loadu_epi32(x, mask, mem); } - static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) { return _mm256_mask_mov_epi32(x, mask, y); } - static void mask_storeu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_storeu(void *mem, opmask_t mask, reg_t x) { return _mm256_mask_storeu_epi32(mem, mask, x); } - static reg_t min(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t min(reg_t x, reg_t y) { return _mm256_min_epi32(x, y); } - static reg_t permutexvar(__m256i idx, reg_t zmm) + 
static X86_SIMD_SORT_FORCE_INLINE reg_t permutexvar(__m256i idx, reg_t zmm) { return _mm256_permutexvar_epi32(idx, zmm); } - static type_t reducemax(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemax(reg_t v) { __m128i v128 = _mm_max_epi32(_mm256_castsi256_si128(v), _mm256_extracti128_si256(v, 1)); @@ -538,7 +538,7 @@ struct ymm_vector { v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1))); return (type_t)_mm_cvtsi128_si32(v32); } - static type_t reducemin(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemin(reg_t v) { __m128i v128 = _mm_min_epi32(_mm256_castsi256_si128(v), _mm256_extracti128_si256(v, 1)); @@ -548,43 +548,43 @@ struct ymm_vector { v64, _mm_shuffle_epi32(v64, _MM_SHUFFLE(0, 0, 0, 1))); return (type_t)_mm_cvtsi128_si32(v32); } - static reg_t set1(type_t v) + static X86_SIMD_SORT_FORCE_INLINE reg_t set1(type_t v) { return _mm256_set1_epi32(v); } template - static reg_t shuffle(reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t shuffle(reg_t zmm) { /* Hack!: have to make shuffles within 128-bit lanes work for both * 32-bit and 64-bit */ return _mm256_shuffle_epi32(zmm, 0b10110001); } - static reg_t sort_vec(reg_t x) + static X86_SIMD_SORT_FORCE_INLINE reg_t sort_vec(reg_t x) { return sort_reg_8lanes>(x); } - static void storeu(void *mem, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void storeu(void *mem, reg_t x) { _mm256_storeu_si256((__m256i *)mem, x); } - static reg_t cast_from(__m256i v) + static X86_SIMD_SORT_FORCE_INLINE reg_t cast_from(__m256i v) { return v; } - static __m256i cast_to(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE __m256i cast_to(reg_t v) { return v; } - static bool all_false(opmask_t k) + static X86_SIMD_SORT_FORCE_INLINE bool all_false(opmask_t k) { return k == 0; } - static reg_t reverse(reg_t ymm) + static X86_SIMD_SORT_FORCE_INLINE reg_t reverse(reg_t ymm) { const __m256i rev_index = _mm256_set_epi32(NETWORK_REVERSE_8LANES); return permutexvar(rev_index, ymm); } - static int double_compressstore(type_t *left_addr, + static X86_SIMD_SORT_FORCE_INLINE int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, reg_t reg) @@ -619,21 +619,21 @@ struct zmm_vector { { return X86_SIMD_SORT_MIN_INT64; } - static reg_t zmm_max() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_max() { return _mm512_set1_epi64(type_max()); } - static reg_t zmm_min() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_min() { return _mm512_set1_epi64(type_min()); } - static regi_t + static X86_SIMD_SORT_FORCE_INLINE regi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7, int v8) { return _mm512_set_epi64(v1, v2, v3, v4, v5, v6, v7, v8); } - static reg_t set(type_t v1, + static X86_SIMD_SORT_FORCE_INLINE reg_t set(type_t v1, type_t v2, type_t v3, type_t v4, @@ -644,43 +644,43 @@ struct zmm_vector { { return _mm512_set_epi64(v1, v2, v3, v4, v5, v6, v7, v8); } - static opmask_t kxor_opmask(opmask_t x, opmask_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t kxor_opmask(opmask_t x, opmask_t y) { return _kxor_mask8(x, y); } - static opmask_t knot_opmask(opmask_t x) + static X86_SIMD_SORT_FORCE_INLINE opmask_t knot_opmask(opmask_t x) { return _knot_mask8(x); } - static opmask_t le(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t le(reg_t x, reg_t y) { return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_LE); } - static opmask_t ge(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t ge(reg_t x, reg_t y) { return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_NLT); } - static opmask_t get_partial_loadmask(uint64_t num_to_read) + 
static X86_SIMD_SORT_FORCE_INLINE opmask_t get_partial_loadmask(uint64_t num_to_read) { return ((0x1ull << num_to_read) - 0x1ull); } - static opmask_t eq(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t eq(reg_t x, reg_t y) { return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_EQ); } template - static reg_t + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_i64gather(reg_t src, opmask_t mask, __m512i index, void const *base) { return _mm512_mask_i64gather_epi64(src, mask, index, base, scale); } template - static reg_t + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_i64gather(reg_t src, opmask_t mask, __m256i index, void const *base) { return _mm512_mask_i32gather_epi64(src, mask, index, base, scale); } - static reg_t i64gather(type_t *arr, arrsize_t *ind) + static X86_SIMD_SORT_FORCE_INLINE reg_t i64gather(type_t *arr, arrsize_t *ind) { return set(arr[ind[7]], arr[ind[6]], @@ -691,87 +691,87 @@ struct zmm_vector { arr[ind[1]], arr[ind[0]]); } - static reg_t loadu(void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t loadu(void const *mem) { return _mm512_loadu_si512(mem); } - static reg_t max(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t max(reg_t x, reg_t y) { return _mm512_max_epi64(x, y); } - static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) { return _mm512_mask_compressstoreu_epi64(mem, mask, x); } - static reg_t maskz_loadu(opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t maskz_loadu(opmask_t mask, void const *mem) { return _mm512_maskz_loadu_epi64(mask, mem); } - static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) { return _mm512_mask_loadu_epi64(x, mask, mem); } - static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) { return _mm512_mask_mov_epi64(x, mask, y); } - static void mask_storeu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_storeu(void *mem, opmask_t mask, reg_t x) { return _mm512_mask_storeu_epi64(mem, mask, x); } - static reg_t min(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t min(reg_t x, reg_t y) { return _mm512_min_epi64(x, y); } - static reg_t permutexvar(__m512i idx, reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t permutexvar(__m512i idx, reg_t zmm) { return _mm512_permutexvar_epi64(idx, zmm); } - static type_t reducemax(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemax(reg_t v) { return _mm512_reduce_max_epi64(v); } - static type_t reducemin(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemin(reg_t v) { return _mm512_reduce_min_epi64(v); } - static reg_t set1(type_t v) + static X86_SIMD_SORT_FORCE_INLINE reg_t set1(type_t v) { return _mm512_set1_epi64(v); } template - static reg_t shuffle(reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t shuffle(reg_t zmm) { __m512d temp = _mm512_castsi512_pd(zmm); return _mm512_castpd_si512( _mm512_shuffle_pd(temp, temp, (_MM_PERM_ENUM)mask)); } - static void storeu(void *mem, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void storeu(void *mem, reg_t x) { _mm512_storeu_si512(mem, x); } - static reg_t reverse(reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t reverse(reg_t zmm) { const regi_t rev_index = seti(NETWORK_REVERSE_8LANES); return permutexvar(rev_index, zmm); } - static reg_t sort_vec(reg_t x) + 
static X86_SIMD_SORT_FORCE_INLINE reg_t sort_vec(reg_t x) { return sort_reg_8lanes>(x); } - static reg_t cast_from(__m512i v) + static X86_SIMD_SORT_FORCE_INLINE reg_t cast_from(__m512i v) { return v; } - static __m512i cast_to(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE __m512i cast_to(reg_t v) { return v; } - static bool all_false(opmask_t k) + static X86_SIMD_SORT_FORCE_INLINE bool all_false(opmask_t k) { return k == 0; } - static int double_compressstore(type_t *left_addr, + static X86_SIMD_SORT_FORCE_INLINE int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, reg_t reg) @@ -806,21 +806,21 @@ struct zmm_vector { { return 0; } - static reg_t zmm_max() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_max() { return _mm512_set1_epi64(type_max()); } - static reg_t zmm_min() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_min() { return _mm512_set1_epi64(type_min()); } - static regi_t + static X86_SIMD_SORT_FORCE_INLINE regi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7, int v8) { return _mm512_set_epi64(v1, v2, v3, v4, v5, v6, v7, v8); } - static reg_t set(type_t v1, + static X86_SIMD_SORT_FORCE_INLINE reg_t set(type_t v1, type_t v2, type_t v3, type_t v4, @@ -832,18 +832,18 @@ struct zmm_vector { return _mm512_set_epi64(v1, v2, v3, v4, v5, v6, v7, v8); } template - static reg_t + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_i64gather(reg_t src, opmask_t mask, __m512i index, void const *base) { return _mm512_mask_i64gather_epi64(src, mask, index, base, scale); } template - static reg_t + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_i64gather(reg_t src, opmask_t mask, __m256i index, void const *base) { return _mm512_mask_i32gather_epi64(src, mask, index, base, scale); } - static reg_t i64gather(type_t *arr, arrsize_t *ind) + static X86_SIMD_SORT_FORCE_INLINE reg_t i64gather(type_t *arr, arrsize_t *ind) { return set(arr[ind[7]], arr[ind[6]], @@ -854,103 +854,103 @@ struct zmm_vector { arr[ind[1]], arr[ind[0]]); } - static opmask_t knot_opmask(opmask_t x) + static X86_SIMD_SORT_FORCE_INLINE opmask_t knot_opmask(opmask_t x) { return _knot_mask8(x); } - static opmask_t ge(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t ge(reg_t x, reg_t y) { return _mm512_cmp_epu64_mask(x, y, _MM_CMPINT_NLT); } - static opmask_t get_partial_loadmask(uint64_t num_to_read) + static X86_SIMD_SORT_FORCE_INLINE opmask_t get_partial_loadmask(uint64_t num_to_read) { return ((0x1ull << num_to_read) - 0x1ull); } - static opmask_t eq(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t eq(reg_t x, reg_t y) { return _mm512_cmp_epu64_mask(x, y, _MM_CMPINT_EQ); } - static reg_t loadu(void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t loadu(void const *mem) { return _mm512_loadu_si512(mem); } - static reg_t max(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t max(reg_t x, reg_t y) { return _mm512_max_epu64(x, y); } - static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) { return _mm512_mask_compressstoreu_epi64(mem, mask, x); } - static reg_t maskz_loadu(opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t maskz_loadu(opmask_t mask, void const *mem) { return _mm512_maskz_loadu_epi64(mask, mem); } - static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) { return _mm512_mask_loadu_epi64(x, mask, mem); } - static reg_t 
mask_mov(reg_t x, opmask_t mask, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) { return _mm512_mask_mov_epi64(x, mask, y); } - static void mask_storeu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_storeu(void *mem, opmask_t mask, reg_t x) { return _mm512_mask_storeu_epi64(mem, mask, x); } - static reg_t min(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t min(reg_t x, reg_t y) { return _mm512_min_epu64(x, y); } - static reg_t permutexvar(__m512i idx, reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t permutexvar(__m512i idx, reg_t zmm) { return _mm512_permutexvar_epi64(idx, zmm); } - static type_t reducemax(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemax(reg_t v) { return _mm512_reduce_max_epu64(v); } - static type_t reducemin(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemin(reg_t v) { return _mm512_reduce_min_epu64(v); } - static reg_t set1(type_t v) + static X86_SIMD_SORT_FORCE_INLINE reg_t set1(type_t v) { return _mm512_set1_epi64(v); } template - static reg_t shuffle(reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t shuffle(reg_t zmm) { __m512d temp = _mm512_castsi512_pd(zmm); return _mm512_castpd_si512( _mm512_shuffle_pd(temp, temp, (_MM_PERM_ENUM)mask)); } - static void storeu(void *mem, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void storeu(void *mem, reg_t x) { _mm512_storeu_si512(mem, x); } - static reg_t reverse(reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t reverse(reg_t zmm) { const regi_t rev_index = seti(NETWORK_REVERSE_8LANES); return permutexvar(rev_index, zmm); } - static reg_t sort_vec(reg_t x) + static X86_SIMD_SORT_FORCE_INLINE reg_t sort_vec(reg_t x) { return sort_reg_8lanes>(x); } - static reg_t cast_from(__m512i v) + static X86_SIMD_SORT_FORCE_INLINE reg_t cast_from(__m512i v) { return v; } - static __m512i cast_to(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE __m512i cast_to(reg_t v) { return v; } - static bool all_false(opmask_t k) + static X86_SIMD_SORT_FORCE_INLINE bool all_false(opmask_t k) { return k == 0; } - static int double_compressstore(type_t *left_addr, + static X86_SIMD_SORT_FORCE_INLINE int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, reg_t reg) @@ -997,20 +997,20 @@ struct zmm_vector { { return -X86_SIMD_SORT_INFINITY; } - static reg_t zmm_max() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_max() { return _mm512_set1_pd(type_max()); } - static reg_t zmm_min() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_min() { return _mm512_set1_pd(type_min()); } - static regi_t + static X86_SIMD_SORT_FORCE_INLINE regi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7, int v8) { return _mm512_set_epi64(v1, v2, v3, v4, v5, v6, v7, v8); } - static reg_t set(type_t v1, + static X86_SIMD_SORT_FORCE_INLINE reg_t set(type_t v1, type_t v2, type_t v3, type_t v4, @@ -1021,48 +1021,48 @@ struct zmm_vector { { return _mm512_set_pd(v1, v2, v3, v4, v5, v6, v7, v8); } - static reg_t maskz_loadu(opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t maskz_loadu(opmask_t mask, void const *mem) { return _mm512_maskz_loadu_pd(mask, mem); } - static opmask_t knot_opmask(opmask_t x) + static X86_SIMD_SORT_FORCE_INLINE opmask_t knot_opmask(opmask_t x) { return _knot_mask8(x); } - static opmask_t ge(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t ge(reg_t x, reg_t y) { return _mm512_cmp_pd_mask(x, y, _CMP_GE_OQ); } - static opmask_t eq(reg_t x, reg_t y) + static 
X86_SIMD_SORT_FORCE_INLINE opmask_t eq(reg_t x, reg_t y) { return _mm512_cmp_pd_mask(x, y, _CMP_EQ_OQ); } - static opmask_t get_partial_loadmask(uint64_t num_to_read) + static X86_SIMD_SORT_FORCE_INLINE opmask_t get_partial_loadmask(uint64_t num_to_read) { return ((0x1ull << num_to_read) - 0x1ull); } - static int32_t convert_mask_to_int(opmask_t mask) + static X86_SIMD_SORT_FORCE_INLINE int32_t convert_mask_to_int(opmask_t mask) { return mask; } template - static opmask_t fpclass(reg_t x) + static X86_SIMD_SORT_FORCE_INLINE opmask_t fpclass(reg_t x) { return _mm512_fpclass_pd_mask(x, type); } template - static reg_t + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_i64gather(reg_t src, opmask_t mask, __m512i index, void const *base) { return _mm512_mask_i64gather_pd(src, mask, index, base, scale); } template - static reg_t + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_i64gather(reg_t src, opmask_t mask, __m256i index, void const *base) { return _mm512_mask_i32gather_pd(src, mask, index, base, scale); } - static reg_t i64gather(type_t *arr, arrsize_t *ind) + static X86_SIMD_SORT_FORCE_INLINE reg_t i64gather(type_t *arr, arrsize_t *ind) { return set(arr[ind[7]], arr[ind[6]], @@ -1073,81 +1073,81 @@ struct zmm_vector { arr[ind[1]], arr[ind[0]]); } - static reg_t loadu(void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t loadu(void const *mem) { return _mm512_loadu_pd(mem); } - static reg_t max(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t max(reg_t x, reg_t y) { return _mm512_max_pd(x, y); } - static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) { return _mm512_mask_compressstoreu_pd(mem, mask, x); } - static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) { return _mm512_mask_loadu_pd(x, mask, mem); } - static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) { return _mm512_mask_mov_pd(x, mask, y); } - static void mask_storeu(void *mem, opmask_t mask, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void mask_storeu(void *mem, opmask_t mask, reg_t x) { return _mm512_mask_storeu_pd(mem, mask, x); } - static reg_t min(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE reg_t min(reg_t x, reg_t y) { return _mm512_min_pd(x, y); } - static reg_t permutexvar(__m512i idx, reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t permutexvar(__m512i idx, reg_t zmm) { return _mm512_permutexvar_pd(idx, zmm); } - static type_t reducemax(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemax(reg_t v) { return _mm512_reduce_max_pd(v); } - static type_t reducemin(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE type_t reducemin(reg_t v) { return _mm512_reduce_min_pd(v); } - static reg_t set1(type_t v) + static X86_SIMD_SORT_FORCE_INLINE reg_t set1(type_t v) { return _mm512_set1_pd(v); } template - static reg_t shuffle(reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t shuffle(reg_t zmm) { return _mm512_shuffle_pd(zmm, zmm, (_MM_PERM_ENUM)mask); } - static void storeu(void *mem, reg_t x) + static X86_SIMD_SORT_FORCE_INLINE void storeu(void *mem, reg_t x) { _mm512_storeu_pd(mem, x); } - static reg_t reverse(reg_t zmm) + static X86_SIMD_SORT_FORCE_INLINE reg_t reverse(reg_t zmm) { const regi_t rev_index = seti(NETWORK_REVERSE_8LANES); return permutexvar(rev_index, zmm); } - static reg_t sort_vec(reg_t 
x) + static X86_SIMD_SORT_FORCE_INLINE reg_t sort_vec(reg_t x) { return sort_reg_8lanes>(x); } - static reg_t cast_from(__m512i v) + static X86_SIMD_SORT_FORCE_INLINE reg_t cast_from(__m512i v) { return _mm512_castsi512_pd(v); } - static __m512i cast_to(reg_t v) + static X86_SIMD_SORT_FORCE_INLINE __m512i cast_to(reg_t v) { return _mm512_castpd_si512(v); } - static bool all_false(opmask_t k) + static X86_SIMD_SORT_FORCE_INLINE bool all_false(opmask_t k) { return k == 0; } - static int double_compressstore(type_t *left_addr, + static X86_SIMD_SORT_FORCE_INLINE int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, reg_t reg) @@ -1159,7 +1159,7 @@ struct zmm_vector { struct avx512_64bit_swizzle_ops { template - X86_SIMD_SORT_INLINE typename vtype::reg_t swap_n(typename vtype::reg_t reg) + static X86_SIMD_SORT_FORCE_INLINE typename vtype::reg_t swap_n(typename vtype::reg_t reg) { __m512i v = vtype::cast_to(reg); @@ -1180,7 +1180,7 @@ struct avx512_64bit_swizzle_ops { } template - X86_SIMD_SORT_INLINE typename vtype::reg_t + static X86_SIMD_SORT_FORCE_INLINE typename vtype::reg_t reverse_n(typename vtype::reg_t reg) { __m512i v = vtype::cast_to(reg); @@ -1201,7 +1201,7 @@ struct avx512_64bit_swizzle_ops { } template - X86_SIMD_SORT_INLINE typename vtype::reg_t + static X86_SIMD_SORT_FORCE_INLINE typename vtype::reg_t merge_n(typename vtype::reg_t reg, typename vtype::reg_t other) { __m512i v1 = vtype::cast_to(reg); @@ -1226,7 +1226,7 @@ struct avx512_64bit_swizzle_ops { struct avx512_ymm_64bit_swizzle_ops { template - X86_SIMD_SORT_INLINE typename vtype::reg_t swap_n(typename vtype::reg_t reg) + static X86_SIMD_SORT_FORCE_INLINE typename vtype::reg_t swap_n(typename vtype::reg_t reg) { __m256i v = vtype::cast_to(reg); @@ -1251,7 +1251,7 @@ struct avx512_ymm_64bit_swizzle_ops { } template - X86_SIMD_SORT_INLINE typename vtype::reg_t + static X86_SIMD_SORT_FORCE_INLINE typename vtype::reg_t reverse_n(typename vtype::reg_t reg) { __m256i v = vtype::cast_to(reg); @@ -1274,7 +1274,7 @@ struct avx512_ymm_64bit_swizzle_ops { } template - X86_SIMD_SORT_INLINE typename vtype::reg_t + static X86_SIMD_SORT_FORCE_INLINE typename vtype::reg_t merge_n(typename vtype::reg_t reg, typename vtype::reg_t other) { __m256i v1 = vtype::cast_to(reg); diff --git a/src/avx512fp16-16bit-qsort.hpp b/src/avx512fp16-16bit-qsort.hpp index 8f85e59..8aa369e 100644 --- a/src/avx512fp16-16bit-qsort.hpp +++ b/src/avx512fp16-16bit-qsort.hpp @@ -39,127 +39,127 @@ struct zmm_vector<_Float16> { val.i_ = X86_SIMD_SORT_NEGINFINITYH; return val.f_; } - static reg_t zmm_max() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_max() { return _mm512_set1_ph(type_max()); } - static reg_t zmm_min() + static X86_SIMD_SORT_FORCE_INLINE reg_t zmm_min() { return _mm512_set1_ph(type_min()); } - static opmask_t knot_opmask(opmask_t x) + static X86_SIMD_SORT_FORCE_INLINE opmask_t knot_opmask(opmask_t x) { return _knot_mask32(x); } - static opmask_t ge(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t ge(reg_t x, reg_t y) { return _mm512_cmp_ph_mask(x, y, _CMP_GE_OQ); } - static opmask_t eq(reg_t x, reg_t y) + static X86_SIMD_SORT_FORCE_INLINE opmask_t eq(reg_t x, reg_t y) { return _mm512_cmp_ph_mask(x, y, _CMP_EQ_OQ); } - static opmask_t get_partial_loadmask(uint64_t num_to_read) + static X86_SIMD_SORT_FORCE_INLINE opmask_t get_partial_loadmask(uint64_t num_to_read) { return ((0x1ull << num_to_read) - 0x1ull); } - static int32_t convert_mask_to_int(opmask_t mask) + static X86_SIMD_SORT_FORCE_INLINE int32_t 
     convert_mask_to_int(opmask_t mask)
     {
         return mask;
     }
     template
-    static opmask_t fpclass(reg_t x)
+    static X86_SIMD_SORT_FORCE_INLINE opmask_t fpclass(reg_t x)
     {
         return _mm512_fpclass_ph_mask(x, type);
     }
-    static reg_t loadu(void const *mem)
+    static X86_SIMD_SORT_FORCE_INLINE reg_t loadu(void const *mem)
     {
         return _mm512_loadu_ph(mem);
     }
-    static reg_t max(reg_t x, reg_t y)
+    static X86_SIMD_SORT_FORCE_INLINE reg_t max(reg_t x, reg_t y)
     {
         return _mm512_max_ph(x, y);
     }
-    static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x)
+    static X86_SIMD_SORT_FORCE_INLINE void mask_compressstoreu(void *mem, opmask_t mask, reg_t x)
     {
         __m512i temp = _mm512_castph_si512(x);
         // AVX512_VBMI2
         return _mm512_mask_compressstoreu_epi16(mem, mask, temp);
     }
-    static reg_t maskz_loadu(opmask_t mask, void const *mem)
+    static X86_SIMD_SORT_FORCE_INLINE reg_t maskz_loadu(opmask_t mask, void const *mem)
     {
         return _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, mem));
     }
-    static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem)
+    static X86_SIMD_SORT_FORCE_INLINE reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem)
     {
         // AVX512BW
         return _mm512_castsi512_ph(
                 _mm512_mask_loadu_epi16(_mm512_castph_si512(x), mask, mem));
     }
-    static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y)
+    static X86_SIMD_SORT_FORCE_INLINE reg_t mask_mov(reg_t x, opmask_t mask, reg_t y)
     {
         return _mm512_castsi512_ph(_mm512_mask_mov_epi16(
                 _mm512_castph_si512(x), mask, _mm512_castph_si512(y)));
     }
-    static void mask_storeu(void *mem, opmask_t mask, reg_t x)
+    static X86_SIMD_SORT_FORCE_INLINE void mask_storeu(void *mem, opmask_t mask, reg_t x)
     {
         return _mm512_mask_storeu_epi16(mem, mask, _mm512_castph_si512(x));
     }
-    static reg_t min(reg_t x, reg_t y)
+    static X86_SIMD_SORT_FORCE_INLINE reg_t min(reg_t x, reg_t y)
     {
         return _mm512_min_ph(x, y);
     }
-    static reg_t permutexvar(__m512i idx, reg_t zmm)
+    static X86_SIMD_SORT_FORCE_INLINE reg_t permutexvar(__m512i idx, reg_t zmm)
     {
         return _mm512_permutexvar_ph(idx, zmm);
     }
-    static type_t reducemax(reg_t v)
+    static X86_SIMD_SORT_FORCE_INLINE type_t reducemax(reg_t v)
     {
         return _mm512_reduce_max_ph(v);
     }
-    static type_t reducemin(reg_t v)
+    static X86_SIMD_SORT_FORCE_INLINE type_t reducemin(reg_t v)
     {
         return _mm512_reduce_min_ph(v);
     }
-    static reg_t set1(type_t v)
+    static X86_SIMD_SORT_FORCE_INLINE reg_t set1(type_t v)
     {
         return _mm512_set1_ph(v);
     }
     template
-    static reg_t shuffle(reg_t zmm)
+    static X86_SIMD_SORT_FORCE_INLINE reg_t shuffle(reg_t zmm)
     {
         __m512i temp = _mm512_shufflehi_epi16(_mm512_castph_si512(zmm),
                                               (_MM_PERM_ENUM)mask);
         return _mm512_castsi512_ph(
                 _mm512_shufflelo_epi16(temp, (_MM_PERM_ENUM)mask));
     }
-    static void storeu(void *mem, reg_t x)
+    static X86_SIMD_SORT_FORCE_INLINE void storeu(void *mem, reg_t x)
     {
         return _mm512_storeu_ph(mem, x);
     }
-    static reg_t reverse(reg_t zmm)
+    static X86_SIMD_SORT_FORCE_INLINE reg_t reverse(reg_t zmm)
     {
         constexpr static uint16_t arr[] = {NETWORK_REVERSE_32LANES};
         const auto rev_index = _mm512_loadu_si512(arr);
         return permutexvar(rev_index, zmm);
     }
-    static reg_t sort_vec(reg_t x)
+    static X86_SIMD_SORT_FORCE_INLINE reg_t sort_vec(reg_t x)
     {
         return sort_reg_32lanes>(x);
     }
-    static reg_t cast_from(__m512i v)
+    static X86_SIMD_SORT_FORCE_INLINE reg_t cast_from(__m512i v)
     {
         return _mm512_castsi512_ph(v);
     }
-    static __m512i cast_to(reg_t v)
+    static X86_SIMD_SORT_FORCE_INLINE __m512i cast_to(reg_t v)
     {
         return _mm512_castph_si512(v);
     }
-    static bool all_false(opmask_t k)
+    static X86_SIMD_SORT_FORCE_INLINE bool all_false(opmask_t k)
     {
         return k == 0;
     }
-    static int double_compressstore(type_t *left_addr,
+    static X86_SIMD_SORT_FORCE_INLINE int double_compressstore(type_t *left_addr,
                                     type_t *right_addr,
                                     opmask_t k,
                                     reg_t reg)
diff --git a/src/xss-common-argsort.h b/src/xss-common-argsort.h
index 6c071c2..4c1ae19 100644
--- a/src/xss-common-argsort.h
+++ b/src/xss-common-argsort.h
@@ -72,7 +72,7 @@ template
-X86_SIMD_SORT_INLINE int32_t partition_vec_avx512(type_t *arg,
+X86_SIMD_SORT_FINLINE int32_t partition_vec_avx512(type_t *arg,
                                                    arrsize_t left,
                                                    arrsize_t right,
                                                    const argreg_t arg_vec,
@@ -102,7 +102,7 @@ template
-X86_SIMD_SORT_INLINE int32_t partition_vec_avx2(type_t *arg,
+X86_SIMD_SORT_FINLINE int32_t partition_vec_avx2(type_t *arg,
                                                 arrsize_t left,
                                                 arrsize_t right,
                                                 const argreg_t arg_vec,
@@ -133,7 +133,7 @@ template
-X86_SIMD_SORT_INLINE int32_t partition_vec(type_t *arg,
+X86_SIMD_SORT_FINLINE int32_t partition_vec(type_t *arg,
                                            arrsize_t left,
                                            arrsize_t right,
                                            const argreg_t arg_vec,
diff --git a/src/xss-common-comparators.hpp b/src/xss-common-comparators.hpp
index bd742cd..5e0080e 100644
--- a/src/xss-common-comparators.hpp
+++ b/src/xss-common-comparators.hpp
@@ -32,7 +32,7 @@ type_t next_value(type_t value)
 }
 
 template
-X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b);
+X86_SIMD_SORT_FINLINE void COEX(mm_t &a, mm_t &b);
 
 template
 struct Comparator {
diff --git a/src/xss-common-includes.h b/src/xss-common-includes.h
index c36f7db..1cd26a2 100644
--- a/src/xss-common-includes.h
+++ b/src/xss-common-includes.h
@@ -42,28 +42,15 @@
 #define X86_SIMD_SORT_INLINE_ONLY inline
 #define X86_SIMD_SORT_INLINE static inline
 #define X86_SIMD_SORT_FINLINE static __forceinline
-#define LIKELY(x) (x)
-#define UNLIKELY(x) (x)
-#elif defined(__CYGWIN__)
-/*
- * Force inline in cygwin to work around a compiler bug. See
- * https://github.com/numpy/numpy/pull/22315#issuecomment-1267757584
- */
-#define X86_SIMD_SORT_INLINE_ONLY inline
-#define X86_SIMD_SORT_INLINE static __attribute__((always_inline))
-#define X86_SIMD_SORT_FINLINE static __attribute__((always_inline))
 #elif defined(__GNUC__)
 #define X86_SIMD_SORT_INLINE_ONLY inline
 #define X86_SIMD_SORT_INLINE static inline
-#define X86_SIMD_SORT_FINLINE static inline __attribute__((always_inline))
-#define LIKELY(x) __builtin_expect((x), 1)
-#define UNLIKELY(x) __builtin_expect((x), 0)
+#define X86_SIMD_SORT_FORCE_INLINE inline __attribute__((always_inline))
+#define X86_SIMD_SORT_FINLINE static X86_SIMD_SORT_FORCE_INLINE
 #else
 #define X86_SIMD_SORT_INLINE_ONLY
 #define X86_SIMD_SORT_INLINE static
 #define X86_SIMD_SORT_FINLINE static
-#define LIKELY(x) (x)
-#define UNLIKELY(x) (x)
 #endif
 
 #if defined(__INTEL_COMPILER) and !defined(__SANITIZE_ADDRESS__)
@@ -114,7 +101,7 @@ struct avx2_half_vector;
 enum class simd_type : int { AVX2, AVX512 };
 
 template
-X86_SIMD_SORT_INLINE bool comparison_func(const T &a, const T &b);
+X86_SIMD_SORT_FINLINE bool comparison_func(const T &a, const T &b);
 
 struct float16 {
     uint16_t val;
diff --git a/src/xss-common-keyvaluesort.hpp b/src/xss-common-keyvaluesort.hpp
index 3a07e01..133da42 100644
--- a/src/xss-common-keyvaluesort.hpp
+++ b/src/xss-common-keyvaluesort.hpp
@@ -78,7 +78,7 @@ template
-X86_SIMD_SORT_INLINE int32_t partition_vec(type_t1 *keys,
+X86_SIMD_SORT_FINLINE int32_t partition_vec(type_t1 *keys,
                                            type_t2 *indexes,
                                            arrsize_t left,
                                            arrsize_t right,
@@ -112,7 +112,7 @@ template
-X86_SIMD_SORT_INLINE arrsize_t kvpartition(type_t1 *keys,
+X86_SIMD_SORT_FINLINE arrsize_t kvpartition(type_t1 *keys,
                                            type_t2 *indexes,
                                            arrsize_t left,
                                            arrsize_t right,
@@ -244,7 +244,7 @@ template
-X86_SIMD_SORT_INLINE arrsize_t kvpartition_unrolled(type_t1 *keys,
+X86_SIMD_SORT_FINLINE arrsize_t kvpartition_unrolled(type_t1 *keys,
                                                     type_t2 *indexes,
                                                     arrsize_t left,
                                                     arrsize_t right,
@@ -595,7 +595,7 @@ X86_SIMD_SORT_INLINE void xss_qsort_kv(
     if (minarrsize) {
         arrsize_t index_last_elem = arrsize - 1;
         if constexpr (xss::fp::is_floating_point_v) {
-            if (UNLIKELY(hasnan)) {
+            if (hasnan) [[unlikely]] {
                 index_last_elem
                         = move_nans_to_end_of_array>(
                                 keys, indexes, arrsize);
@@ -692,7 +692,7 @@ X86_SIMD_SORT_INLINE void xss_select_kv(T1 *keys,
     arrsize_t index_last_elem = arrsize - 1;
 
     if constexpr (xss::fp::is_floating_point_v) {
-        if (UNLIKELY(hasnan)) {
+        if (hasnan) [[unlikely]] {
            index_last_elem
                    = move_nans_to_end_of_array>(
                            keys, indexes, arrsize);
diff --git a/src/xss-common-qsort.h b/src/xss-common-qsort.h
index 73b947a..edb02da 100644
--- a/src/xss-common-qsort.h
+++ b/src/xss-common-qsort.h
@@ -180,7 +180,7 @@ X86_SIMD_SORT_INLINE arrsize_t move_nans_to_start_of_array(T *arr,
 }
 
 template
-X86_SIMD_SORT_INLINE bool comparison_func(const T &a, const T &b)
+X86_SIMD_SORT_FINLINE bool comparison_func(const T &a, const T &b)
 {
     return a < b;
 }
@@ -189,7 +189,7 @@ X86_SIMD_SORT_INLINE bool comparison_func(const T &a, const T &b)
  * COEX == Compare and Exchange two registers by swapping min and max values
  */
 template
-X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b)
+X86_SIMD_SORT_FINLINE void COEX(mm_t &a, mm_t &b)
 {
     mm_t temp = a;
     a = vtype::min(a, b);
@@ -199,7 +199,7 @@ X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b)
 template
-X86_SIMD_SORT_INLINE reg_t cmp_merge(reg_t in1, reg_t in2, opmask_t mask)
+X86_SIMD_SORT_FINLINE reg_t cmp_merge(reg_t in1, reg_t in2, opmask_t mask)
 {
     reg_t min = vtype::min(in2, in1);
     reg_t max = vtype::max(in2, in1);
@@ -207,7 +207,7 @@ X86_SIMD_SORT_INLINE reg_t cmp_merge(reg_t in1, reg_t in2, opmask_t mask)
 }
 
 template
-int avx512_double_compressstore(type_t *left_addr,
+X86_SIMD_SORT_FINLINE int avx512_double_compressstore(type_t *left_addr,
                                 type_t *right_addr,
                                 typename vtype::opmask_t k,
                                 reg_t reg)
@@ -226,7 +226,7 @@ template
-X86_SIMD_SORT_INLINE arrsize_t partition_vec(type_t *l_store,
+X86_SIMD_SORT_FINLINE arrsize_t partition_vec(type_t *l_store,
                                              type_t *r_store,
                                              const reg_t curr_vec,
                                              const reg_t pivot_vec,
@@ -660,7 +660,7 @@ X86_SIMD_SORT_INLINE void xss_qsort(T *arr, arrsize_t arrsize, bool hasnan)
     if (arrsize > 1) {
         arrsize_t nan_count = 0;
         if constexpr (xss::fp::is_floating_point_v) {
-            if (UNLIKELY(hasnan)) {
+            if (hasnan) [[unlikely]] {
                 nan_count = replace_nan_with_inf(arr, arrsize);
             }
         }
@@ -726,7 +726,7 @@ xss_qselect(T *arr, arrsize_t k, arrsize_t arrsize, bool hasnan)
     arrsize_t index_last_elem = arrsize - 1;
 
     if constexpr (xss::fp::is_floating_point_v) {
-        if (UNLIKELY(hasnan)) {
+        if (hasnan) [[unlikely]] {
             if constexpr (descending) {
                 index_first_elem = move_nans_to_start_of_array(arr, arrsize);
             }
diff --git a/src/xss-network-keyvaluesort.hpp b/src/xss-network-keyvaluesort.hpp
index 771d7a3..a2772d4 100644
--- a/src/xss-network-keyvaluesort.hpp
+++ b/src/xss-network-keyvaluesort.hpp
@@ -4,7 +4,7 @@
 #include "xss-common-includes.h"
 
 template
-typename vtype::opmask_t convert_int_to_mask(maskType mask)
+X86_SIMD_SORT_FORCE_INLINE typename vtype::opmask_t convert_int_to_mask(maskType mask)
 {
     if constexpr (vtype::vec_type == simd_type::AVX512) { return mask; }
     else if constexpr (vtype::vec_type == simd_type::AVX2) {
@@ -17,7 +17,7 @@ typename vtype::opmask_t convert_int_to_mask(maskType mask)
 }
 
 template
-typename valueType::opmask_t resize_mask(typename keyType::opmask_t mask)
+X86_SIMD_SORT_FORCE_INLINE typename valueType::opmask_t resize_mask(typename keyType::opmask_t mask)
 {
     using inT = typename keyType::opmask_t;
     using outT = typename valueType::opmask_t;
@@ -42,7 +42,7 @@ template
-X86_SIMD_SORT_INLINE void
+X86_SIMD_SORT_FINLINE void
 COEX(reg_t1 &key1, reg_t1 &key2, reg_t2 &index1, reg_t2 &index2)
 {
     reg_t1 key_t1 = vtype1::min(key1, key2);
@@ -64,7 +64,7 @@ template
-X86_SIMD_SORT_INLINE reg_t1 cmp_merge(reg_t1 in1,
+X86_SIMD_SORT_FINLINE reg_t1 cmp_merge(reg_t1 in1,
                                       reg_t1 in2,
                                       reg_t2 &indexes1,
                                       reg_t2 indexes2,
@@ -79,7 +79,7 @@ X86_SIMD_SORT_INLINE reg_t1 cmp_merge(reg_t1 in1,
 }
 
 template
-X86_SIMD_SORT_INLINE void
+X86_SIMD_SORT_FINLINE void
 bitonic_merge_dispatch(typename keyType::reg_t &key,
                        typename valueType::reg_t &value)
 {
@@ -102,7 +102,7 @@ bitonic_merge_dispatch(typename keyType::reg_t &key,
 }
 
 template
-X86_SIMD_SORT_INLINE void sort_vec_dispatch(typename keyType::reg_t &key,
+X86_SIMD_SORT_FINLINE void sort_vec_dispatch(typename keyType::reg_t &key,
                                             typename valueType::reg_t &value)
 {
     constexpr int numlanes = keyType::numlanes;
diff --git a/src/xss-pivot-selection.hpp b/src/xss-pivot-selection.hpp
index c09dfc6..9b8027b 100644
--- a/src/xss-pivot-selection.hpp
+++ b/src/xss-pivot-selection.hpp
@@ -21,7 +21,7 @@ struct pivot_results {
 };
 
 template
-X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b);
+X86_SIMD_SORT_FINLINE void COEX(mm_t &a, mm_t &b);
 
 template
 X86_SIMD_SORT_INLINE type_t get_pivot(type_t *arr,
diff --git a/src/xss-reg-networks.hpp b/src/xss-reg-networks.hpp
index 727cccf..070e4d0 100644
--- a/src/xss-reg-networks.hpp
+++ b/src/xss-reg-networks.hpp
@@ -7,14 +7,14 @@ template
 typename vtype::opmask_t convert_int_to_mask(maskType mask);
 
 template
-X86_SIMD_SORT_INLINE reg_t cmp_merge(reg_t in1, reg_t in2, opmask_t mask);
+X86_SIMD_SORT_FINLINE reg_t cmp_merge(reg_t in1, reg_t in2, opmask_t mask);
 
 template
-X86_SIMD_SORT_INLINE reg_t1 cmp_merge(reg_t1 in1,
+X86_SIMD_SORT_FINLINE reg_t1 cmp_merge(reg_t1 in1,
                                       reg_t1 in2,
                                       reg_t2 &indexes1,
                                       reg_t2 indexes2,
@@ -27,7 +27,7 @@ X86_SIMD_SORT_INLINE reg_t1 cmp_merge(reg_t1 in1,
  * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
  */
 template
-X86_SIMD_SORT_INLINE reg_t sort_reg_4lanes(reg_t reg)
+static X86_SIMD_SORT_FORCE_INLINE reg_t sort_reg_4lanes(reg_t reg)
 {
     using swizzle = typename vtype::swizzle_ops;
 
@@ -47,7 +47,7 @@ X86_SIMD_SORT_INLINE reg_t sort_reg_4lanes(reg_t reg)
  * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
 */
 template
-X86_SIMD_SORT_INLINE reg_t sort_reg_8lanes(reg_t reg)
+static X86_SIMD_SORT_FORCE_INLINE reg_t sort_reg_8lanes(reg_t reg)
 {
     using swizzle = typename vtype::swizzle_ops;
 
@@ -72,7 +72,7 @@ X86_SIMD_SORT_INLINE reg_t sort_reg_8lanes(reg_t reg)
  * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
 */
 template
-X86_SIMD_SORT_INLINE reg_t sort_reg_16lanes(reg_t reg)
+static X86_SIMD_SORT_FORCE_INLINE reg_t sort_reg_16lanes(reg_t reg)
 {
     using swizzle = typename vtype::swizzle_ops;
 
@@ -109,7 +109,7 @@ X86_SIMD_SORT_INLINE reg_t sort_reg_16lanes(reg_t reg)
  * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
 */
 template
-X86_SIMD_SORT_INLINE reg_t sort_reg_32lanes(reg_t reg)
+static X86_SIMD_SORT_FORCE_INLINE reg_t sort_reg_32lanes(reg_t reg)
 {
     using swizzle = typename vtype::swizzle_ops;
 
@@ -168,7 +168,7 @@ template
-X86_SIMD_SORT_INLINE reg_t sort_reg_4lanes(reg_t key_reg, index_type &index_reg)
+static X86_SIMD_SORT_FORCE_INLINE reg_t sort_reg_4lanes(reg_t key_reg, index_type &index_reg)
 {
     using key_swizzle = typename vtype1::swizzle_ops;
     using index_swizzle = typename vtype2::swizzle_ops;
@@ -201,7 +201,7 @@ template
-X86_SIMD_SORT_INLINE reg_t sort_reg_8lanes(reg_t key_reg, index_type &index_reg)
+static X86_SIMD_SORT_FORCE_INLINE reg_t sort_reg_8lanes(reg_t key_reg, index_type &index_reg)
 {
     using key_swizzle = typename vtype1::swizzle_ops;
     using index_swizzle = typename vtype2::swizzle_ops;
@@ -253,7 +253,7 @@ template
-X86_SIMD_SORT_INLINE reg_t sort_reg_16lanes(reg_t key_reg,
+static X86_SIMD_SORT_FORCE_INLINE reg_t sort_reg_16lanes(reg_t key_reg,
                                             index_type &index_reg)
 {
     using key_swizzle = typename vtype1::swizzle_ops;
@@ -332,7 +332,7 @@ template
-X86_SIMD_SORT_INLINE reg_t bitonic_merge_reg_4lanes(reg_t key_reg,
+static X86_SIMD_SORT_FORCE_INLINE reg_t bitonic_merge_reg_4lanes(reg_t key_reg,
                                                     index_type &index_reg)
 {
     using key_swizzle = typename vtype1::swizzle_ops;
@@ -363,7 +363,7 @@ template
-X86_SIMD_SORT_INLINE reg_t bitonic_merge_reg_8lanes(reg_t key_reg,
+static X86_SIMD_SORT_FORCE_INLINE reg_t bitonic_merge_reg_8lanes(reg_t key_reg,
                                                     index_type &index_reg)
 {
     using key_swizzle = typename vtype1::swizzle_ops;
@@ -402,7 +402,7 @@ template
-X86_SIMD_SORT_INLINE reg_t bitonic_merge_reg_16lanes(reg_t key_reg,
+static X86_SIMD_SORT_FORCE_INLINE reg_t bitonic_merge_reg_16lanes(reg_t key_reg,
                                                      index_type &index_reg)
 {
     using key_swizzle = typename vtype1::swizzle_ops;