 
 #include "avx512-16bit-common.h"
 
+struct float16 {
+    uint16_t val;
+};
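+// float16 is an opaque wrapper around the raw IEEE binary16 bit pattern;
+// the traits below treat it as uint16_t and impose float ordering in ge().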
+
+template <>
+struct zmm_vector<float16> {
+    using type_t = uint16_t;
+    using zmm_t = __m512i;
+    using ymm_t = __m256i;
+    using opmask_t = __mmask32;
+    static const uint8_t numlanes = 32;
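+    // 512 bits / 16 bits per element = 32 lanes, one opmask bit per lane.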
+
+    static zmm_t get_network(int index)
+    {
+        return _mm512_loadu_si512(&network[index - 1][0]);
+    }
+    static type_t type_max()
+    {
+        return X86_SIMD_SORT_INFINITYH;
+    }
+    static type_t type_min()
+    {
+        return X86_SIMD_SORT_NEGINFINITYH;
+    }
+    static zmm_t zmm_max()
+    {
+        return _mm512_set1_epi16(type_max());
+    }
+    static opmask_t knot_opmask(opmask_t x)
+    {
+        return _knot_mask32(x);
+    }
+
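+    // binary16 layout: 1 sign bit (0x8000), 5 exponent bits (0x7c00) and
+    // 10 mantissa bits (0x03ff). For two values of equal sign, comparing
+    // the exponent and then the mantissa as unsigned integers matches
+    // numeric order; when both values are negative that order is reversed,
+    // which the final kxor with the "both negative" mask corrects.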
+    static opmask_t ge(zmm_t x, zmm_t y)
+    {
+        zmm_t sign_x = _mm512_and_si512(x, _mm512_set1_epi16(0x8000));
+        zmm_t sign_y = _mm512_and_si512(y, _mm512_set1_epi16(0x8000));
+        zmm_t exp_x = _mm512_and_si512(x, _mm512_set1_epi16(0x7c00));
+        zmm_t exp_y = _mm512_and_si512(y, _mm512_set1_epi16(0x7c00));
+        zmm_t mant_x = _mm512_and_si512(x, _mm512_set1_epi16(0x3ff));
+        zmm_t mant_y = _mm512_and_si512(y, _mm512_set1_epi16(0x3ff));
+
+        __mmask32 mask_ge = _mm512_cmp_epu16_mask(
+                sign_x, sign_y, _MM_CMPINT_LT); // x non-negative, y negative: x > y
+        __mmask32 sign_eq = _mm512_cmpeq_epu16_mask(sign_x, sign_y);
+        __mmask32 neg = _mm512_mask_cmpeq_epu16_mask(
+                sign_eq,
+                sign_x,
+                _mm512_set1_epi16(0x8000)); // both numbers are negative
+
+        // compare exponents only if signs are equal:
+        mask_ge = mask_ge
+                | _mm512_mask_cmp_epu16_mask(
+                        sign_eq, exp_x, exp_y, _MM_CMPINT_NLE);
+        // get mask for elements for which both sign and exponent are equal:
+        __mmask32 exp_eq = _mm512_mask_cmpeq_epu16_mask(sign_eq, exp_x, exp_y);
+
+        // compare mantissa for elements for which both sign and exponent are equal:
+        mask_ge = mask_ge
+                | _mm512_mask_cmp_epu16_mask(
+                        exp_eq, mant_x, mant_y, _MM_CMPINT_NLT);
+        return _kxor_mask32(mask_ge, neg);
+    }
+    static zmm_t loadu(void const *mem)
+    {
+        return _mm512_loadu_si512(mem);
+    }
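+    // max/min are built on ge(): keep x wherever ge(x, y) is set, else y
+    // (and the reverse for min), since there is no native fp16 max/min
+    // without AVX512-FP16.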
+    static zmm_t max(zmm_t x, zmm_t y)
+    {
+        return _mm512_mask_mov_epi16(y, ge(x, y), x);
+    }
+    static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x)
+    {
+        // AVX512_VBMI2
+        return _mm512_mask_compressstoreu_epi16(mem, mask, x);
+    }
+    static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem)
+    {
+        // AVX512BW
+        return _mm512_mask_loadu_epi16(x, mask, mem);
+    }
+    static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y)
+    {
+        return _mm512_mask_mov_epi16(x, mask, y);
+    }
+    static void mask_storeu(void *mem, opmask_t mask, zmm_t x)
+    {
+        return _mm512_mask_storeu_epi16(mem, mask, x);
+    }
+    static zmm_t min(zmm_t x, zmm_t y)
+    {
+        return _mm512_mask_mov_epi16(x, ge(x, y), y);
+    }
+    static zmm_t permutexvar(__m512i idx, zmm_t zmm)
+    {
+        return _mm512_permutexvar_epi16(idx, zmm);
+    }
+    // Apparently this is terrible for perf; npy_half_to_float seems to
+    // work better.
+    //static float uint16_to_float(uint16_t val)
+    //{
+    //    // Ideally use _mm_loadu_si16, but it is only available in gcc > 11.x
+    //    // TODO: use inline ASM? https://godbolt.org/z/aGYvh7fMM
+    //    __m128i xmm = _mm_maskz_loadu_epi16(0x01, &val);
+    //    __m128 xmm2 = _mm_cvtph_ps(xmm);
+    //    return _mm_cvtss_f32(xmm2);
+    //}
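+    // Convert a single float to its binary16 bit pattern via vcvtps2ph,
+    // then extract the 16-bit result from lane 0.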
+    static type_t float_to_uint16(float val)
+    {
+        __m128 xmm = _mm_load_ss(&val);
+        __m128i xmm2 = _mm_cvtps_ph(xmm, _MM_FROUND_NO_EXC);
+        return _mm_extract_epi16(xmm2, 0);
+    }
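+    // There is no fp16 horizontal-reduce intrinsic here, so widen each
+    // 256-bit half to 16 floats with vcvtph2ps and reduce in fp32.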
+    static type_t reducemax(zmm_t v)
+    {
+        __m512 lo = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(v, 0));
+        __m512 hi = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(v, 1));
+        float lo_max = _mm512_reduce_max_ps(lo);
+        float hi_max = _mm512_reduce_max_ps(hi);
+        return float_to_uint16(std::max(lo_max, hi_max));
+    }
+    static type_t reducemin(zmm_t v)
+    {
+        __m512 lo = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(v, 0));
+        __m512 hi = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(v, 1));
+        float lo_min = _mm512_reduce_min_ps(lo);
+        float hi_min = _mm512_reduce_min_ps(hi);
+        return float_to_uint16(std::min(lo_min, hi_min));
+    }
+    static zmm_t set1(type_t v)
+    {
+        return _mm512_set1_epi16(v);
+    }
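+    // shufflehi permutes words 4..7 and shufflelo words 0..3 within each
+    // 128-bit lane, both using the same immediate pattern.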
+    template <uint8_t mask>
+    static zmm_t shuffle(zmm_t zmm)
+    {
+        zmm = _mm512_shufflehi_epi16(zmm, (_MM_PERM_ENUM)mask);
+        return _mm512_shufflelo_epi16(zmm, (_MM_PERM_ENUM)mask);
+    }
+    static void storeu(void *mem, zmm_t x)
+    {
+        return _mm512_storeu_si512(mem, x);
+    }
+};
+
+template <>
+struct zmm_vector<int16_t> {
+    using type_t = int16_t;
+    using zmm_t = __m512i;
+    using ymm_t = __m256i;
+    using opmask_t = __mmask32;
+    static const uint8_t numlanes = 32;
+
+    static zmm_t get_network(int index)
+    {
+        return _mm512_loadu_si512(&network[index - 1][0]);
+    }
+    static type_t type_max()
+    {
+        return X86_SIMD_SORT_MAX_INT16;
+    }
+    static type_t type_min()
+    {
+        return X86_SIMD_SORT_MIN_INT16;
+    }
+    static zmm_t zmm_max()
+    {
+        return _mm512_set1_epi16(type_max());
+    }
+    static opmask_t knot_opmask(opmask_t x)
+    {
+        return _knot_mask32(x);
+    }
+
+    static opmask_t ge(zmm_t x, zmm_t y)
+    {
+        return _mm512_cmp_epi16_mask(x, y, _MM_CMPINT_NLT);
+    }
+    static zmm_t loadu(void const *mem)
+    {
+        return _mm512_loadu_si512(mem);
+    }
+    static zmm_t max(zmm_t x, zmm_t y)
+    {
+        return _mm512_max_epi16(x, y);
+    }
+    static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x)
+    {
+        // AVX512_VBMI2
+        return _mm512_mask_compressstoreu_epi16(mem, mask, x);
+    }
+    static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem)
+    {
+        // AVX512BW
+        return _mm512_mask_loadu_epi16(x, mask, mem);
+    }
+    static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y)
+    {
+        return _mm512_mask_mov_epi16(x, mask, y);
+    }
+    static void mask_storeu(void *mem, opmask_t mask, zmm_t x)
+    {
+        return _mm512_mask_storeu_epi16(mem, mask, x);
+    }
+    static zmm_t min(zmm_t x, zmm_t y)
+    {
+        return _mm512_min_epi16(x, y);
+    }
+    static zmm_t permutexvar(__m512i idx, zmm_t zmm)
+    {
+        return _mm512_permutexvar_epi16(idx, zmm);
+    }
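+    // AVX512 has no 16-bit reduce intrinsics; sign-extend each 256-bit
+    // half to int32 and reduce there.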
+    static type_t reducemax(zmm_t v)
+    {
+        zmm_t lo = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 0));
+        zmm_t hi = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 1));
+        type_t lo_max = (type_t)_mm512_reduce_max_epi32(lo);
+        type_t hi_max = (type_t)_mm512_reduce_max_epi32(hi);
+        return std::max(lo_max, hi_max);
+    }
+    static type_t reducemin(zmm_t v)
+    {
+        zmm_t lo = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 0));
+        zmm_t hi = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 1));
+        type_t lo_min = (type_t)_mm512_reduce_min_epi32(lo);
+        type_t hi_min = (type_t)_mm512_reduce_min_epi32(hi);
+        return std::min(lo_min, hi_min);
+    }
+    static zmm_t set1(type_t v)
+    {
+        return _mm512_set1_epi16(v);
+    }
+    template <uint8_t mask>
+    static zmm_t shuffle(zmm_t zmm)
+    {
+        zmm = _mm512_shufflehi_epi16(zmm, (_MM_PERM_ENUM)mask);
+        return _mm512_shufflelo_epi16(zmm, (_MM_PERM_ENUM)mask);
+    }
+    static void storeu(void *mem, zmm_t x)
+    {
+        return _mm512_storeu_si512(mem, x);
+    }
+};
+
+template <>
+struct zmm_vector<uint16_t> {
+    using type_t = uint16_t;
+    using zmm_t = __m512i;
+    using ymm_t = __m256i;
+    using opmask_t = __mmask32;
+    static const uint8_t numlanes = 32;
+
+    static zmm_t get_network(int index)
+    {
+        return _mm512_loadu_si512(&network[index - 1][0]);
+    }
+    static type_t type_max()
+    {
+        return X86_SIMD_SORT_MAX_UINT16;
+    }
+    static type_t type_min()
+    {
+        return 0;
+    }
+    static zmm_t zmm_max()
+    {
+        return _mm512_set1_epi16(type_max());
+    }
+
+    static opmask_t knot_opmask(opmask_t x)
+    {
+        return _knot_mask32(x);
+    }
+    static opmask_t ge(zmm_t x, zmm_t y)
+    {
+        return _mm512_cmp_epu16_mask(x, y, _MM_CMPINT_NLT);
+    }
+    static zmm_t loadu(void const *mem)
+    {
+        return _mm512_loadu_si512(mem);
+    }
+    static zmm_t max(zmm_t x, zmm_t y)
+    {
+        return _mm512_max_epu16(x, y);
+    }
+    static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x)
+    {
+        return _mm512_mask_compressstoreu_epi16(mem, mask, x);
+    }
+    static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem)
+    {
+        return _mm512_mask_loadu_epi16(x, mask, mem);
+    }
+    static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y)
+    {
+        return _mm512_mask_mov_epi16(x, mask, y);
+    }
+    static void mask_storeu(void *mem, opmask_t mask, zmm_t x)
+    {
+        return _mm512_mask_storeu_epi16(mem, mask, x);
+    }
+    static zmm_t min(zmm_t x, zmm_t y)
+    {
+        return _mm512_min_epu16(x, y);
+    }
+    static zmm_t permutexvar(__m512i idx, zmm_t zmm)
+    {
+        return _mm512_permutexvar_epi16(idx, zmm);
+    }
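+    // Zero-extend to int32 before reducing; every uint16_t value fits in
+    // a non-negative int32, so the signed reduce preserves the order.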
+    static type_t reducemax(zmm_t v)
+    {
+        zmm_t lo = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 0));
+        zmm_t hi = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 1));
+        type_t lo_max = (type_t)_mm512_reduce_max_epi32(lo);
+        type_t hi_max = (type_t)_mm512_reduce_max_epi32(hi);
+        return std::max(lo_max, hi_max);
+    }
+    static type_t reducemin(zmm_t v)
+    {
+        zmm_t lo = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 0));
+        zmm_t hi = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 1));
+        type_t lo_min = (type_t)_mm512_reduce_min_epi32(lo);
+        type_t hi_min = (type_t)_mm512_reduce_min_epi32(hi);
+        return std::min(lo_min, hi_min);
+    }
+    static zmm_t set1(type_t v)
+    {
+        return _mm512_set1_epi16(v);
+    }
+    template <uint8_t mask>
+    static zmm_t shuffle(zmm_t zmm)
+    {
+        zmm = _mm512_shufflehi_epi16(zmm, (_MM_PERM_ENUM)mask);
+        return _mm512_shufflelo_epi16(zmm, (_MM_PERM_ENUM)mask);
+    }
+    static void storeu(void *mem, zmm_t x)
+    {
+        return _mm512_storeu_si512(mem, x);
+    }
+};
+
 template <>
 bool comparison_func<zmm_vector<float16>>(const uint16_t &a, const uint16_t &b)
 {