
 #include "avx512-16bit-common.h"

-struct float16 {
-    uint16_t val;
-};
-
-template <>
-struct zmm_vector<float16> {
-    using type_t = uint16_t;
-    using zmm_t = __m512i;
-    using ymm_t = __m256i;
-    using opmask_t = __mmask32;
-    static const uint8_t numlanes = 32;
-
-    static zmm_t get_network(int index)
-    {
-        return _mm512_loadu_si512(&network[index - 1][0]);
-    }
-    static type_t type_max()
-    {
-        return X86_SIMD_SORT_INFINITYH;
-    }
-    static type_t type_min()
-    {
-        return X86_SIMD_SORT_NEGINFINITYH;
-    }
-    static zmm_t zmm_max()
-    {
-        return _mm512_set1_epi16(type_max());
-    }
-    static opmask_t knot_opmask(opmask_t x)
-    {
-        return _knot_mask32(x);
-    }
-
-    static opmask_t ge(zmm_t x, zmm_t y)
-    {
-        zmm_t sign_x = _mm512_and_si512(x, _mm512_set1_epi16(0x8000));
-        zmm_t sign_y = _mm512_and_si512(y, _mm512_set1_epi16(0x8000));
-        zmm_t exp_x = _mm512_and_si512(x, _mm512_set1_epi16(0x7c00));
-        zmm_t exp_y = _mm512_and_si512(y, _mm512_set1_epi16(0x7c00));
-        zmm_t mant_x = _mm512_and_si512(x, _mm512_set1_epi16(0x3ff));
-        zmm_t mant_y = _mm512_and_si512(y, _mm512_set1_epi16(0x3ff));
-
-        __mmask32 mask_ge = _mm512_cmp_epu16_mask(
-                sign_x, sign_y, _MM_CMPINT_LT); // only greater than
-        __mmask32 sign_eq = _mm512_cmpeq_epu16_mask(sign_x, sign_y);
-        __mmask32 neg = _mm512_mask_cmpeq_epu16_mask(
-                sign_eq,
-                sign_x,
-                _mm512_set1_epi16(0x8000)); // both numbers are -ve
-
-        // compare exponents only if signs are equal:
-        mask_ge = mask_ge
-                | _mm512_mask_cmp_epu16_mask(
-                        sign_eq, exp_x, exp_y, _MM_CMPINT_NLE);
-        // get mask for elements for which both sign and exponents are equal:
-        __mmask32 exp_eq = _mm512_mask_cmpeq_epu16_mask(sign_eq, exp_x, exp_y);
-
-        // compare mantissa for elements for which both sign and exponent are equal:
-        mask_ge = mask_ge
-                | _mm512_mask_cmp_epu16_mask(
-                        exp_eq, mant_x, mant_y, _MM_CMPINT_NLT);
-        return _kxor_mask32(mask_ge, neg);
-    }
-    static zmm_t loadu(void const *mem)
-    {
-        return _mm512_loadu_si512(mem);
-    }
-    static zmm_t max(zmm_t x, zmm_t y)
-    {
-        return _mm512_mask_mov_epi16(y, ge(x, y), x);
-    }
-    static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x)
-    {
-        // AVX512_VBMI2
-        return _mm512_mask_compressstoreu_epi16(mem, mask, x);
-    }
-    static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem)
-    {
-        // AVX512BW
-        return _mm512_mask_loadu_epi16(x, mask, mem);
-    }
-    static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y)
-    {
-        return _mm512_mask_mov_epi16(x, mask, y);
-    }
-    static void mask_storeu(void *mem, opmask_t mask, zmm_t x)
-    {
-        return _mm512_mask_storeu_epi16(mem, mask, x);
-    }
-    static zmm_t min(zmm_t x, zmm_t y)
-    {
-        return _mm512_mask_mov_epi16(x, ge(x, y), y);
-    }
-    static zmm_t permutexvar(__m512i idx, zmm_t zmm)
-    {
-        return _mm512_permutexvar_epi16(idx, zmm);
-    }
-    // Apparently this is terrible for perf, npy_half_to_float seems to work
-    // better
-    //static float uint16_to_float(uint16_t val)
-    //{
-    //    // Ideally use _mm_loadu_si16, but it's only in gcc > 11.x
-    //    // TODO: use inline ASM? https://godbolt.org/z/aGYvh7fMM
-    //    __m128i xmm = _mm_maskz_loadu_epi16(0x01, &val);
-    //    __m128 xmm2 = _mm_cvtph_ps(xmm);
-    //    return _mm_cvtss_f32(xmm2);
-    //}
-    static type_t float_to_uint16(float val)
-    {
-        __m128 xmm = _mm_load_ss(&val);
-        __m128i xmm2 = _mm_cvtps_ph(xmm, _MM_FROUND_NO_EXC);
-        return _mm_extract_epi16(xmm2, 0);
-    }
-    static type_t reducemax(zmm_t v)
-    {
-        __m512 lo = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(v, 0));
-        __m512 hi = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(v, 1));
-        float lo_max = _mm512_reduce_max_ps(lo);
-        float hi_max = _mm512_reduce_max_ps(hi);
-        return float_to_uint16(std::max(lo_max, hi_max));
-    }
-    static type_t reducemin(zmm_t v)
-    {
-        __m512 lo = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(v, 0));
-        __m512 hi = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(v, 1));
-        float lo_min = _mm512_reduce_min_ps(lo);
-        float hi_min = _mm512_reduce_min_ps(hi);
-        return float_to_uint16(std::min(lo_min, hi_min));
-    }
-    static zmm_t set1(type_t v)
-    {
-        return _mm512_set1_epi16(v);
-    }
-    template <uint8_t mask>
-    static zmm_t shuffle(zmm_t zmm)
-    {
-        zmm = _mm512_shufflehi_epi16(zmm, (_MM_PERM_ENUM)mask);
-        return _mm512_shufflelo_epi16(zmm, (_MM_PERM_ENUM)mask);
-    }
-    static void storeu(void *mem, zmm_t x)
-    {
-        return _mm512_storeu_si512(mem, x);
-    }
-};
-
-template <>
-struct zmm_vector<int16_t> {
-    using type_t = int16_t;
-    using zmm_t = __m512i;
-    using ymm_t = __m256i;
-    using opmask_t = __mmask32;
-    static const uint8_t numlanes = 32;
-
-    static zmm_t get_network(int index)
-    {
-        return _mm512_loadu_si512(&network[index - 1][0]);
-    }
-    static type_t type_max()
-    {
-        return X86_SIMD_SORT_MAX_INT16;
-    }
-    static type_t type_min()
-    {
-        return X86_SIMD_SORT_MIN_INT16;
-    }
-    static zmm_t zmm_max()
-    {
-        return _mm512_set1_epi16(type_max());
-    }
-    static opmask_t knot_opmask(opmask_t x)
-    {
-        return _knot_mask32(x);
-    }
-
-    static opmask_t ge(zmm_t x, zmm_t y)
-    {
-        return _mm512_cmp_epi16_mask(x, y, _MM_CMPINT_NLT);
-    }
-    static zmm_t loadu(void const *mem)
-    {
-        return _mm512_loadu_si512(mem);
-    }
-    static zmm_t max(zmm_t x, zmm_t y)
-    {
-        return _mm512_max_epi16(x, y);
-    }
-    static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x)
-    {
-        // AVX512_VBMI2
-        return _mm512_mask_compressstoreu_epi16(mem, mask, x);
-    }
-    static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem)
-    {
-        // AVX512BW
-        return _mm512_mask_loadu_epi16(x, mask, mem);
-    }
-    static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y)
-    {
-        return _mm512_mask_mov_epi16(x, mask, y);
-    }
-    static void mask_storeu(void *mem, opmask_t mask, zmm_t x)
-    {
-        return _mm512_mask_storeu_epi16(mem, mask, x);
-    }
-    static zmm_t min(zmm_t x, zmm_t y)
-    {
-        return _mm512_min_epi16(x, y);
-    }
-    static zmm_t permutexvar(__m512i idx, zmm_t zmm)
-    {
-        return _mm512_permutexvar_epi16(idx, zmm);
-    }
-    static type_t reducemax(zmm_t v)
-    {
-        zmm_t lo = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 0));
-        zmm_t hi = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 1));
-        type_t lo_max = (type_t)_mm512_reduce_max_epi32(lo);
-        type_t hi_max = (type_t)_mm512_reduce_max_epi32(hi);
-        return std::max(lo_max, hi_max);
-    }
-    static type_t reducemin(zmm_t v)
-    {
-        zmm_t lo = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 0));
-        zmm_t hi = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(v, 1));
-        type_t lo_min = (type_t)_mm512_reduce_min_epi32(lo);
-        type_t hi_min = (type_t)_mm512_reduce_min_epi32(hi);
-        return std::min(lo_min, hi_min);
-    }
-    static zmm_t set1(type_t v)
-    {
-        return _mm512_set1_epi16(v);
-    }
-    template <uint8_t mask>
-    static zmm_t shuffle(zmm_t zmm)
-    {
-        zmm = _mm512_shufflehi_epi16(zmm, (_MM_PERM_ENUM)mask);
-        return _mm512_shufflelo_epi16(zmm, (_MM_PERM_ENUM)mask);
-    }
-    static void storeu(void *mem, zmm_t x)
-    {
-        return _mm512_storeu_si512(mem, x);
-    }
-};
-template <>
-struct zmm_vector<uint16_t> {
-    using type_t = uint16_t;
-    using zmm_t = __m512i;
-    using ymm_t = __m256i;
-    using opmask_t = __mmask32;
-    static const uint8_t numlanes = 32;
-
-    static zmm_t get_network(int index)
-    {
-        return _mm512_loadu_si512(&network[index - 1][0]);
-    }
-    static type_t type_max()
-    {
-        return X86_SIMD_SORT_MAX_UINT16;
-    }
-    static type_t type_min()
-    {
-        return 0;
-    }
-    static zmm_t zmm_max()
-    {
-        return _mm512_set1_epi16(type_max());
-    }
-
-    static opmask_t knot_opmask(opmask_t x)
-    {
-        return _knot_mask32(x);
-    }
-    static opmask_t ge(zmm_t x, zmm_t y)
-    {
-        return _mm512_cmp_epu16_mask(x, y, _MM_CMPINT_NLT);
-    }
-    static zmm_t loadu(void const *mem)
-    {
-        return _mm512_loadu_si512(mem);
-    }
-    static zmm_t max(zmm_t x, zmm_t y)
-    {
-        return _mm512_max_epu16(x, y);
-    }
-    static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x)
-    {
-        return _mm512_mask_compressstoreu_epi16(mem, mask, x);
-    }
-    static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem)
-    {
-        return _mm512_mask_loadu_epi16(x, mask, mem);
-    }
-    static zmm_t mask_mov(zmm_t x, opmask_t mask, zmm_t y)
-    {
-        return _mm512_mask_mov_epi16(x, mask, y);
-    }
-    static void mask_storeu(void *mem, opmask_t mask, zmm_t x)
-    {
-        return _mm512_mask_storeu_epi16(mem, mask, x);
-    }
-    static zmm_t min(zmm_t x, zmm_t y)
-    {
-        return _mm512_min_epu16(x, y);
-    }
-    static zmm_t permutexvar(__m512i idx, zmm_t zmm)
-    {
-        return _mm512_permutexvar_epi16(idx, zmm);
-    }
-    static type_t reducemax(zmm_t v)
-    {
-        zmm_t lo = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 0));
-        zmm_t hi = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 1));
-        type_t lo_max = (type_t)_mm512_reduce_max_epi32(lo);
-        type_t hi_max = (type_t)_mm512_reduce_max_epi32(hi);
-        return std::max(lo_max, hi_max);
-    }
-    static type_t reducemin(zmm_t v)
-    {
-        zmm_t lo = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 0));
-        zmm_t hi = _mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64(v, 1));
-        type_t lo_min = (type_t)_mm512_reduce_min_epi32(lo);
-        type_t hi_min = (type_t)_mm512_reduce_min_epi32(hi);
-        return std::min(lo_min, hi_min);
-    }
-    static zmm_t set1(type_t v)
-    {
-        return _mm512_set1_epi16(v);
-    }
-    template <uint8_t mask>
-    static zmm_t shuffle(zmm_t zmm)
-    {
-        zmm = _mm512_shufflehi_epi16(zmm, (_MM_PERM_ENUM)mask);
-        return _mm512_shufflelo_epi16(zmm, (_MM_PERM_ENUM)mask);
-    }
-    static void storeu(void *mem, zmm_t x)
-    {
-        return _mm512_storeu_si512(mem, x);
-    }
-};
-
 template <>
 bool comparison_func<zmm_vector<float16>>(const uint16_t &a, const uint16_t &b)
 {
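The one non-trivial piece of the removed code is zmm_vector<float16>::ge(), which orders FP16 values directly on their raw bit patterns: each lane is split into sign (0x8000), exponent (0x7c00) and mantissa (0x03ff) fields, the fields are compared in order of significance for lanes whose higher-order fields are equal, and the result is flipped for lanes where both operands are negative. Below is a minimal scalar sketch of that ordering for reference; it is an illustration only, not part of this commit, the helper name fp16_ge is made up here, and, like the SIMD code, it gives no special treatment to NaN or to -0.0 versus +0.0.

#include <cstdint>

// Scalar mirror of the removed zmm_vector<float16>::ge() bit-field ordering.
static bool fp16_ge(uint16_t x, uint16_t y)
{
    uint16_t sign_x = x & 0x8000, sign_y = y & 0x8000;
    uint16_t exp_x = x & 0x7c00, exp_y = y & 0x7c00;
    uint16_t mant_x = x & 0x03ff, mant_y = y & 0x03ff;

    bool ge = sign_x < sign_y;               // x non-negative, y negative
    bool sign_eq = sign_x == sign_y;
    bool neg = sign_eq && sign_x == 0x8000;  // both negative: order reverses
    ge = ge || (sign_eq && exp_x > exp_y);   // same sign: compare exponents
    bool exp_eq = sign_eq && exp_x == exp_y;
    ge = ge || (exp_eq && mant_x >= mant_y); // same exponent: compare mantissas
    return ge != neg;                        // XOR, as _kxor_mask32 does lane-wise
}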