Get rid of the avx2_mask_helper

Raghuveer Devulapalli · Raghuveer Devulapalli · commit 9ad4432e6d44 · 2023-10-23T13:01:27.000-07:00
diff --git a/src/avx2-32bit-common.h b/src/avx2-32bit-common.h
@@ -15,7 +15,7 @@
  * sorting network (see
  * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg)
  */
- 
+
 // ymm                  7, 6, 5, 4, 3, 2, 1, 0
 #define NETWORK_32BIT_AVX2_1 4, 5, 6, 7, 0, 1, 2, 3
 #define NETWORK_32BIT_AVX2_2 0, 1, 2, 3, 4, 5, 6, 7
@@ -58,11 +58,11 @@ struct avx2_vector<int32_t> {
     using type_t = int32_t;
     using reg_t = __m256i;
     using ymmi_t = __m256i;
-    using opmask_t = avx2_mask_helper32;
+    using opmask_t = __m256i;
     static const uint8_t numlanes = 8;
     static constexpr int network_sort_threshold = 256;
     static constexpr int partition_unroll_factor = 4;
-    
+
     using swizzle_ops = avx2_32bit_swizzle_ops;
 
     static type_t type_max()
@@ -77,7 +77,11 @@ struct avx2_vector<int32_t> {
     {
         return _mm256_set1_epi32(type_max());
     } // TODO: this should broadcast bits as is?
-
+    static opmask_t get_partial_loadmask(uint64_t num_to_read)
+    {
+        auto mask = ((0x1ull << num_to_read) - 0x1ull);
+        return convert_int_to_avx2_mask(mask);
+    }
     static ymmi_t
     seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7, int v8)
     {
@@ -215,11 +219,11 @@ struct avx2_vector<uint32_t> {
     using type_t = uint32_t;
     using reg_t = __m256i;
     using ymmi_t = __m256i;
-    using opmask_t = avx2_mask_helper32;
+    using opmask_t = __m256i;
     static const uint8_t numlanes = 8;
     static constexpr int network_sort_threshold = 256;
     static constexpr int partition_unroll_factor = 4;
-    
+
     using swizzle_ops = avx2_32bit_swizzle_ops;
 
     static type_t type_max()
@@ -234,7 +238,11 @@ struct avx2_vector<uint32_t> {
     {
         return _mm256_set1_epi32(type_max());
     }
-
+    static opmask_t get_partial_loadmask(uint64_t num_to_read)
+    {
+        auto mask = ((0x1ull << num_to_read) - 0x1ull);
+        return convert_int_to_avx2_mask(mask);
+    }
     static ymmi_t
     seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7, int v8)
     {
@@ -357,11 +365,11 @@ struct avx2_vector<float> {
     using type_t = float;
     using reg_t = __m256;
     using ymmi_t = __m256i;
-    using opmask_t = avx2_mask_helper32;
+    using opmask_t = __m256i;
     static const uint8_t numlanes = 8;
     static constexpr int network_sort_threshold = 256;
     static constexpr int partition_unroll_factor = 4;
-    
+
     using swizzle_ops = avx2_32bit_swizzle_ops;
 
     static type_t type_max()
@@ -399,9 +407,14 @@ struct avx2_vector<float> {
     {
         return _mm256_castps_si256(_mm256_cmp_ps(x, y, _CMP_EQ_OQ));
     }
-    static opmask_t get_partial_loadmask(int size)
+    static opmask_t get_partial_loadmask(uint64_t num_to_read)
+    {
+        auto mask = ((0x1ull << num_to_read) - 0x1ull);
+        return convert_int_to_avx2_mask(mask);
+    }
+    static int32_t convert_mask_to_int(opmask_t mask)
     {
-        return (0x0001 << size) - 0x0001;
+        return convert_avx2_mask_to_int(mask);
     }
     template <int type>
     static opmask_t fpclass(reg_t x)
diff --git a/src/avx2-emu-funcs.hpp b/src/avx2-emu-funcs.hpp
@@ -46,50 +46,21 @@ constexpr auto avx2_compressstore_lut32_gen = [] {
     }
     return lutPair;
 }();
+
 constexpr auto avx2_compressstore_lut32_perm = avx2_compressstore_lut32_gen[0];
 constexpr auto avx2_compressstore_lut32_left = avx2_compressstore_lut32_gen[1];
 
-struct avx2_mask_helper32 {
-    __m256i mask;
-
-    avx2_mask_helper32() = default;
-    avx2_mask_helper32(int m)
-    {
-        mask = converter(m);
-    }
-    avx2_mask_helper32(__m256i m)
-    {
-        mask = m;
-    }
-    operator __m256i()
-    {
-        return mask;
-    }
-    operator int32_t()
-    {
-        return converter(mask);
-    }
-    __m256i operator=(int m)
-    {
-        mask = converter(m);
-        return mask;
-    }
-
-private:
-    __m256i converter(int m)
-    {
-        return _mm256_loadu_si256(
-                (const __m256i *)avx2_mask_helper_lut32[m].data());
-    }
+X86_SIMD_SORT_INLINE
+__m256i convert_int_to_avx2_mask(int32_t m)
+{
+    return _mm256_loadu_si256(
+            (const __m256i *)avx2_mask_helper_lut32[m].data());
+}
 
-    int32_t converter(__m256i m)
-    {
-        return _mm256_movemask_ps(_mm256_castsi256_ps(m));
-    }
-};
-static __m256i operator~(const avx2_mask_helper32 x)
+X86_SIMD_SORT_INLINE
+int32_t convert_avx2_mask_to_int(__m256i m)
 {
-    return ~x.mask;
+    return _mm256_movemask_ps(_mm256_castsi256_ps(m));
 }
 
 // Emulators for intrinsics missing from AVX2 compared to AVX512
@@ -98,7 +69,7 @@ T avx2_emu_reduce_max32(typename avx2_vector<T>::reg_t x)
 {
     using vtype = avx2_vector<T>;
     using reg_t = typename vtype::reg_t;
-    
+
     reg_t inter1 = vtype::max(x, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(x));
     reg_t inter2 = vtype::max(inter1, vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(inter1));
     T can1 = vtype::template extract<0>(inter2);
@@ -111,7 +82,7 @@ T avx2_emu_reduce_min32(typename avx2_vector<T>::reg_t x)
 {
     using vtype = avx2_vector<T>;
     using reg_t = typename vtype::reg_t;
-    
+
     reg_t inter1 = vtype::min(x, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(x));
     reg_t inter2 = vtype::min(inter1, vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(inter1));
     T can1 = vtype::template extract<0>(inter2);
@@ -128,7 +99,7 @@ void avx2_emu_mask_compressstoreu(void *base_addr,
 
     T *leftStore = (T *)base_addr;
 
-    int32_t shortMask = avx2_mask_helper32(k);
+    int32_t shortMask = convert_avx2_mask_to_int(k);
     const __m256i &perm = _mm256_loadu_si256(
             (const __m256i *)avx2_compressstore_lut32_perm[shortMask].data());
     const __m256i &left = _mm256_loadu_si256(
@@ -150,7 +121,7 @@ int avx2_double_compressstore32(void *left_addr,
     T *leftStore = (T *)left_addr;
     T *rightStore = (T *)right_addr;
 
-    int32_t shortMask = avx2_mask_helper32(k);
+    int32_t shortMask = convert_avx2_mask_to_int(k);
     const __m256i &perm = _mm256_loadu_si256(
             (const __m256i *)avx2_compressstore_lut32_perm[shortMask].data());
     const __m256i &left = _mm256_loadu_si256(
@@ -186,4 +157,4 @@ typename avx2_vector<T>::reg_t avx2_emu_min(typename avx2_vector<T>::reg_t x,
                                                 _mm256_castsi256_pd(nlt)));
 }
 
-#endif
+#endif
diff --git a/src/avx512-16bit-qsort.hpp b/src/avx512-16bit-qsort.hpp
@@ -80,6 +80,14 @@ struct zmm_vector<float16> {
                           exp_eq, mant_x, mant_y, _MM_CMPINT_NLT);
         return _kxor_mask32(mask_ge, neg);
     }
+    static opmask_t get_partial_loadmask(uint64_t num_to_read)
+    {
+        return ((0x1ull << num_to_read) - 0x1ull);
+    }
+    static int32_t convert_mask_to_int(opmask_t mask)
+    {
+        return mask;
+    }
     static reg_t loadu(void const *mem)
     {
         return _mm512_loadu_si512(mem);
@@ -227,6 +235,10 @@ struct zmm_vector<int16_t> {
     {
         return _mm512_cmp_epi16_mask(x, y, _MM_CMPINT_NLT);
     }
+    static opmask_t get_partial_loadmask(uint64_t num_to_read)
+    {
+        return ((0x1ull << num_to_read) - 0x1ull);
+    }
     static reg_t loadu(void const *mem)
     {
         return _mm512_loadu_si512(mem);
@@ -357,6 +369,10 @@ struct zmm_vector<uint16_t> {
     {
         return _mm512_cmp_epu16_mask(x, y, _MM_CMPINT_NLT);
     }
+    static opmask_t get_partial_loadmask(uint64_t num_to_read)
+    {
+        return ((0x1ull << num_to_read) - 0x1ull);
+    }
     static reg_t loadu(void const *mem)
     {
         return _mm512_loadu_si512(mem);
diff --git a/src/avx512-32bit-qsort.hpp b/src/avx512-32bit-qsort.hpp
@@ -65,6 +65,10 @@ struct zmm_vector<int32_t> {
     {
         return _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_NLT);
     }
+    static opmask_t get_partial_loadmask(uint64_t num_to_read)
+    {
+        return ((0x1ull << num_to_read) - 0x1ull);
+    }
     template <int scale>
     static halfreg_t i64gather(__m512i index, void const *base)
     {
@@ -209,6 +213,10 @@ struct zmm_vector<uint32_t> {
     {
         return _mm512_cmp_epu32_mask(x, y, _MM_CMPINT_NLT);
     }
+    static opmask_t get_partial_loadmask(uint64_t num_to_read)
+    {
+        return ((0x1ull << num_to_read) - 0x1ull);
+    }
     static reg_t loadu(void const *mem)
     {
         return _mm512_loadu_si512(mem);
@@ -333,9 +341,13 @@ struct zmm_vector<float> {
     {
         return _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ);
     }
-    static opmask_t get_partial_loadmask(int size)
+    static opmask_t get_partial_loadmask(uint64_t num_to_read)
+    {
+        return ((0x1ull << num_to_read) - 0x1ull);
+    }
+    static int32_t convert_mask_to_int(opmask_t mask)
     {
-        return (0x0001 << size) - 0x0001;
+        return mask;
     }
     template <int type>
     static opmask_t fpclass(reg_t x)
diff --git a/src/avx512-64bit-common.h b/src/avx512-64bit-common.h
@@ -81,9 +81,9 @@ struct ymm_vector<float> {
     {
         return _mm256_cmp_ps_mask(x, y, _CMP_EQ_OQ);
     }
-    static opmask_t get_partial_loadmask(int size)
+    static opmask_t get_partial_loadmask(uint64_t num_to_read)
     {
-        return (0x01 << size) - 0x01;
+        return ((0x1ull << num_to_read) - 0x1ull);
     }
     template <int type>
     static opmask_t fpclass(reg_t x)
@@ -244,6 +244,10 @@ struct ymm_vector<uint32_t> {
     {
         return _mm256_cmp_epu32_mask(x, y, _MM_CMPINT_NLT);
     }
+    static opmask_t get_partial_loadmask(uint64_t num_to_read)
+    {
+        return ((0x1ull << num_to_read) - 0x1ull);
+    }
     static opmask_t eq(reg_t x, reg_t y)
     {
         return _mm256_cmp_epu32_mask(x, y, _MM_CMPINT_EQ);
@@ -396,6 +400,10 @@ struct ymm_vector<int32_t> {
     {
         return _mm256_cmp_epi32_mask(x, y, _MM_CMPINT_NLT);
     }
+    static opmask_t get_partial_loadmask(uint64_t num_to_read)
+    {
+        return ((0x1ull << num_to_read) - 0x1ull);
+    }
     static opmask_t eq(reg_t x, reg_t y)
     {
         return _mm256_cmp_epi32_mask(x, y, _MM_CMPINT_EQ);
@@ -557,6 +565,10 @@ struct zmm_vector<int64_t> {
     {
         return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_NLT);
     }
+    static opmask_t get_partial_loadmask(uint64_t num_to_read)
+    {
+        return ((0x1ull << num_to_read) - 0x1ull);
+    }
     static opmask_t eq(reg_t x, reg_t y)
     {
         return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_EQ);
@@ -745,6 +757,10 @@ struct zmm_vector<uint64_t> {
     {
         return _mm512_cmp_epu64_mask(x, y, _MM_CMPINT_NLT);
     }
+    static opmask_t get_partial_loadmask(uint64_t num_to_read)
+    {
+        return ((0x1ull << num_to_read) - 0x1ull);
+    }
     static opmask_t eq(reg_t x, reg_t y)
     {
         return _mm512_cmp_epu64_mask(x, y, _MM_CMPINT_EQ);
@@ -894,9 +910,13 @@ struct zmm_vector<double> {
     {
         return _mm512_cmp_pd_mask(x, y, _CMP_EQ_OQ);
     }
-    static opmask_t get_partial_loadmask(int size)
+    static opmask_t get_partial_loadmask(uint64_t num_to_read)
+    {
+        return ((0x1ull << num_to_read) - 0x1ull);
+    }
+    static int32_t convert_mask_to_int(opmask_t mask)
     {
-        return (0x01 << size) - 0x01;
+        return mask;
     }
     template <int type>
     static opmask_t fpclass(reg_t x)
diff --git a/src/avx512fp16-16bit-qsort.hpp b/src/avx512fp16-16bit-qsort.hpp
@@ -54,9 +54,13 @@ struct zmm_vector<_Float16> {
     {
         return _mm512_cmp_ph_mask(x, y, _CMP_GE_OQ);
     }
-    static opmask_t get_partial_loadmask(int size)
+    static opmask_t get_partial_loadmask(uint64_t num_to_read)
     {
-        return (0x00000001 << size) - 0x00000001;
+        return ((0x1ull << num_to_read) - 0x1ull);
+    }
+    static int32_t convert_mask_to_int(opmask_t mask)
+    {
+        return mask;
     }
     template <int type>
     static opmask_t fpclass(reg_t x)
diff --git a/src/xss-common-qsort.h b/src/xss-common-qsort.h
@@ -65,7 +65,7 @@ X86_SIMD_SORT_INLINE arrsize_t replace_nan_with_inf(T *arr, arrsize_t size)
             in = vtype::loadu(arr + ii);
         }
         opmask_t nanmask = vtype::template fpclass<0x01 | 0x80>(in);
-        nan_count += _mm_popcnt_u32((int32_t)nanmask);
+        nan_count += _mm_popcnt_u32(vtype::convert_mask_to_int(nanmask));
         vtype::mask_storeu(arr + ii, nanmask, vtype::zmm_max());
     }
     return nan_count;
@@ -174,7 +174,7 @@ int avx512_double_compressstore(type_t *left_addr,
     vtype::mask_compressstoreu(left_addr, vtype::knot_opmask(k), reg);
     vtype::mask_compressstoreu(
             right_addr + vtype::numlanes - amount_ge_pivot, k, reg);
-    
+
     return amount_ge_pivot;
 }
 
@@ -188,7 +188,7 @@ X86_SIMD_SORT_INLINE arrsize_t partition_vec(type_t *l_store,
                                              reg_t &biggest_vec)
 {
     typename vtype::opmask_t ge_mask = vtype::ge(curr_vec, pivot_vec);
-    
+
     int amount_ge_pivot = vtype::double_compressstore(l_store, r_store, ge_mask, curr_vec);
 
     smallest_vec = vtype::min(curr_vec, smallest_vec);
diff --git a/src/xss-network-qsort.hpp b/src/xss-network-qsort.hpp

Original file line number	Diff line number	Diff line change
`@@ -80,6 +80,14 @@ struct zmm_vector<float16> {`
`80`	`80`	`exp_eq, mant_x, mant_y, _MM_CMPINT_NLT);`
`81`	`81`	`return _kxor_mask32(mask_ge, neg);`
`82`	`82`	`}`
	`83`	`+ static opmask_t get_partial_loadmask(uint64_t num_to_read)`
	`84`	`+ {`
	`85`	`+ return ((0x1ull << num_to_read) - 0x1ull);`
	`86`	`+ }`
	`87`	`+ static int32_t convert_mask_to_int(opmask_t mask)`
	`88`	`+ {`
	`89`	`+ return mask;`
	`90`	`+ }`
`83`	`91`	`static reg_t loadu(void const *mem)`
`84`	`92`	`{`
`85`	`93`	`return _mm512_loadu_si512(mem);`
`@@ -227,6 +235,10 @@ struct zmm_vector<int16_t> {`
`227`	`235`	`{`
`228`	`236`	`return _mm512_cmp_epi16_mask(x, y, _MM_CMPINT_NLT);`
`229`	`237`	`}`
	`238`	`+ static opmask_t get_partial_loadmask(uint64_t num_to_read)`
	`239`	`+ {`
	`240`	`+ return ((0x1ull << num_to_read) - 0x1ull);`
	`241`	`+ }`
`230`	`242`	`static reg_t loadu(void const *mem)`
`231`	`243`	`{`
`232`	`244`	`return _mm512_loadu_si512(mem);`
`@@ -357,6 +369,10 @@ struct zmm_vector<uint16_t> {`
`357`	`369`	`{`
`358`	`370`	`return _mm512_cmp_epu16_mask(x, y, _MM_CMPINT_NLT);`
`359`	`371`	`}`
	`372`	`+ static opmask_t get_partial_loadmask(uint64_t num_to_read)`
	`373`	`+ {`
	`374`	`+ return ((0x1ull << num_to_read) - 0x1ull);`
	`375`	`+ }`
`360`	`376`	`static reg_t loadu(void const *mem)`
`361`	`377`	`{`
`362`	`378`	`return _mm512_loadu_si512(mem);`
Original file line number	Diff line number	Diff line change
`@@ -65,6 +65,10 @@ struct zmm_vector<int32_t> {`
`65`	`65`	`{`
`66`	`66`	`return _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_NLT);`
`67`	`67`	`}`
	`68`	`+ static opmask_t get_partial_loadmask(uint64_t num_to_read)`
	`69`	`+ {`
	`70`	`+ return ((0x1ull << num_to_read) - 0x1ull);`
	`71`	`+ }`
`68`	`72`	`template <int scale>`
`69`	`73`	`static halfreg_t i64gather(__m512i index, void const *base)`
`70`	`74`	`{`
`@@ -209,6 +213,10 @@ struct zmm_vector<uint32_t> {`
`209`	`213`	`{`
`210`	`214`	`return _mm512_cmp_epu32_mask(x, y, _MM_CMPINT_NLT);`
`211`	`215`	`}`
	`216`	`+ static opmask_t get_partial_loadmask(uint64_t num_to_read)`
	`217`	`+ {`
	`218`	`+ return ((0x1ull << num_to_read) - 0x1ull);`
	`219`	`+ }`
`212`	`220`	`static reg_t loadu(void const *mem)`
`213`	`221`	`{`
`214`	`222`	`return _mm512_loadu_si512(mem);`
`@@ -333,9 +341,13 @@ struct zmm_vector<float> {`
`333`	`341`	`{`
`334`	`342`	`return _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ);`
`335`	`343`	`}`
`336`		`- static opmask_t get_partial_loadmask(int size)`
	`344`	`+ static opmask_t get_partial_loadmask(uint64_t num_to_read)`
	`345`	`+ {`
	`346`	`+ return ((0x1ull << num_to_read) - 0x1ull);`
	`347`	`+ }`
	`348`	`+ static int32_t convert_mask_to_int(opmask_t mask)`
`337`	`349`	`{`
`338`		`- return (0x0001 << size) - 0x0001;`
	`350`	`+ return mask;`
`339`	`351`	`}`
`340`	`352`	`template <int type>`
`341`	`353`	`static opmask_t fpclass(reg_t x)`
Original file line number	Diff line number	Diff line change
`@@ -81,9 +81,9 @@ struct ymm_vector<float> {`
`81`	`81`	`{`
`82`	`82`	`return _mm256_cmp_ps_mask(x, y, _CMP_EQ_OQ);`
`83`	`83`	`}`
`84`		`- static opmask_t get_partial_loadmask(int size)`
	`84`	`+ static opmask_t get_partial_loadmask(uint64_t num_to_read)`
`85`	`85`	`{`
`86`		`- return (0x01 << size) - 0x01;`
	`86`	`+ return ((0x1ull << num_to_read) - 0x1ull);`
`87`	`87`	`}`
`88`	`88`	`template <int type>`
`89`	`89`	`static opmask_t fpclass(reg_t x)`
`@@ -244,6 +244,10 @@ struct ymm_vector<uint32_t> {`
`244`	`244`	`{`
`245`	`245`	`return _mm256_cmp_epu32_mask(x, y, _MM_CMPINT_NLT);`
`246`	`246`	`}`
	`247`	`+ static opmask_t get_partial_loadmask(uint64_t num_to_read)`
	`248`	`+ {`
	`249`	`+ return ((0x1ull << num_to_read) - 0x1ull);`
	`250`	`+ }`
`247`	`251`	`static opmask_t eq(reg_t x, reg_t y)`
`248`	`252`	`{`
`249`	`253`	`return _mm256_cmp_epu32_mask(x, y, _MM_CMPINT_EQ);`
`@@ -396,6 +400,10 @@ struct ymm_vector<int32_t> {`
`396`	`400`	`{`
`397`	`401`	`return _mm256_cmp_epi32_mask(x, y, _MM_CMPINT_NLT);`
`398`	`402`	`}`
	`403`	`+ static opmask_t get_partial_loadmask(uint64_t num_to_read)`
	`404`	`+ {`
	`405`	`+ return ((0x1ull << num_to_read) - 0x1ull);`
	`406`	`+ }`
`399`	`407`	`static opmask_t eq(reg_t x, reg_t y)`
`400`	`408`	`{`
`401`	`409`	`return _mm256_cmp_epi32_mask(x, y, _MM_CMPINT_EQ);`
`@@ -557,6 +565,10 @@ struct zmm_vector<int64_t> {`
`557`	`565`	`{`
`558`	`566`	`return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_NLT);`
`559`	`567`	`}`
	`568`	`+ static opmask_t get_partial_loadmask(uint64_t num_to_read)`
	`569`	`+ {`
	`570`	`+ return ((0x1ull << num_to_read) - 0x1ull);`
	`571`	`+ }`
`560`	`572`	`static opmask_t eq(reg_t x, reg_t y)`
`561`	`573`	`{`
`562`	`574`	`return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_EQ);`
`@@ -745,6 +757,10 @@ struct zmm_vector<uint64_t> {`
`745`	`757`	`{`
`746`	`758`	`return _mm512_cmp_epu64_mask(x, y, _MM_CMPINT_NLT);`
`747`	`759`	`}`
	`760`	`+ static opmask_t get_partial_loadmask(uint64_t num_to_read)`
	`761`	`+ {`
	`762`	`+ return ((0x1ull << num_to_read) - 0x1ull);`
	`763`	`+ }`
`748`	`764`	`static opmask_t eq(reg_t x, reg_t y)`
`749`	`765`	`{`
`750`	`766`	`return _mm512_cmp_epu64_mask(x, y, _MM_CMPINT_EQ);`
`@@ -894,9 +910,13 @@ struct zmm_vector<double> {`
`894`	`910`	`{`
`895`	`911`	`return _mm512_cmp_pd_mask(x, y, _CMP_EQ_OQ);`
`896`	`912`	`}`
`897`		`- static opmask_t get_partial_loadmask(int size)`
	`913`	`+ static opmask_t get_partial_loadmask(uint64_t num_to_read)`
	`914`	`+ {`
	`915`	`+ return ((0x1ull << num_to_read) - 0x1ull);`
	`916`	`+ }`
	`917`	`+ static int32_t convert_mask_to_int(opmask_t mask)`
`898`	`918`	`{`
`899`		`- return (0x01 << size) - 0x01;`
	`919`	`+ return mask;`
`900`	`920`	`}`
`901`	`921`	`template <int type>`
`902`	`922`	`static opmask_t fpclass(reg_t x)`
Original file line number	Diff line number	Diff line change
`@@ -54,9 +54,13 @@ struct zmm_vector<_Float16> {`
`54`	`54`	`{`
`55`	`55`	`return _mm512_cmp_ph_mask(x, y, _CMP_GE_OQ);`
`56`	`56`	`}`
`57`		`- static opmask_t get_partial_loadmask(int size)`
	`57`	`+ static opmask_t get_partial_loadmask(uint64_t num_to_read)`
`58`	`58`	`{`
`59`		`- return (0x00000001 << size) - 0x00000001;`
	`59`	`+ return ((0x1ull << num_to_read) - 0x1ull);`
	`60`	`+ }`
	`61`	`+ static int32_t convert_mask_to_int(opmask_t mask)`
	`62`	`+ {`
	`63`	`+ return mask;`
`60`	`64`	`}`
`61`	`65`	`template <int type>`
`62`	`66`	`static opmask_t fpclass(reg_t x)`
Original file line number	Diff line number	Diff line change
`@@ -65,7 +65,7 @@ X86_SIMD_SORT_INLINE arrsize_t replace_nan_with_inf(T *arr, arrsize_t size)`
`65`	`65`	`in = vtype::loadu(arr + ii);`
`66`	`66`	`}`
`67`	`67`	`opmask_t nanmask = vtype::template fpclass<0x01 | 0x80>(in);`
`68`		`- nan_count += _mm_popcnt_u32((int32_t)nanmask);`
	`68`	`+ nan_count += _mm_popcnt_u32(vtype::convert_mask_to_int(nanmask));`
`69`	`69`	`vtype::mask_storeu(arr + ii, nanmask, vtype::zmm_max());`
`70`	`70`	`}`
`71`	`71`	`return nan_count;`
`@@ -174,7 +174,7 @@ int avx512_double_compressstore(type_t *left_addr,`
`174`	`174`	`vtype::mask_compressstoreu(left_addr, vtype::knot_opmask(k), reg);`
`175`	`175`	`vtype::mask_compressstoreu(`
`176`	`176`	`right_addr + vtype::numlanes - amount_ge_pivot, k, reg);`
`177`		`-`
	`177`	`+`
`178`	`178`	`return amount_ge_pivot;`
`179`	`179`	`}`
`180`	`180`
`@@ -188,7 +188,7 @@ X86_SIMD_SORT_INLINE arrsize_t partition_vec(type_t *l_store,`
`188`	`188`	`reg_t &biggest_vec)`
`189`	`189`	`{`
`190`	`190`	`typename vtype::opmask_t ge_mask = vtype::ge(curr_vec, pivot_vec);`
`191`		`-`
	`191`	`+`
`192`	`192`	`int amount_ge_pivot = vtype::double_compressstore(l_store, r_store, ge_mask, curr_vec);`
`193`	`193`
`194`	`194`	`smallest_vec = vtype::min(curr_vec, smallest_vec);`