Fix bug in avx512fp16 nan processing

Raghuveer Devulapalli · Raghuveer Devulapalli · commit 3ddc914c9cc0 · 2023-08-07T14:11:39.000-07:00
diff --git a/src/avx512-32bit-qsort.hpp b/src/avx512-32bit-qsort.hpp
@@ -256,6 +256,10 @@ struct zmm_vector<float> {
     {
         return _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ);
     }
+    static opmask_t get_partial_loadmask(int size) // returns a mask with only the low `size` bits set
+    {
+        return (0x0001 << size) - 0x0001; // e.g. size == 3 -> 0b0111; size == 0 -> empty mask
+    }
     template <int type>
     static opmask_t fpclass(zmm_t x)
     {
diff --git a/src/avx512-64bit-argsort.hpp b/src/avx512-64bit-argsort.hpp
@@ -344,33 +344,6 @@ static void argselect_64bit_(type_t *arr,
                 arr, arg, pos, pivot_index, right, max_iters - 1);
 }
 
-template <typename vtype, typename type_t>
-bool has_nan(type_t *arr, int64_t arrsize)
-{
-    using opmask_t = typename vtype::opmask_t;
-    using zmm_t = typename vtype::zmm_t;
-    bool found_nan = false;
-    opmask_t loadmask = 0xFF;
-    zmm_t in;
-    while (arrsize > 0) {
-        if (arrsize < vtype::numlanes) {
-            loadmask = (0x01 << arrsize) - 0x01;
-            in = vtype::maskz_loadu(loadmask, arr);
-        }
-        else {
-            in = vtype::loadu(arr);
-        }
-        opmask_t nanmask = vtype::template fpclass<0x01 | 0x80>(in);
-        arr += vtype::numlanes;
-        arrsize -= vtype::numlanes;
-        if (nanmask != 0x00) {
-            found_nan = true;
-            break;
-        }
-    }
-    return found_nan;
-}
-
 /* argsort methods for 32-bit and 64-bit dtypes */
 template <typename T>
 void avx512_argsort(T *arr, int64_t *arg, int64_t arrsize)
diff --git a/src/avx512-64bit-common.h b/src/avx512-64bit-common.h
@@ -71,6 +71,10 @@ struct ymm_vector<float> {
     {
         return _mm256_cmp_ps_mask(x, y, _CMP_EQ_OQ);
     }
+    static opmask_t get_partial_loadmask(int size) // returns a mask with only the low `size` bits set
+    {
+        return (0x01 << size) - 0x01; // e.g. size == 3 -> 0b0111; size == 0 -> empty mask
+    }
     template <int type>
     static opmask_t fpclass(zmm_t x)
     {
@@ -703,6 +707,10 @@ struct zmm_vector<double> {
     {
         return _mm512_cmp_pd_mask(x, y, _CMP_EQ_OQ);
     }
+    static opmask_t get_partial_loadmask(int size) // returns a mask with only the low `size` bits set
+    {
+        return (0x01 << size) - 0x01; // e.g. size == 3 -> 0b0111; size == 0 -> empty mask
+    }
     template <int type>
     static opmask_t fpclass(zmm_t x)
     {
diff --git a/src/avx512-common-qsort.h b/src/avx512-common-qsort.h
@@ -100,18 +100,17 @@ bool is_a_nan(T elem)
     return std::isnan(elem);
 }
 
-template <typename vtype, typename type_t>
-int64_t replace_nan_with_inf(type_t *arr, int64_t arrsize)
+template <typename vtype, typename T>
+int64_t replace_nan_with_inf(T *arr, int64_t arrsize)
 {
     int64_t nan_count = 0;
     using opmask_t = typename vtype::opmask_t;
     using zmm_t = typename vtype::zmm_t;
-    bool found_nan = false;
-    opmask_t loadmask = 0xFF;
+    opmask_t loadmask;
     zmm_t in;
     while (arrsize > 0) {
         if (arrsize < vtype::numlanes) {
-            loadmask = (0x01 << arrsize) - 0x01;
+            loadmask = vtype::get_partial_loadmask(arrsize);
             in = vtype::maskz_loadu(loadmask, arr);
         }
         else {
@@ -126,6 +125,33 @@ int64_t replace_nan_with_inf(type_t *arr, int64_t arrsize)
     return nan_count;
 }
 
+template <typename vtype, typename type_t>
+bool has_nan(type_t *arr, int64_t arrsize)
+{
+    // Returns true as soon as one vector-sized chunk of arr[0 .. arrsize) has a lane
+    // flagged by fpclass<0x01 | 0x80> (QNaN | SNaN in Intel's fpclass immediate encoding).
+    using opmask_t = typename vtype::opmask_t;
+    using zmm_t = typename vtype::zmm_t;
+    type_t *cursor = arr;
+    int64_t remaining = arrsize;
+    while (remaining > 0) {
+        zmm_t chunk;
+        if (remaining >= vtype::numlanes) {
+            chunk = vtype::loadu(cursor);
+        }
+        else {
+            // Tail shorter than one full vector: load only the first `remaining` lanes.
+            const opmask_t tailmask = vtype::get_partial_loadmask(remaining);
+            chunk = vtype::maskz_loadu(tailmask, cursor);
+        }
+        const opmask_t hits = vtype::template fpclass<0x01 | 0x80>(chunk);
+        if (hits != 0x00) { return true; }
+        cursor += vtype::numlanes;
+        remaining -= vtype::numlanes;
+    }
+    return false;
+}
+
 template<typename type_t>
 void replace_inf_with_nan(type_t *arr, int64_t arrsize, int64_t nan_count)
 {
diff --git a/src/avx512fp16-16bit-qsort.hpp b/src/avx512fp16-16bit-qsort.hpp
@@ -46,11 +46,19 @@ struct zmm_vector<_Float16> {
     {
         return _knot_mask32(x);
     }
-
     static opmask_t ge(zmm_t x, zmm_t y)
     {
         return _mm512_cmp_ph_mask(x, y, _CMP_GE_OQ);
     }
+    static opmask_t get_partial_loadmask(int size) // returns a mask with only the low `size` bits set
+    {
+        return (0x00000001u << size) - 0x00000001u; // unsigned on purpose: with int, size == 31 makes (1 << 31) - 1 overflow (UB)
+    }
+    template <int type> // `type` is forwarded as the intrinsic's category-selection argument
+    static opmask_t fpclass(zmm_t x)
+    {
+        return _mm512_fpclass_ph_mask(x, type); // mask of the lanes of x matching the selected categories
+    }
     static zmm_t loadu(void const *mem)
     {
         return _mm512_loadu_ph(mem);
@@ -65,6 +73,11 @@ struct zmm_vector<_Float16> {
         // AVX512_VBMI2
         return _mm512_mask_compressstoreu_epi16(mem, mask, temp);
     }
+    static zmm_t maskz_loadu(opmask_t mask, void const *mem) // masked load from mem, returned as an fp16 vector
+    {
+        return _mm512_castsi512_ph( // cast the integer vector to the fp16 vector type
+                _mm512_maskz_loadu_epi16(mask, mem)); // load the 16-bit lanes selected by mask
+    }
     static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem)
     {
         // AVX512BW
@@ -140,4 +153,21 @@ void qsort_<zmm_vector<_Float16>>(_Float16* arr, int64_t left, int64_t right, in
 {
     qsort_16bit_<zmm_vector<_Float16>>(arr, left, right, maxiters);
 }
+
+template<> // full specialization defined in a header: `inline` avoids duplicate definitions across translation units
+inline void replace_inf_with_nan(_Float16 *arr, int64_t arrsize, int64_t nan_count)
+{
+    memset(arr + arrsize - nan_count, 0xFF, nan_count * sizeof(_Float16)); // last nan_count slots get the all-ones (0xFFFF) bit pattern, a NaN encoding
+}
+
+template<> // full specialization defined in a header: `inline` avoids duplicate definitions across translation units
+inline void avx512_qsort(_Float16 *arr, int64_t arrsize)
+{
+    if (arrsize > 1) { // arrays of 0 or 1 elements are left untouched
+        int64_t nan_count = replace_nan_with_inf<zmm_vector<_Float16>, _Float16>(arr, arrsize); // presumably swaps NaNs for inf so they sort to the end - confirm against its body
+        qsort_16bit_<zmm_vector<_Float16>, _Float16>(
+                arr, 0, arrsize - 1, 2 * static_cast<int64_t>(log2(arrsize))); // last argument is the iteration budget: 2 * floor(log2(arrsize))
+        replace_inf_with_nan(arr, arrsize, nan_count); // rewrite the last nan_count slots as NaN
+    }
+}
 #endif // AVX512FP16_QSORT_16BIT

Original file line number	Diff line number	Diff line change
`@@ -256,6 +256,10 @@ struct zmm_vector<float> {`
`256`	`256`	`{`
`257`	`257`	`return _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ);`
`258`	`258`	`}`
	`259`	`+ static opmask_t get_partial_loadmask(int size)`
	`260`	`+ {`
	`261`	`+ return (0x0001 << size) - 0x0001;`
	`262`	`+ }`
`259`	`263`	`template <int type>`
`260`	`264`	`static opmask_t fpclass(zmm_t x)`
`261`	`265`	`{`
Original file line number	Diff line number	Diff line change
`@@ -71,6 +71,10 @@ struct ymm_vector<float> {`
`71`	`71`	`{`
`72`	`72`	`return _mm256_cmp_ps_mask(x, y, _CMP_EQ_OQ);`
`73`	`73`	`}`
	`74`	`+ static opmask_t get_partial_loadmask(int size)`
	`75`	`+ {`
	`76`	`+ return (0x01 << size) - 0x01;`
	`77`	`+ }`
`74`	`78`	`template <int type>`
`75`	`79`	`static opmask_t fpclass(zmm_t x)`
`76`	`80`	`{`
`@@ -703,6 +707,10 @@ struct zmm_vector<double> {`
`703`	`707`	`{`
`704`	`708`	`return _mm512_cmp_pd_mask(x, y, _CMP_EQ_OQ);`
`705`	`709`	`}`
	`710`	`+ static opmask_t get_partial_loadmask(int size)`
	`711`	`+ {`
	`712`	`+ return (0x01 << size) - 0x01;`
	`713`	`+ }`
`706`	`714`	`template <int type>`
`707`	`715`	`static opmask_t fpclass(zmm_t x)`
`708`	`716`	`{`