Merge pull request #33 from r-devulap/256-network

Raghuveer Devulapalli · web-flow · commit ac6c10cd71a8 · 2023-04-25T20:47:18.000-07:00
Improve qsort
diff --git a/src/avx512-16bit-common.h b/src/avx512-16bit-common.h
@@ -290,10 +290,11 @@ qsort_16bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters)
 }
 
 template <typename vtype, typename type_t>
-static void
-qselect_16bit_(type_t *arr, int64_t pos,
-               int64_t left, int64_t right,
-               int64_t max_iters)
+static void qselect_16bit_(type_t *arr,
+                           int64_t pos,
+                           int64_t left,
+                           int64_t right,
+                           int64_t max_iters)
 {
     /*
      * Resort to std::sort if quicksort isnt making any progress
diff --git a/src/avx512-32bit-qsort.hpp b/src/avx512-32bit-qsort.hpp
@@ -648,7 +648,7 @@ qsort_32bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters)
     type_t pivot = get_pivot_32bit<vtype>(arr, left, right);
     type_t smallest = vtype::type_max();
     type_t biggest = vtype::type_min();
-    int64_t pivot_index = partition_avx512<vtype>(
+    int64_t pivot_index = partition_avx512_unrolled<vtype, 2>(
             arr, left, right + 1, pivot, &smallest, &biggest);
     if (pivot != smallest)
         qsort_32bit_<vtype>(arr, left, pivot_index - 1, max_iters - 1);
@@ -657,10 +657,11 @@ qsort_32bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters)
 }
 
 template <typename vtype, typename type_t>
-static void
-qselect_32bit_(type_t *arr, int64_t pos,
-               int64_t left, int64_t right,
-               int64_t max_iters)
+static void qselect_32bit_(type_t *arr,
+                           int64_t pos,
+                           int64_t left,
+                           int64_t right,
+                           int64_t max_iters)
 {
     /*
      * Resort to std::sort if quicksort isnt making any progress
@@ -680,7 +681,7 @@ qselect_32bit_(type_t *arr, int64_t pos,
     type_t pivot = get_pivot_32bit<vtype>(arr, left, right);
     type_t smallest = vtype::type_max();
     type_t biggest = vtype::type_min();
-    int64_t pivot_index = partition_avx512<vtype>(
+    int64_t pivot_index = partition_avx512_unrolled<vtype, 2>(
             arr, left, right + 1, pivot, &smallest, &biggest);
     if ((pivot != smallest) && (pos < pivot_index))
         qselect_32bit_<vtype>(arr, pos, left, pivot_index - 1, max_iters - 1);
diff --git a/src/avx512-64bit-qsort.hpp b/src/avx512-64bit-qsort.hpp
diff --git a/src/avx512-common-qsort.h b/src/avx512-common-qsort.h
@@ -95,7 +95,8 @@ void avx512_qselect(T *arr, int64_t k, int64_t arrsize);
 void avx512_qselect_fp16(uint16_t *arr, int64_t k, int64_t arrsize);
 
 template <typename T>
-inline void avx512_partial_qsort(T *arr, int64_t k, int64_t arrsize) {
+inline void avx512_partial_qsort(T *arr, int64_t k, int64_t arrsize)
+{
     avx512_qselect<T>(arr, k - 1, arrsize);
     avx512_qsort<T>(arr, k - 1);
 }
@@ -259,4 +260,123 @@ static inline int64_t partition_avx512(type_t *arr,
     *biggest = vtype::reducemax(max_vec);
     return l_store;
 }
+
+template <typename vtype,
+          int num_unroll,
+          typename type_t = typename vtype::type_t>
+static inline int64_t partition_avx512_unrolled(type_t *arr,
+                                                int64_t left,
+                                                int64_t right,
+                                                type_t pivot,
+                                                type_t *smallest,
+                                                type_t *biggest)
+{ // unrolled variant of partition_avx512: handles num_unroll vectors per step
+    if (right - left <= 2 * num_unroll * vtype::numlanes) {
+        return partition_avx512<vtype>(
+                arr, left, right, pivot, smallest, biggest);
+    } // short ranges are delegated to the non-unrolled partition_avx512
+    /* make the range length divisible by num_unroll * vtype::numlanes by
+     * handling the remainder one scalar element at a time: each step folds
+     * arr[left] into *smallest / *biggest and shrinks the range by one,
+     * either by swapping arr[left] to the right end or by advancing left */
+    for (int32_t i = ((right - left) % (num_unroll * vtype::numlanes)); i > 0;
+         --i) {
+        *smallest = std::min(*smallest, arr[left], comparison_func<vtype>);
+        *biggest = std::max(*biggest, arr[left], comparison_func<vtype>);
+        if (!comparison_func<vtype>(arr[left], pivot)) {
+            std::swap(arr[left], arr[--right]);
+        }
+        else {
+            ++left;
+        }
+    }
+
+    if (left == right)
+        return left; /* defensive only: the size check and the peel above
+                      * leave at least 2 * num_unroll * vtype::numlanes
+                      * elements, so this branch is not expected to run */
+
+    using zmm_t = typename vtype::zmm_t;
+    zmm_t pivot_vec = vtype::set1(pivot);
+    zmm_t min_vec = vtype::set1(*smallest);
+    zmm_t max_vec = vtype::set1(*biggest);
+
+    // At least 2 * num_unroll vectors remain at this point: the first and last
+    // num_unroll vectors are cached here and partitioned after the main loop
+    zmm_t vec_left[num_unroll], vec_right[num_unroll];
+#pragma GCC unroll 8
+    for (int ii = 0; ii < num_unroll; ++ii) {
+        vec_left[ii] = vtype::loadu(arr + left + vtype::numlanes * ii);
+        vec_right[ii] = vtype::loadu(
+                arr + (right - vtype::numlanes * (num_unroll - ii)));
+    }
+    // write cursors: l_store moves rightwards, r_store moves leftwards
+    int64_t r_store = right - vtype::numlanes;
+    int64_t l_store = left;
+    // read cursors: skip the vectors already cached in vec_left / vec_right
+    left += num_unroll * vtype::numlanes;
+    right -= num_unroll * vtype::numlanes;
+    while (right - left != 0) {
+        zmm_t curr_vec[num_unroll];
+        /*
+         * (r_store + vtype::numlanes) - right and left - l_store are the
+         * gaps between the read and write cursors on the right and left
+         * side respectively; the next num_unroll vectors are read from
+         * the right side when its gap is the smaller one,
+         * otherwise from the left side
+         */
+        if ((r_store + vtype::numlanes) - right < left - l_store) {
+            right -= num_unroll * vtype::numlanes;
+#pragma GCC unroll 8
+            for (int ii = 0; ii < num_unroll; ++ii) {
+                curr_vec[ii] = vtype::loadu(arr + right + ii * vtype::numlanes);
+            }
+        }
+        else {
+#pragma GCC unroll 8
+            for (int ii = 0; ii < num_unroll; ++ii) {
+                curr_vec[ii] = vtype::loadu(arr + left + ii * vtype::numlanes);
+            }
+            left += num_unroll * vtype::numlanes;
+        }
+// partition each of the num_unroll loaded vectors and advance both write cursors
+#pragma GCC unroll 8
+        for (int ii = 0; ii < num_unroll; ++ii) {
+            int32_t amount_ge_pivot
+                    = partition_vec<vtype>(arr,
+                                           l_store,
+                                           r_store + vtype::numlanes,
+                                           curr_vec[ii],
+                                           pivot_vec,
+                                           &min_vec,
+                                           &max_vec);
+            l_store += (vtype::numlanes - amount_ge_pivot);
+            r_store -= amount_ge_pivot;
+        }
+    }
+
+/* partition and save the num_unroll vectors cached in vec_left and vec_right */
+#pragma GCC unroll 8
+    for (int ii = 0; ii < num_unroll; ++ii) {
+        int32_t amount_ge_pivot
+                = partition_vec<vtype>(arr,
+                                       l_store,
+                                       r_store + vtype::numlanes,
+                                       vec_left[ii],
+                                       pivot_vec,
+                                       &min_vec,
+                                       &max_vec);
+        l_store += (vtype::numlanes - amount_ge_pivot);
+        r_store -= amount_ge_pivot;
+    }
+#pragma GCC unroll 8
+    for (int ii = 0; ii < num_unroll; ++ii) {
+        int32_t amount_ge_pivot
+                = partition_vec<vtype>(arr,
+                                       l_store,
+                                       r_store + vtype::numlanes,
+                                       vec_right[ii],
+                                       pivot_vec,
+                                       &min_vec,
+                                       &max_vec);
+        l_store += (vtype::numlanes - amount_ge_pivot);
+        r_store -= amount_ge_pivot;
+    }
+    *smallest = vtype::reducemin(min_vec);
+    *biggest = vtype::reducemax(max_vec);
+    return l_store; // NOTE(review): presumably the index splitting the two partitions; confirm against partition_vec
+}
 #endif // AVX512_QSORT_COMMON
diff --git a/tests/test_keyvalue.cpp b/tests/test_keyvalue.cpp
@@ -4,8 +4,8 @@
  * *******************************************/
 
 #include "avx512-64bit-keyvaluesort.hpp"
-#include "rand_array.h"
 #include "cpuinfo.h"
+#include "rand_array.h"
 #include <gtest/gtest.h>
 #include <vector>
 
diff --git a/tests/test_partial_qsort.hpp b/tests/test_partial_qsort.hpp
@@ -30,7 +30,8 @@ TYPED_TEST_P(avx512_partial_sort, test_ranges)
             int k = get_uniform_rand_array<int64_t>(1, arrsize, 1).front();
 
             /* Sort the range and verify all the required elements match the presorted set */
-            avx512_partial_qsort<TypeParam>(psortedarr.data(), k, psortedarr.size());
+            avx512_partial_qsort<TypeParam>(
+                    psortedarr.data(), k, psortedarr.size());
             for (size_t jj = 0; jj < k; jj++) {
                 ASSERT_EQ(sortedarr[jj], psortedarr[jj]);
             }
diff --git a/tests/test_qselect.hpp b/tests/test_qselect.hpp
@@ -5,7 +5,7 @@ class avx512_select : public ::testing::Test {
 };
 TYPED_TEST_SUITE_P(avx512_select);
 
-TYPED_TEST_P(avx512_select, test_arrsizes)
+TYPED_TEST_P(avx512_select, test_random)
 {
     if (cpu_has_avx512bw()) {
         if ((sizeof(TypeParam) == 2) && (!cpu_has_avx512_vbmi2())) {
@@ -26,15 +26,16 @@ TYPED_TEST_P(avx512_select, test_arrsizes)
             std::sort(sortedarr.begin(), sortedarr.end());
             for (size_t k = 0; k < arr.size(); ++k) {
                 psortedarr = arr;
-                avx512_qselect<TypeParam>(psortedarr.data(), k, psortedarr.size());
+                avx512_qselect<TypeParam>(
+                        psortedarr.data(), k, psortedarr.size());
                 /* index k is correct */
                 ASSERT_EQ(sortedarr[k], psortedarr[k]);
                 /* Check left partition */
                 for (size_t jj = 0; jj < k; jj++) {
                     ASSERT_LE(psortedarr[jj], psortedarr[k]);
                 }
                 /* Check right partition */
-                for (size_t jj = k+1; jj < arr.size(); jj++) {
+                for (size_t jj = k + 1; jj < arr.size(); jj++) {
                     ASSERT_GE(psortedarr[jj], psortedarr[k]);
                 }
                 psortedarr.clear();
@@ -48,4 +49,48 @@ TYPED_TEST_P(avx512_select, test_arrsizes)
     }
 }
 
-REGISTER_TYPED_TEST_SUITE_P(avx512_select, test_arrsizes);
+TYPED_TEST_P(avx512_select, test_small_range)
+{
+    if (cpu_has_avx512bw()) {
+        if ((sizeof(TypeParam) == 2) && (!cpu_has_avx512_vbmi2())) {
+            GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2";
+        }
+        std::vector<int64_t> arrsizes;
+        for (int64_t ii = 0; ii < 1024; ++ii) {
+            arrsizes.push_back(ii); // every size from 0 to 1023
+        }
+        std::vector<TypeParam> arr;
+        std::vector<TypeParam> sortedarr;
+        std::vector<TypeParam> psortedarr;
+        for (size_t ii = 0; ii < arrsizes.size(); ++ii) {
+            /* Random array of arrsizes[ii] elements. NOTE(review): the
+             * (20, 1) arguments presumably bound the values to [1, 20],
+             * which would make duplicates frequent; confirm against
+             * get_uniform_rand_array */
+            arr = get_uniform_rand_array<TypeParam>(arrsizes[ii], 20, 1);
+            sortedarr = arr;
+            /* Reference ordering: sort a copy with std::sort */
+            std::sort(sortedarr.begin(), sortedarr.end());
+            for (size_t k = 0; k < arr.size(); ++k) {
+                psortedarr = arr; // select on a fresh copy for every k
+                avx512_qselect<TypeParam>(
+                        psortedarr.data(), k, psortedarr.size());
+                /* index k must hold the same value as in the fully sorted array */
+                ASSERT_EQ(sortedarr[k], psortedarr[k]);
+                /* Check left partition: nothing before k exceeds psortedarr[k] */
+                for (size_t jj = 0; jj < k; jj++) {
+                    ASSERT_LE(psortedarr[jj], psortedarr[k]);
+                }
+                /* Check right partition: nothing after k is below psortedarr[k] */
+                for (size_t jj = k + 1; jj < arr.size(); jj++) {
+                    ASSERT_GE(psortedarr[jj], psortedarr[k]);
+                }
+                psortedarr.clear();
+            }
+            arr.clear();
+            sortedarr.clear();
+        }
+    }
+    else {
+        GTEST_SKIP() << "Skipping this test, it requires avx512bw";
+    }
+}
+
+REGISTER_TYPED_TEST_SUITE_P(avx512_select, test_random, test_small_range);
diff --git a/tests/test_qsort.hpp b/tests/test_qsort.hpp
@@ -10,7 +10,7 @@ class avx512_sort : public ::testing::Test {
 };
 TYPED_TEST_SUITE_P(avx512_sort);
 
-TYPED_TEST_P(avx512_sort, test_arrsizes)
+TYPED_TEST_P(avx512_sort, test_random)
 {
     if (cpu_has_avx512bw()) {
         if ((sizeof(TypeParam) == 2) && (!cpu_has_avx512_vbmi2())) {
@@ -29,7 +29,7 @@ TYPED_TEST_P(avx512_sort, test_arrsizes)
             /* Sort with std::sort for comparison */
             std::sort(sortedarr.begin(), sortedarr.end());
             avx512_qsort<TypeParam>(arr.data(), arr.size());
-            ASSERT_EQ(sortedarr, arr);
+            ASSERT_EQ(sortedarr, arr) << "Array size = " << arrsizes[ii];
             arr.clear();
             sortedarr.clear();
         }
@@ -39,4 +39,97 @@ TYPED_TEST_P(avx512_sort, test_arrsizes)
     }
 }
 
-REGISTER_TYPED_TEST_SUITE_P(avx512_sort, test_arrsizes);
+TYPED_TEST_P(avx512_sort, test_reverse)
+{
+    if (cpu_has_avx512bw()) {
+        if ((sizeof(TypeParam) == 2) && (!cpu_has_avx512_vbmi2())) {
+            GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2";
+        }
+        std::vector<int64_t> arrsizes;
+        for (int64_t ii = 0; ii < 1024; ++ii) {
+            arrsizes.push_back((TypeParam)(ii + 1)); // sizes 1..1024; NOTE(review): the cast through TypeParam looks unnecessary for an int64_t vector
+        }
+        std::vector<TypeParam> arr;
+        std::vector<TypeParam> sortedarr;
+        for (size_t ii = 0; ii < arrsizes.size(); ++ii) {
+            /* reverse-ordered input: arrsizes[ii], arrsizes[ii] - 1, ..., 1 */
+            for (int jj = 0; jj < arrsizes[ii]; ++jj) {
+                arr.push_back((TypeParam)(arrsizes[ii] - jj));
+            }
+            sortedarr = arr;
+            /* Reference result: sort a copy with std::sort */
+            std::sort(sortedarr.begin(), sortedarr.end());
+            avx512_qsort<TypeParam>(arr.data(), arr.size());
+            ASSERT_EQ(sortedarr, arr) << "Array size = " << arrsizes[ii];
+            arr.clear();
+            sortedarr.clear();
+        }
+    }
+    else {
+        GTEST_SKIP() << "Skipping this test, it requires avx512bw";
+    }
+}
+
+TYPED_TEST_P(avx512_sort, test_constant)
+{
+    if (cpu_has_avx512bw()) {
+        if ((sizeof(TypeParam) == 2) && (!cpu_has_avx512_vbmi2())) {
+            GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2";
+        }
+        std::vector<int64_t> arrsizes;
+        for (int64_t ii = 0; ii < 1024; ++ii) {
+            arrsizes.push_back((TypeParam)(ii + 1)); // sizes 1..1024; NOTE(review): the cast through TypeParam looks unnecessary for an int64_t vector
+        }
+        std::vector<TypeParam> arr;
+        std::vector<TypeParam> sortedarr;
+        for (size_t ii = 0; ii < arrsizes.size(); ++ii) {
+            /* constant array: arrsizes[ii] copies of the loop index ii
+             * (converted to TypeParam by push_back) */
+            for (int jj = 0; jj < arrsizes[ii]; ++jj) {
+                arr.push_back(ii);
+            }
+            sortedarr = arr;
+            /* Reference result: sort a copy with std::sort */
+            std::sort(sortedarr.begin(), sortedarr.end());
+            avx512_qsort<TypeParam>(arr.data(), arr.size());
+            ASSERT_EQ(sortedarr, arr) << "Array size = " << arrsizes[ii];
+            arr.clear();
+            sortedarr.clear();
+        }
+    }
+    else {
+        GTEST_SKIP() << "Skipping this test, it requires avx512bw";
+    }
+}
+
+TYPED_TEST_P(avx512_sort, test_small_range)
+{
+    if (cpu_has_avx512bw()) {
+        if ((sizeof(TypeParam) == 2) && (!cpu_has_avx512_vbmi2())) {
+            GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2";
+        }
+        std::vector<int64_t> arrsizes;
+        for (int64_t ii = 0; ii < 1024; ++ii) {
+            arrsizes.push_back((TypeParam)(ii + 1)); // sizes 1..1024; NOTE(review): the cast through TypeParam looks unnecessary for an int64_t vector
+        }
+        std::vector<TypeParam> arr;
+        std::vector<TypeParam> sortedarr;
+        for (size_t ii = 0; ii < arrsizes.size(); ++ii) {
+            arr = get_uniform_rand_array<TypeParam>(arrsizes[ii], 20, 1); // NOTE(review): (20, 1) presumably limits values to [1, 20]; confirm against get_uniform_rand_array
+            sortedarr = arr;
+            /* Reference result: sort a copy with std::sort */
+            std::sort(sortedarr.begin(), sortedarr.end());
+            avx512_qsort<TypeParam>(arr.data(), arr.size());
+            ASSERT_EQ(sortedarr, arr) << "Array size = " << arrsizes[ii];
+            arr.clear();
+            sortedarr.clear();
+        }
+    }
+    else {
+        GTEST_SKIP() << "Skipping this test, it requires avx512bw";
+    }
+}
+REGISTER_TYPED_TEST_SUITE_P(avx512_sort,
+                            test_random,
+                            test_reverse,
+                            test_constant,
+                            test_small_range);
diff --git a/tests/test_qsortfp16.cpp b/tests/test_qsortfp16.cpp
diff --git a/tests/test_sort.cpp b/tests/test_sort.cpp

Original file line number	Diff line number	Diff line change
`@@ -30,7 +30,8 @@ TYPED_TEST_P(avx512_partial_sort, test_ranges)`
`30`	`30`	`int k = get_uniform_rand_array<int64_t>(1, arrsize, 1).front();`
`31`	`31`
`32`	`32`	`/* Sort the range and verify all the required elements match the presorted set */`
`33`		`- avx512_partial_qsort<TypeParam>(psortedarr.data(), k, psortedarr.size());`
	`33`	`+ avx512_partial_qsort<TypeParam>(`
	`34`	`+ psortedarr.data(), k, psortedarr.size());`
`34`	`35`	`for (size_t jj = 0; jj < k; jj++) {`
`35`	`36`	`ASSERT_EQ(sortedarr[jj], psortedarr[jj]);`
`36`	`37`	`}`