From 1d249f5837a2efa204583463bb425883245d0c8e Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Mon, 26 Feb 2024 14:46:27 -0800 Subject: [PATCH 1/5] Use smart pivot in key-value --- src/avx512-64bit-common.h | 86 +++++++++++++++++++++++++++++++ src/avx512-64bit-keyvaluesort.hpp | 5 +- 2 files changed, 90 insertions(+), 1 deletion(-) diff --git a/src/avx512-64bit-common.h b/src/avx512-64bit-common.h index 1cd4ca1c..c529fae6 100644 --- a/src/avx512-64bit-common.h +++ b/src/avx512-64bit-common.h @@ -25,6 +25,7 @@ template X86_SIMD_SORT_INLINE reg_t sort_zmm_64bit(reg_t zmm); struct avx512_64bit_swizzle_ops; +struct avx512_ymm_64bit_swizzle_ops; template <> struct ymm_vector { @@ -34,6 +35,7 @@ struct ymm_vector { using opmask_t = __mmask8; static const uint8_t numlanes = 8; static constexpr simd_type vec_type = simd_type::AVX512; + using swizzle_ops = avx512_ymm_64bit_swizzle_ops; static type_t type_max() { @@ -208,6 +210,9 @@ struct ymm_vector { { return _mm256_castps_si256(v); } + static bool all_false(opmask_t k){ + return k == 0; + } static reg_t reverse(reg_t ymm) { const __m256i rev_index = _mm256_set_epi32(NETWORK_32BIT_AVX2_2); @@ -222,6 +227,7 @@ struct ymm_vector { using opmask_t = __mmask8; static const uint8_t numlanes = 8; static constexpr simd_type vec_type = simd_type::AVX512; + using swizzle_ops = avx512_ymm_64bit_swizzle_ops; static type_t type_max() { @@ -382,6 +388,9 @@ struct ymm_vector { { return v; } + static bool all_false(opmask_t k){ + return k == 0; + } static reg_t reverse(reg_t ymm) { const __m256i rev_index = _mm256_set_epi32(NETWORK_32BIT_AVX2_2); @@ -396,6 +405,7 @@ struct ymm_vector { using opmask_t = __mmask8; static const uint8_t numlanes = 8; static constexpr simd_type vec_type = simd_type::AVX512; + using swizzle_ops = avx512_ymm_64bit_swizzle_ops; static type_t type_max() { @@ -556,6 +566,9 @@ struct ymm_vector { { return v; } + static bool all_false(opmask_t k){ + return k == 0; + } static reg_t reverse(reg_t ymm) { const __m256i rev_index = _mm256_set_epi32(NETWORK_32BIT_AVX2_2); @@ -1204,4 +1217,77 @@ struct avx512_64bit_swizzle_ops { } }; +struct avx512_ymm_64bit_swizzle_ops { + template + X86_SIMD_SORT_INLINE typename vtype::reg_t swap_n(typename vtype::reg_t reg) + { + __m256i v = vtype::cast_to(reg); + + if constexpr (scale == 2) { + __m256 vf = _mm256_castsi256_ps(v); + vf = _mm256_permute_ps(vf, 0b10110001); + v = _mm256_castps_si256(vf); + } + else if constexpr (scale == 4) { + __m256 vf = _mm256_castsi256_ps(v); + vf = _mm256_permute_ps(vf, 0b01001110); + v = _mm256_castps_si256(vf); + } + else if constexpr (scale == 8) { + v = _mm256_permute2x128_si256(v, v, 0b00000001); + } + else { + static_assert(scale == -1, "should not be reached"); + } + + return vtype::cast_from(v); + } + + template + X86_SIMD_SORT_INLINE typename vtype::reg_t + reverse_n(typename vtype::reg_t reg) + { + __m256i v = vtype::cast_to(reg); + + if constexpr (scale == 2) { return swap_n(reg); } + else if constexpr (scale == 4) { + constexpr uint64_t mask = 0b00011011; + __m256 vf = _mm256_castsi256_ps(v); + vf = _mm256_permute_ps(vf, mask); + v = _mm256_castps_si256(vf); + } + else if constexpr (scale == 8) { + return vtype::reverse(reg); + } + else { + static_assert(scale == -1, "should not be reached"); + } + + return vtype::cast_from(v); + } + + template + X86_SIMD_SORT_INLINE typename vtype::reg_t + merge_n(typename vtype::reg_t reg, typename vtype::reg_t other) + { + __m256i v1 = vtype::cast_to(reg); + __m256i v2 = vtype::cast_to(other); + + if constexpr (scale == 2) { + v1 = _mm256_blend_epi32(v1, v2, 0b01010101); + } + else if constexpr (scale == 4) { + v1 = _mm256_blend_epi32(v1, v2, 0b00110011); + } + else if constexpr (scale == 8) { + v1 = _mm256_blend_epi32(v1, v2, 0b00001111); + } + else { + static_assert(scale == -1, "should not be reached"); + } + + return vtype::cast_from(v1); + } +}; + #endif diff --git a/src/avx512-64bit-keyvaluesort.hpp b/src/avx512-64bit-keyvaluesort.hpp index 9acdbd71..48b28ad7 100644 --- a/src/avx512-64bit-keyvaluesort.hpp +++ b/src/avx512-64bit-keyvaluesort.hpp @@ -388,7 +388,10 @@ X86_SIMD_SORT_INLINE void qsort_64bit_(type1_t *keys, return; } - type1_t pivot = get_pivot_blocks(keys, left, right); + type1_t pivot; + auto pivot_result = get_pivot_smart(keys, left, right); + pivot = pivot_result.pivot; + type1_t smallest = vtype1::type_max(); type1_t biggest = vtype1::type_min(); arrsize_t pivot_index = partition_avx512_unrolled( From bc77ec988979654d216288285353bf7828d2e702 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 27 Feb 2024 09:20:48 -0800 Subject: [PATCH 2/5] format fixes --- src/avx512-64bit-common.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/avx512-64bit-common.h b/src/avx512-64bit-common.h index c529fae6..cbd38a32 100644 --- a/src/avx512-64bit-common.h +++ b/src/avx512-64bit-common.h @@ -210,7 +210,8 @@ struct ymm_vector { { return _mm256_castps_si256(v); } - static bool all_false(opmask_t k){ + static bool all_false(opmask_t k) + { return k == 0; } static reg_t reverse(reg_t ymm) @@ -388,7 +389,8 @@ struct ymm_vector { { return v; } - static bool all_false(opmask_t k){ + static bool all_false(opmask_t k) + { return k == 0; } static reg_t reverse(reg_t ymm) @@ -566,7 +568,8 @@ struct ymm_vector { { return v; } - static bool all_false(opmask_t k){ + static bool all_false(opmask_t k) + { return k == 0; } static reg_t reverse(reg_t ymm) From 8d16dd6b02cae91d3330c75369e3e5e07662af01 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 27 Feb 2024 14:57:14 -0800 Subject: [PATCH 3/5] Bug in rand_array.h --- utils/rand_array.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/rand_array.h b/utils/rand_array.h index a9703551..dc20dbb9 100644 --- a/utils/rand_array.h +++ b/utils/rand_array.h @@ -137,7 +137,7 @@ static std::vector get_array(std::string arrtype, val = std::numeric_limits::max(); } for (size_t ii = 1; ii <= arrsize; ++ii) { - if (rand() % 0x1) { arr[ii] = val; } + if (rand() & 0x1) { arr[ii] = val; } } } else { From 24e8e3db9ab565de28a9b5514f5e9a7458da5831 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 27 Feb 2024 14:57:59 -0800 Subject: [PATCH 4/5] update to 5.0 --- meson.build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meson.build b/meson.build index 873094ba..1e2a913a 100644 --- a/meson.build +++ b/meson.build @@ -1,5 +1,5 @@ project('x86-simd-sort', 'cpp', - version : '4.0.0', + version : '5.0.0', license : 'BSD 3-clause', default_options : ['cpp_std=c++17']) fs = import('fs') From 3d47ffc1c1fb7d2231ac87881f89aa017c39b1dd Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 27 Feb 2024 15:04:07 -0800 Subject: [PATCH 5/5] Remove dead code --- src/xss-pivot-selection.hpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/xss-pivot-selection.hpp b/src/xss-pivot-selection.hpp index 59dc0489..13fed026 100644 --- a/src/xss-pivot-selection.hpp +++ b/src/xss-pivot-selection.hpp @@ -157,12 +157,7 @@ get_pivot_smart(type_t *arr, const arrsize_t left, const arrsize_t right) // Thus, median probably is a fine pivot, since it will move all of this common value into its own partition return pivot_results(median); } - else { - // Should be unreachable - return pivot_results(median); - } - // Should be unreachable return pivot_results(median); }