From 0341e015cbedf863ab1e87ffdea7f7a522b7e6b3 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Mon, 15 Sep 2025 21:03:27 +0300 Subject: [PATCH 01/11] init --- Objects/listobject.c | 141 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 137 insertions(+), 4 deletions(-) diff --git a/Objects/listobject.c b/Objects/listobject.c index 5905a6d335b311..9f5f6992562f2a 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -1765,7 +1765,7 @@ struct s_MergeState { the input (nothing is lost or duplicated). */ static int -binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) +binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, float adapt) { Py_ssize_t k; /* for IFLT macro expansion */ PyObject ** const a = ss->keys; @@ -1778,6 +1778,120 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) /* assert a[:ok] is sorted */ if (! ok) ++ok; + + Py_ssize_t L, R; + /* Adaptive step */ + if (adapt) { + Py_ssize_t diff = ok; // jump (jump out on 1st loop to not kick in) + Py_ssize_t last = ok >> 1; // mid point (simple binary on 1st loop) + float ns = 5.0f; // number of successes (a bit of head start) + float seen = 0.0f; // number of loops done + // const float adapt = 1.3; // adaptivity strength + for (; ok < n && ns * adapt >= seen; ++ok) { + pivot = a[ok]; + + IFLT(pivot, a[last]) { + L = 0; + R = last; + if (L < R) { + if (diff == 0) + diff = 1; + M = R - diff; + if (M < L) + M = L; + IFLT(pivot, a[M]) { + R = M; + if (L < R) { + diff += 1; + M = R - diff; + if (M < L) + M = L; + IFLT(pivot, a[M]) + R = M; + else + L = M + 1; + ns += (float)(R - L) * 8 < ok; + } + else { + ns += 2.0f; + } + } + else { + L = M + 1; + ns += (float)(R - L) * 4 < ok; + } + } + else { + ns += 2.0f; + } + } + else { + L = last + 1; + R = ok; + if (L < R) { + M = L + diff; + if (M >= R) + M = R - 1; + IFLT(pivot, a[M]) { + R = M; + ns += (float)(R - L) * 4 < ok; + } + else { + L = M + 1; + if (L < R) { + diff += 1; + M = L + diff; + if (M >= R) + M = R - 1; + IFLT(pivot, a[M]) + R = M; + else + L = M + 1; + ns += (float)(R - L) * 8 < ok; + } + else { + ns += 2.0f; + } + } + } + else { + ns += 2.0f; + } + } + + // Binary Insertion + while (L < R) { + M = (L + R) >> 1; + IFLT(pivot, a[M]) + R = M; + else + L = M + 1; + } + + for (M = ok; M > L; --M) + a[M] = a[M - 1]; + a[L] = pivot; + if (has_values) { + pivot = v[ok]; + for (M = ok; M > L; --M) + v[M] = v[M - 1]; + v[L] = pivot; + } + + // Update Adaptive runvars + diff = L - last; + if (diff < 0) + diff = -diff; + last = L; + seen += 1.0f; + } + if (ok >= n) { + // Successfully ran fully adaptive + // Else go to simple binary sort + return 1; + } + } + /* Regular insertion sort has average- and worst-case O(n**2) cost for both # of comparisons and number of bytes moved. But its branches are highly predictable, and it loves sorted input (n-1 compares and no @@ -1828,7 +1942,7 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) v[M + 1] = vpivot; } #else // binary insertion sort - Py_ssize_t L, R; + for (; ok < n; ++ok) { /* set L to where a[ok] belongs */ L = 0; @@ -3074,6 +3188,9 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse) /* March over the array once, left to right, finding natural runs, * and extending short natural runs to minrun elements. */ + int bres; + Py_ssize_t cs = 0; + Py_ssize_t cd = 1; do { Py_ssize_t n; @@ -3086,8 +3203,24 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse) if (n < minrun) { const Py_ssize_t force = nremaining <= minrun ? nremaining : minrun; - if (binarysort(&ms, &lo, force, n) < 0) - goto fail; + if (cs) { + if (binarysort(&ms, &lo, force, n, 0.0) < 0) + goto fail; + cs -= 1; + } + else { + bres = binarysort(&ms, &lo, force, n, 1.3); + if (bres < 0) + goto fail; + if (bres) { + cd = 1; + } else { + cd += 2; + if (cd > 11) + cd = 11; + cs = cd; + } + } n = force; } /* Maybe merge pending runs. */ From e820eb7bd0230be5a2c26876db41a6e71b7b3334 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Mon, 15 Sep 2025 22:47:35 +0300 Subject: [PATCH 02/11] minor edit --- Objects/listobject.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/Objects/listobject.c b/Objects/listobject.c index 9f5f6992562f2a..0fb27a27bbde6f 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -1784,7 +1784,7 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, flo if (adapt) { Py_ssize_t diff = ok; // jump (jump out on 1st loop to not kick in) Py_ssize_t last = ok >> 1; // mid point (simple binary on 1st loop) - float ns = 5.0f; // number of successes (a bit of head start) + Py_ssize_t ns = 5; // number of successes (a bit of head start) float seen = 0.0f; // number of loops done // const float adapt = 1.3; // adaptivity strength for (; ok < n && ns * adapt >= seen; ++ok) { @@ -1802,7 +1802,6 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, flo IFLT(pivot, a[M]) { R = M; if (L < R) { - diff += 1; M = R - diff; if (M < L) M = L; @@ -1810,19 +1809,19 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, flo R = M; else L = M + 1; - ns += (float)(R - L) * 8 < ok; + ns += (R - L) * 8 < ok; } else { - ns += 2.0f; + ns += 2; } } else { L = M + 1; - ns += (float)(R - L) * 4 < ok; + ns += (R - L) * 4 < ok; } } else { - ns += 2.0f; + ns += 2; } } else { @@ -1834,12 +1833,11 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, flo M = R - 1; IFLT(pivot, a[M]) { R = M; - ns += (float)(R - L) * 4 < ok; + ns += (R - L) * 4 < ok; } else { L = M + 1; if (L < R) { - diff += 1; M = L + diff; if (M >= R) M = R - 1; @@ -1847,15 +1845,15 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, flo R = M; else L = M + 1; - ns += (float)(R - L) * 8 < ok; + ns += (R - L) * 8 < ok; } else { - ns += 2.0f; + ns += 2; } } } else { - ns += 2.0f; + ns += 2; } } @@ -1887,9 +1885,9 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, flo } if (ok >= n) { // Successfully ran fully adaptive - // Else go to simple binary sort return 1; } + // Else go to simple binary sort } /* Regular insertion sort has average- and worst-case O(n**2) cost From 93b69cd6f5b4078d59db059dfe190e83ad838185 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Tue, 16 Sep 2025 22:19:38 +0300 Subject: [PATCH 03/11] v2 --- Objects/listobject.c | 369 +++++++++++++++++++++++++------------------ 1 file changed, 211 insertions(+), 158 deletions(-) diff --git a/Objects/listobject.c b/Objects/listobject.c index 0fb27a27bbde6f..b7d43d3b2b3de7 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -1765,7 +1765,7 @@ struct s_MergeState { the input (nothing is lost or duplicated). */ static int -binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, float adapt) +binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) { Py_ssize_t k; /* for IFLT macro expansion */ PyObject ** const a = ss->keys; @@ -1778,118 +1778,6 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, flo /* assert a[:ok] is sorted */ if (! ok) ++ok; - - Py_ssize_t L, R; - /* Adaptive step */ - if (adapt) { - Py_ssize_t diff = ok; // jump (jump out on 1st loop to not kick in) - Py_ssize_t last = ok >> 1; // mid point (simple binary on 1st loop) - Py_ssize_t ns = 5; // number of successes (a bit of head start) - float seen = 0.0f; // number of loops done - // const float adapt = 1.3; // adaptivity strength - for (; ok < n && ns * adapt >= seen; ++ok) { - pivot = a[ok]; - - IFLT(pivot, a[last]) { - L = 0; - R = last; - if (L < R) { - if (diff == 0) - diff = 1; - M = R - diff; - if (M < L) - M = L; - IFLT(pivot, a[M]) { - R = M; - if (L < R) { - M = R - diff; - if (M < L) - M = L; - IFLT(pivot, a[M]) - R = M; - else - L = M + 1; - ns += (R - L) * 8 < ok; - } - else { - ns += 2; - } - } - else { - L = M + 1; - ns += (R - L) * 4 < ok; - } - } - else { - ns += 2; - } - } - else { - L = last + 1; - R = ok; - if (L < R) { - M = L + diff; - if (M >= R) - M = R - 1; - IFLT(pivot, a[M]) { - R = M; - ns += (R - L) * 4 < ok; - } - else { - L = M + 1; - if (L < R) { - M = L + diff; - if (M >= R) - M = R - 1; - IFLT(pivot, a[M]) - R = M; - else - L = M + 1; - ns += (R - L) * 8 < ok; - } - else { - ns += 2; - } - } - } - else { - ns += 2; - } - } - - // Binary Insertion - while (L < R) { - M = (L + R) >> 1; - IFLT(pivot, a[M]) - R = M; - else - L = M + 1; - } - - for (M = ok; M > L; --M) - a[M] = a[M - 1]; - a[L] = pivot; - if (has_values) { - pivot = v[ok]; - for (M = ok; M > L; --M) - v[M] = v[M - 1]; - v[L] = pivot; - } - - // Update Adaptive runvars - diff = L - last; - if (diff < 0) - diff = -diff; - last = L; - seen += 1.0f; - } - if (ok >= n) { - // Successfully ran fully adaptive - return 1; - } - // Else go to simple binary sort - } - /* Regular insertion sort has average- and worst-case O(n**2) cost for both # of comparisons and number of bytes moved. But its branches are highly predictable, and it loves sorted input (n-1 compares and no @@ -1940,7 +1828,7 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, flo v[M + 1] = vpivot; } #else // binary insertion sort - + Py_ssize_t L, R; for (; ok < n; ++ok) { /* set L to where a[ok] belongs */ L = 0; @@ -1998,6 +1886,148 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, flo return -1; } +static Py_ssize_t +abinarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, int adapt) +{ + Py_ssize_t k; /* for IFLT macro expansion */ + PyObject ** const a = ss->keys; + PyObject ** const v = ss->values; + const bool has_values = v != NULL; + PyObject *pivot; + + assert(0 <= ok && ok <= n && 1 <= n && n <= MAX_MINRUN); + /* assert a[:ok] is sorted */ + if (! ok) + ++ok; + + Py_ssize_t M, L, R; + Py_ssize_t nsorted = ok; + Py_ssize_t diff_new; + Py_ssize_t diff = ok; // jump (jump out on 1st loop to not kick in) + Py_ssize_t last = ok >> 1; // mid point (simple binary on 1st loop) + Py_ssize_t ns = 0; // number of successes (a bit of head start) + + if (adapt) { + for (; ok < n; ++ok) { + pivot = a[ok]; + + IFLT(pivot, a[last]) { + L = 0; + R = last; + if (L < R) { + // To not affect diff for measure counting + diff_new = diff + (diff == 0); + M = R - diff_new; + if (M < L) + M = L; + IFLT(pivot, a[M]) { + R = M; + if (L < R) { + M = R - diff_new; + if (M < L) + M = L; + IFLT(pivot, a[M]) + R = M; + else + L = M + 1; + } + } + else { + L = M + 1; + } + } + } + else { + L = last + 1; + R = ok; + if (L < R) { + M = L + diff; + if (M >= R) + M = R - 1; + IFLT(pivot, a[M]) { + R = M; + } + else { + L = M + 1; + if (L < R) { + M = L + diff; + if (M >= R) + M = R - 1; + IFLT(pivot, a[M]) + R = M; + else + L = M + 1; + } + } + } + } + + // Binary Insertion + while (L < R) { + M = (L + R) >> 1; + IFLT(pivot, a[M]) + R = M; + else + L = M + 1; + } + + for (M = ok; M > L; --M) + a[M] = a[M - 1]; + a[L] = pivot; + if (has_values) { + pivot = v[ok]; + for (M = ok; M > L; --M) + v[M] = v[M - 1]; + v[L] = pivot; + } + + // Update Adaptive runvars + diff_new = L < last ? last - L : L - last; + ns += diff_new < diff ? diff - diff_new : diff_new - diff; + diff = diff_new; + last = L; + } + } + else { + for (; ok < n; ++ok) { + pivot = a[ok]; + L = 0; + R = ok; + + // Binary Insertion + while (L < R) { + M = (L + R) >> 1; + IFLT(pivot, a[M]) + R = M; + else + L = M + 1; + } + + for (M = ok; M > L; --M) + a[M] = a[M - 1]; + a[L] = pivot; + if (has_values) { + pivot = v[ok]; + for (M = ok; M > L; --M) + v[M] = v[M - 1]; + v[L] = pivot; + } + + // Update Adaptive runvars + diff_new = L < last ? last - L : L - last; + ns += diff_new < diff ? diff - diff_new : diff_new - diff; + diff = diff_new; + last = L; + } + } + + // Return Adaptivity measure (max 1000) + return ns * 2000 / ((n + 2 * nsorted - 1) * n); + + fail: + return -1; +} + static void sortslice_reverse(sortslice *s, Py_ssize_t n) { @@ -3186,55 +3216,78 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse) /* March over the array once, left to right, finding natural runs, * and extending short natural runs to minrun elements. */ - int bres; - Py_ssize_t cs = 0; - Py_ssize_t cd = 1; - do { - Py_ssize_t n; - - /* Identify next run. */ - n = count_run(&ms, &lo, nremaining); - if (n < 0) - goto fail; - /* If short, extend to min(minrun, nremaining). */ - minrun = minrun_next(&ms); - if (n < minrun) { - const Py_ssize_t force = nremaining <= minrun ? - nremaining : minrun; - if (cs) { - if (binarysort(&ms, &lo, force, n, 0.0) < 0) + int binary_adapt = 1; + // NOTE: Could turn on based on minlen or comparison type + if (binary_adapt) { + int adapt = 0; // do not run binarysort adaptivity on 1st run + do { + /* Identify next run. */ + Py_ssize_t n; + n = count_run(&ms, &lo, nremaining); + if (n < 0) + goto fail; + /* If short, extend to min(minrun, nremaining). */ + minrun = minrun_next(&ms); + if (n < minrun) { + const Py_ssize_t force = nremaining <= minrun ? + nremaining : minrun; + Py_ssize_t bres; + bres = abinarysort(&ms, &lo, force, n, adapt); + if (bres < 0) goto fail; - cs -= 1; + adapt = bres < 125; + n = force; } else { - bres = binarysort(&ms, &lo, force, n, 1.3); - if (bres < 0) + // After long monotonic run start adapting immediately + adapt = 1; + } + /* Maybe merge pending runs. */ + assert(ms.n == 0 || ms.pending[ms.n -1].base.keys + + ms.pending[ms.n-1].len == lo.keys); + if (found_new_run(&ms, n) < 0) + goto fail; + /* Push new run on stack. */ + assert(ms.n < MAX_MERGE_PENDING); + ms.pending[ms.n].base = lo; + ms.pending[ms.n].len = n; + ++ms.n; + /* Advance to find next run. */ + sortslice_advance(&lo, n); + nremaining -= n; + } while (nremaining); + } + else { + do { + /* Identify next run. */ + Py_ssize_t n; + n = count_run(&ms, &lo, nremaining); + if (n < 0) + goto fail; + /* If short, extend to min(minrun, nremaining). */ + minrun = minrun_next(&ms); + if (n < minrun) { + const Py_ssize_t force = nremaining <= minrun ? + nremaining : minrun; + if (binarysort(&ms, &lo, force, n) < 0) goto fail; - if (bres) { - cd = 1; - } else { - cd += 2; - if (cd > 11) - cd = 11; - cs = cd; - } + n = force; } - n = force; - } - /* Maybe merge pending runs. */ - assert(ms.n == 0 || ms.pending[ms.n -1].base.keys + - ms.pending[ms.n-1].len == lo.keys); - if (found_new_run(&ms, n) < 0) - goto fail; - /* Push new run on stack. */ - assert(ms.n < MAX_MERGE_PENDING); - ms.pending[ms.n].base = lo; - ms.pending[ms.n].len = n; - ++ms.n; - /* Advance to find next run. */ - sortslice_advance(&lo, n); - nremaining -= n; - } while (nremaining); + /* Maybe merge pending runs. */ + assert(ms.n == 0 || ms.pending[ms.n -1].base.keys + + ms.pending[ms.n-1].len == lo.keys); + if (found_new_run(&ms, n) < 0) + goto fail; + /* Push new run on stack. */ + assert(ms.n < MAX_MERGE_PENDING); + ms.pending[ms.n].base = lo; + ms.pending[ms.n].len = n; + ++ms.n; + /* Advance to find next run. */ + sortslice_advance(&lo, n); + nremaining -= n; + } while (nremaining); + } if (merge_force_collapse(&ms) < 0) goto fail; From 71c82af116bb71f4a4ec5c46103cba7b7316138d Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Wed, 17 Sep 2025 01:40:52 +0300 Subject: [PATCH 04/11] v3 --- Objects/listobject.c | 70 ++++++++++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 25 deletions(-) diff --git a/Objects/listobject.c b/Objects/listobject.c index b7d43d3b2b3de7..7b546810c93a14 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -1902,28 +1902,28 @@ abinarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, in Py_ssize_t M, L, R; Py_ssize_t nsorted = ok; - Py_ssize_t diff_new; - Py_ssize_t diff = ok; // jump (jump out on 1st loop to not kick in) - Py_ssize_t last = ok >> 1; // mid point (simple binary on 1st loop) - Py_ssize_t ns = 0; // number of successes (a bit of head start) + Py_ssize_t last = ok >> 1; + Py_ssize_t std = ok >> 2; + Py_ssize_t mu = last; + Py_ssize_t nb = 0; // badness of fit if (adapt) { for (; ok < n; ++ok) { pivot = a[ok]; - IFLT(pivot, a[last]) { + IFLT(pivot, a[mu]) { L = 0; - R = last; + R = mu; if (L < R) { // To not affect diff for measure counting - diff_new = diff + (diff == 0); - M = R - diff_new; + std += (std == 0); + M = R - std; if (M < L) M = L; IFLT(pivot, a[M]) { R = M; if (L < R) { - M = R - diff_new; + M = R - std; if (M < L) M = L; IFLT(pivot, a[M]) @@ -1938,10 +1938,10 @@ abinarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, in } } else { - L = last + 1; + L = mu + 1; R = ok; if (L < R) { - M = L + diff; + M = L + std; if (M >= R) M = R - 1; IFLT(pivot, a[M]) { @@ -1950,7 +1950,7 @@ abinarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, in else { L = M + 1; if (L < R) { - M = L + diff; + M = L + std; if (M >= R) M = R - 1; IFLT(pivot, a[M]) @@ -1982,9 +1982,10 @@ abinarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, in } // Update Adaptive runvars - diff_new = L < last ? last - L : L - last; - ns += diff_new < diff ? diff - diff_new : diff_new - diff; - diff = diff_new; + std = L < mu ? mu - L : L - mu; + nb += std; + mu = L + L - last; + mu = mu < 0 ? 0 : mu > ok ? ok : mu; last = L; } } @@ -2014,15 +2015,16 @@ abinarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, in } // Update Adaptive runvars - diff_new = L < last ? last - L : L - last; - ns += diff_new < diff ? diff - diff_new : diff_new - diff; - diff = diff_new; + std = L < mu ? mu - L : L - mu; + nb += std; + mu = L + L - last; + mu = mu < 0 ? 0 : mu > ok ? ok : mu; last = L; } } // Return Adaptivity measure (max 1000) - return ns * 2000 / ((n + 2 * nsorted - 1) * n); + return nb * 2000 / ((n + 2 * nsorted - 1) * n); fail: return -1; @@ -3216,10 +3218,12 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse) /* March over the array once, left to right, finding natural runs, * and extending short natural runs to minrun elements. */ - int binary_adapt = 1; // NOTE: Could turn on based on minlen or comparison type + int binary_adapt = ms.listlen >= 100; if (binary_adapt) { int adapt = 0; // do not run binarysort adaptivity on 1st run + Py_ssize_t cs = 0; + Py_ssize_t cd = 1; do { /* Identify next run. */ Py_ssize_t n; @@ -3231,11 +3235,27 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse) if (n < minrun) { const Py_ssize_t force = nremaining <= minrun ? nremaining : minrun; - Py_ssize_t bres; - bres = abinarysort(&ms, &lo, force, n, adapt); - if (bres < 0) - goto fail; - adapt = bres < 125; + if (cs) { + if (binarysort(&ms, &lo, force, n) < 0) + goto fail; + cs -= 1; + } + else { + Py_ssize_t bres; + bres = abinarysort(&ms, &lo, force, n, adapt); + if (bres < 0) + goto fail; + adapt = bres < 250; + if (adapt) { + cd = 1; + } + else { + cd += 2; + if (cd > 11) + cd = 11; + cs = cd; + } + } n = force; } else { From 6e0269ccb43f3eaab49cdcff3fa796eec7934f31 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Wed, 17 Sep 2025 03:09:39 +0300 Subject: [PATCH 05/11] minor changes --- Objects/listobject.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/Objects/listobject.c b/Objects/listobject.c index 7b546810c93a14..b9f968c734107a 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -1905,7 +1905,7 @@ abinarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, in Py_ssize_t last = ok >> 1; Py_ssize_t std = ok >> 2; Py_ssize_t mu = last; - Py_ssize_t nb = 0; // badness of fit + Py_ssize_t nbad = 0; // badness of fit if (adapt) { for (; ok < n; ++ok) { @@ -1983,7 +1983,7 @@ abinarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, in // Update Adaptive runvars std = L < mu ? mu - L : L - mu; - nb += std; + nbad += std; mu = L + L - last; mu = mu < 0 ? 0 : mu > ok ? ok : mu; last = L; @@ -2016,7 +2016,7 @@ abinarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, in // Update Adaptive runvars std = L < mu ? mu - L : L - mu; - nb += std; + nbad += std; mu = L + L - last; mu = mu < 0 ? 0 : mu > ok ? ok : mu; last = L; @@ -2024,7 +2024,7 @@ abinarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, in } // Return Adaptivity measure (max 1000) - return nb * 2000 / ((n + 2 * nsorted - 1) * n); + return nbad * 2000 / ((n + 2 * nsorted - 1) * n); fail: return -1; @@ -3221,9 +3221,10 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse) // NOTE: Could turn on based on minlen or comparison type int binary_adapt = ms.listlen >= 100; if (binary_adapt) { - int adapt = 0; // do not run binarysort adaptivity on 1st run - Py_ssize_t cs = 0; + int adapt = 0; // do not run binarysort adaptivity on 1st run + Py_ssize_t cs = 0; // but do check goodness of adaptive fit Py_ssize_t cd = 1; + Py_ssize_t abinres; do { /* Identify next run. */ Py_ssize_t n; @@ -3241,26 +3242,24 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse) cs -= 1; } else { - Py_ssize_t bres; - bres = abinarysort(&ms, &lo, force, n, adapt); - if (bres < 0) + abinres = abinarysort(&ms, &lo, force, n, adapt); + if (abinres < 0) goto fail; - adapt = bres < 250; - if (adapt) { + adapt = abinres < 250; + if (adapt) cd = 1; - } - else { - cd += 2; - if (cd > 11) - cd = 11; - cs = cd; - } + else if (cd >= 9) + cs = cd = 11; + else + cs = cd = cd + 2; } n = force; } else { // After long monotonic run start adapting immediately adapt = 1; + cs = 0; + cd = 1; } /* Maybe merge pending runs. */ assert(ms.n == 0 || ms.pending[ms.n -1].base.keys + From 827d48e23a36b5417c60736cdedfe6c5fd1620bd Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Wed, 17 Sep 2025 19:28:43 +0300 Subject: [PATCH 06/11] minor edits --- Objects/listobject.c | 72 ++++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 33 deletions(-) diff --git a/Objects/listobject.c b/Objects/listobject.c index b9f968c734107a..f955a101857661 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -1886,7 +1886,7 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok) return -1; } -static Py_ssize_t +static int abinarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, int adapt) { Py_ssize_t k; /* for IFLT macro expansion */ @@ -1982,7 +1982,7 @@ abinarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, in } // Update Adaptive runvars - std = L < mu ? mu - L : L - mu; + std = labs(L - mu); nbad += std; mu = L + L - last; mu = mu < 0 ? 0 : mu > ok ? ok : mu; @@ -2015,7 +2015,7 @@ abinarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, in } // Update Adaptive runvars - std = L < mu ? mu - L : L - mu; + std = labs(L - mu); nbad += std; mu = L + L - last; mu = mu < 0 ? 0 : mu > ok ? ok : mu; @@ -3220,11 +3220,7 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse) */ // NOTE: Could turn on based on minlen or comparison type int binary_adapt = ms.listlen >= 100; - if (binary_adapt) { - int adapt = 0; // do not run binarysort adaptivity on 1st run - Py_ssize_t cs = 0; // but do check goodness of adaptive fit - Py_ssize_t cd = 1; - Py_ssize_t abinres; + if (!binary_adapt) { do { /* Identify next run. */ Py_ssize_t n; @@ -3236,31 +3232,10 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse) if (n < minrun) { const Py_ssize_t force = nremaining <= minrun ? nremaining : minrun; - if (cs) { - if (binarysort(&ms, &lo, force, n) < 0) - goto fail; - cs -= 1; - } - else { - abinres = abinarysort(&ms, &lo, force, n, adapt); - if (abinres < 0) - goto fail; - adapt = abinres < 250; - if (adapt) - cd = 1; - else if (cd >= 9) - cs = cd = 11; - else - cs = cd = cd + 2; - } + if (binarysort(&ms, &lo, force, n) < 0) + goto fail; n = force; } - else { - // After long monotonic run start adapting immediately - adapt = 1; - cs = 0; - cd = 1; - } /* Maybe merge pending runs. */ assert(ms.n == 0 || ms.pending[ms.n -1].base.keys + ms.pending[ms.n-1].len == lo.keys); @@ -3277,6 +3252,16 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse) } while (nremaining); } else { + // NOTE:WIP: Only 1% out of 6% worst case is due to + // extra calculations in simple binary sort + // removing big branch in `abinarysort` also has not effect + // this has something to do with higher level branch prediction + // doing if (0) removes only 1% extra == 2% + // and commenting out code still 2% slower...??? + int adapt = 0; // do not run binarysort adaptivity on 1st run + int cs = 0; // but do check goodness of adaptive fit + int cd = 1; + int abinret; do { /* Identify next run. */ Py_ssize_t n; @@ -3288,10 +3273,31 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse) if (n < minrun) { const Py_ssize_t force = nremaining <= minrun ? nremaining : minrun; - if (binarysort(&ms, &lo, force, n) < 0) - goto fail; + if (cs) { + if (binarysort(&ms, &lo, force, n) < 0) + goto fail; + cs -= 1; + } + else { + abinret = abinarysort(&ms, &lo, force, n, adapt); + if (abinret < 0) + goto fail; + adapt = abinret < 250; + if (adapt) + cd = 1; + else if (cd >= 9) + cs = cd = 11; + else + cs = cd = cd + 2; + } n = force; } + else { + // After long monotonic run start adapting immediately + adapt = 1; + cs = 0; + cd = 1; + } /* Maybe merge pending runs. */ assert(ms.n == 0 || ms.pending[ms.n -1].base.keys + ms.pending[ms.n-1].len == lo.keys); From 589572fb98d1e51a817f1cf5692283573b13e781 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Wed, 17 Sep 2025 19:30:44 +0300 Subject: [PATCH 07/11] micro change --- Objects/listobject.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Objects/listobject.c b/Objects/listobject.c index f955a101857661..39073acacc1d27 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -3222,8 +3222,9 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse) int binary_adapt = ms.listlen >= 100; if (!binary_adapt) { do { - /* Identify next run. */ Py_ssize_t n; + + /* Identify next run. */ n = count_run(&ms, &lo, nremaining); if (n < 0) goto fail; @@ -3263,8 +3264,9 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse) int cd = 1; int abinret; do { - /* Identify next run. */ Py_ssize_t n; + + /* Identify next run. */ n = count_run(&ms, &lo, nremaining); if (n < 0) goto fail; From 992b26f04bb0676367a476c4b33d35f7e70345be Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Sat, 20 Sep 2025 02:23:42 +0300 Subject: [PATCH 08/11] minimum std condition added --- Objects/listobject.c | 137 +++++++++++++++++++++++++------------------ 1 file changed, 80 insertions(+), 57 deletions(-) diff --git a/Objects/listobject.c b/Objects/listobject.c index 39073acacc1d27..2c38103ca63a1c 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -1903,72 +1903,92 @@ abinarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, in Py_ssize_t M, L, R; Py_ssize_t nsorted = ok; Py_ssize_t last = ok >> 1; - Py_ssize_t std = ok >> 2; + Py_ssize_t std = last; Py_ssize_t mu = last; - Py_ssize_t nbad = 0; // badness of fit + Py_ssize_t nbad = 0; // badness of fit if (adapt) { for (; ok < n; ++ok) { pivot = a[ok]; - IFLT(pivot, a[mu]) { - L = 0; - R = mu; - if (L < R) { - // To not affect diff for measure counting - std += (std == 0); - M = R - std; - if (M < L) - M = L; - IFLT(pivot, a[M]) { - R = M; - if (L < R) { - M = R - std; - if (M < L) - M = L; - IFLT(pivot, a[M]) - R = M; - else - L = M + 1; + if (std < ok / 4) { + M = mu; + IFLT(pivot, a[M]) { + L = 0; + R = M; + if (L < R) { + std += !std; + M = R - std; + if (M < L) + M = L; + IFLT(pivot, a[M]) { + R = M; + if (L < R) { + M = R - std; + if (M < L) + M = L; + IFLT(pivot, a[M]) + R = M; + else + L = M + 1; + } + } + else { + L = M + 1; } } - else { - L = M + 1; + } + else { + L = M + 1; + R = ok; + if (L < R) { + M = L + std; + if (M >= R) + M = R - 1; + IFLT(pivot, a[M]) { + R = M; + } + else { + L = M + 1; + if (L < R) { + M = L + std; + if (M >= R) + M = R - 1; + IFLT(pivot, a[M]) + R = M; + else + L = M + 1; + } + } } } - } - else { - L = mu + 1; - R = ok; - if (L < R) { - M = L + std; - if (M >= R) - M = R - 1; - IFLT(pivot, a[M]) { + // Binary Insertion + while (L < R) { + M = (L + R) >> 1; + IFLT(pivot, a[M]) R = M; - } - else { + else L = M + 1; - if (L < R) { - M = L + std; - if (M >= R) - M = R - 1; - IFLT(pivot, a[M]) - R = M; - else - L = M + 1; - } - } } } - - // Binary Insertion - while (L < R) { - M = (L + R) >> 1; - IFLT(pivot, a[M]) + else { + // Binary Insertion + M = ok >> 1; + IFLT(pivot, a[M]) { + L = 0; R = M; - else + } + else { L = M + 1; + R = ok; + } + while (L < R) { + M = (L + R) >> 1; + IFLT(pivot, a[M]) + R = M; + else + L = M + 1; + } } for (M = ok; M > L; --M) @@ -1992,10 +2012,17 @@ abinarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, in else { for (; ok < n; ++ok) { pivot = a[ok]; - L = 0; - R = ok; // Binary Insertion + M = ok >> 1; + IFLT(pivot, a[M]) { + L = 0; + R = M; + } + else { + L = M + 1; + R = ok; + } while (L < R) { M = (L + R) >> 1; IFLT(pivot, a[M]) @@ -3253,12 +3280,8 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse) } while (nremaining); } else { - // NOTE:WIP: Only 1% out of 6% worst case is due to + // NOTE:WIP: Only 1% difference is due to // extra calculations in simple binary sort - // removing big branch in `abinarysort` also has not effect - // this has something to do with higher level branch prediction - // doing if (0) removes only 1% extra == 2% - // and commenting out code still 2% slower...??? int adapt = 0; // do not run binarysort adaptivity on 1st run int cs = 0; // but do check goodness of adaptive fit int cd = 1; From 984bcd0fc400abfcfa9c5c59fa2513723ebfc1a7 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Sat, 20 Sep 2025 03:17:05 +0300 Subject: [PATCH 09/11] bit shift instead of div --- Objects/listobject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/listobject.c b/Objects/listobject.c index 2c38103ca63a1c..b8895c18da67d7 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -1911,7 +1911,7 @@ abinarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, in for (; ok < n; ++ok) { pivot = a[ok]; - if (std < ok / 4) { + if (std < (ok >> 2)) { M = mu; IFLT(pivot, a[M]) { L = 0; From 7f30b554767b1ee7d09cfc31484019df743904a5 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Sun, 21 Sep 2025 01:56:48 +0300 Subject: [PATCH 10/11] simplified to mu=j --- Objects/listobject.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/Objects/listobject.c b/Objects/listobject.c index b8895c18da67d7..7e9d54e0220845 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -1902,20 +1902,21 @@ abinarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, in Py_ssize_t M, L, R; Py_ssize_t nsorted = ok; - Py_ssize_t last = ok >> 1; - Py_ssize_t std = last; - Py_ssize_t mu = last; + Py_ssize_t mu = ok >> 1; + Py_ssize_t std = mu; Py_ssize_t nbad = 0; // badness of fit if (adapt) { for (; ok < n; ++ok) { pivot = a[ok]; + // NOTE: If abinarysort is actually working, there will be gains + // And this is a relatively small insurance against adversity + // However, subject to be removed if not helpful in practice if (std < (ok >> 2)) { - M = mu; - IFLT(pivot, a[M]) { + IFLT(pivot, a[mu]) { L = 0; - R = M; + R = mu; if (L < R) { std += !std; M = R - std; @@ -1939,7 +1940,7 @@ abinarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, in } } else { - L = M + 1; + L = mu + 1; R = ok; if (L < R) { M = L + std; @@ -2004,9 +2005,7 @@ abinarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, in // Update Adaptive runvars std = labs(L - mu); nbad += std; - mu = L + L - last; - mu = mu < 0 ? 0 : mu > ok ? ok : mu; - last = L; + mu = L; } } else { @@ -2044,9 +2043,7 @@ abinarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, in // Update Adaptive runvars std = labs(L - mu); nbad += std; - mu = L + L - last; - mu = mu < 0 ? 0 : mu > ok ? ok : mu; - last = L; + mu = L; } } From 2ff75f8f7aa650800be39e0c11c41f748565da75 Mon Sep 17 00:00:00 2001 From: "d.grigonis" Date: Sun, 21 Sep 2025 18:43:56 +0300 Subject: [PATCH 11/11] noop --- Objects/listobject.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Objects/listobject.c b/Objects/listobject.c index 7e9d54e0220845..c17a834435a7e0 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -2048,6 +2048,7 @@ abinarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok, in } // Return Adaptivity measure (max 1000) + // This is: 1000 * nbad / sum(range(nsorted:n)) return nbad * 2000 / ((n + 2 * nsorted - 1) * n); fail: @@ -3277,8 +3278,6 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse) } while (nremaining); } else { - // NOTE:WIP: Only 1% difference is due to - // extra calculations in simple binary sort int adapt = 0; // do not run binarysort adaptivity on 1st run int cs = 0; // but do check goodness of adaptive fit int cd = 1;