@@ -1628,6 +1628,15 @@ sortslice_advance(sortslice *slice, Py_ssize_t n)
 /* Avoid malloc for small temp arrays. */
 #define MERGESTATE_TEMP_SIZE 256

+/* The largest value of minrun. This must be a power of 2, and >= 1, so that
+ * the compute_minrun() algorithm guarantees to return a result no larger than
+ * this.
+ */
+#define MAX_MINRUN 64
+#if ((MAX_MINRUN) < 1) || ((MAX_MINRUN) & ((MAX_MINRUN) - 1))
+#error "MAX_MINRUN must be a power of 2, and >= 1"
+#endif
+
 /* One MergeState exists on the stack per invocation of mergesort. It's just
  * a convenient way to pass state around among the helper functions.
  */
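The #if guard rejects a bad MAX_MINRUN at compile time using the usual bit trick: for x >= 1, x is a power of 2 exactly when x & (x - 1) == 0, because subtracting 1 clears the lowest set bit and sets every bit below it. A minimal runtime sketch of the same test (is_pow2 is an illustrative name, not part of the patch):

    #include <assert.h>
    #include <stdbool.h>

    static bool is_pow2(long x)              /* illustrative helper */
    {
        return x >= 1 && (x & (x - 1)) == 0;
    }

    int main(void)
    {
        assert(is_pow2(1) && is_pow2(64));   /* MAX_MINRUN = 64 passes the guard */
        assert(!is_pow2(0) && !is_pow2(96)); /* 96 & 95 == 64, nonzero */
        return 0;
    }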
@@ -1685,68 +1694,133 @@ struct s_MergeState {
     int (*tuple_elem_compare)(PyObject *, PyObject *, MergeState *);
 };

-/* binarysort is the best method for sorting small arrays: it does
-   few compares, but can do data movement quadratic in the number of
-   elements.
-   [lo.keys, hi) is a contiguous slice of a list of keys, and is sorted via
-   binary insertion. This sort is stable.
-   On entry, must have lo.keys <= start <= hi, and that
-   [lo.keys, start) is already sorted (pass start == lo.keys if you don't
-   know!).
-   If islt() complains return -1, else 0.
+/* binarysort is the best method for sorting small arrays: it does few
+   compares, but can do data movement quadratic in the number of elements.
+   ss->keys is viewed as an array of n keys, a[:n]. a[:ok] is already sorted.
+   Pass ok = 0 (or 1) if you don't know.
+   It's sorted in-place, by a stable binary insertion sort. If ss->values
+   isn't NULL, it's permuted in lockstep with ss->keys.
+   On entry, must have n >= 1, and 0 <= ok <= n <= MAX_MINRUN.
+   Return -1 if comparison raises an exception, else 0.
    Even in case of error, the output slice will be some permutation of
    the input (nothing is lost or duplicated).
 */
 static int
-binarysort(MergeState *ms, sortslice lo, PyObject **hi, PyObject **start)
+binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok)
 {
-    Py_ssize_t k;
-    PyObject **l, **p, **r;
+    Py_ssize_t k; /* for IFLT macro expansion */
+    PyObject ** const a = ss->keys;
+    PyObject ** const v = ss->values;
+    const bool has_values = v != NULL;
     PyObject *pivot;
-
-    assert(lo.keys <= start && start <= hi);
-    /* assert [lo.keys, start) is sorted */
-    if (lo.keys == start)
-        ++start;
-    for (; start < hi; ++start) {
-        /* set l to where *start belongs */
-        l = lo.keys;
-        r = start;
-        pivot = *r;
-        /* Invariants:
-         * pivot >= all in [lo.keys, l).
-         * pivot < all in [r, start).
-         * These are vacuously true at the start.
+    Py_ssize_t M;
+
+    assert(0 <= ok && ok <= n && 1 <= n && n <= MAX_MINRUN);
+    /* assert a[:ok] is sorted */
+    if (!ok)
+        ++ok;
+    /* Regular insertion sort has average- and worst-case O(n**2) cost
+       for both # of comparisons and number of bytes moved. But its branches
+       are highly predictable, and it loves sorted input (n-1 compares and no
+       data movement). This is significant in cases like sortperf.py's %sort,
+       where an out-of-order element near the start of a run is moved into
+       place slowly, but then the remaining elements up to length minrun are
+       generally at worst one slot away from their correct position (so only
+       need 1 or 2 compares to resolve). If comparisons are very fast (such
+       as for a list of Python floats), the simple inner loop leaves it
+       very competitive with binary insertion, even though it does
+       significantly more compares overall on random data.
+
+       Binary insertion sort has worst, average, and best case O(n log n)
+       cost for # of comparisons, but worst and average case O(n**2) cost
+       for data movement. The more expensive the comparisons, the more
+       important the comparison advantage. But its branches are less
+       predictable the more "randomish" the data, and that's so significant
+       that its worst case in real life is random input rather than
+       reverse-ordered (which does about twice the data movement that random
+       input does).
+
+       Note that the number of bytes moved doesn't seem to matter. MAX_MINRUN
+       of 64 is so small that the key and value pointers all fit in a corner
+       of L1 cache, and moving things around in that is very fast. */
+#if 0 // ordinary insertion sort.
+    PyObject *vpivot = NULL;
+    for (; ok < n; ++ok) {
+        pivot = a[ok];
+        if (has_values)
+            vpivot = v[ok];
+        for (M = ok - 1; M >= 0; --M) {
+            k = ISLT(pivot, a[M]);
+            if (k < 0) {
+                a[M + 1] = pivot;
+                if (has_values)
+                    v[M + 1] = vpivot;
+                goto fail;
+            }
+            else if (k) {
+                a[M + 1] = a[M];
+                if (has_values)
+                    v[M + 1] = v[M];
+            }
+            else
+                break;
+        }
+        a[M + 1] = pivot;
+        if (has_values)
+            v[M + 1] = vpivot;
+    }
+#else // binary insertion sort
+    Py_ssize_t L, R;
+    for (; ok < n; ++ok) {
+        /* set L to where a[ok] belongs */
+        L = 0;
+        R = ok;
+        pivot = a[ok];
+        /* Slice invariants. Vacuously true at the start:
+         * all a[0:L] <= pivot
+         * all a[L:R] unknown
+         * all a[R:ok] > pivot
          */
-        assert(l < r);
+        assert(L < R);
         do {
-            p = l + ((r - l) >> 1);
-            IFLT(pivot, *p)
-                r = p;
+            /* don't do silly ;-) things to prevent overflow when finding
+               the midpoint; L and R are very far from filling a Py_ssize_t */
+            M = (L + R) >> 1;
+#if 1 // straightforward, but highly unpredictable branch on random data
+            IFLT(pivot, a[M])
+                R = M;
             else
-                l = p + 1;
-        } while (l < r);
-        assert(l == r);
-        /* The invariants still hold, so pivot >= all in [lo.keys, l) and
-           pivot < all in [l, start), so pivot belongs at l. Note
-           that if there are elements equal to pivot, l points to the
-           first slot after them -- that's why this sort is stable.
-           Slide over to make room.
-           Caution: using memmove is much slower under MSVC 5;
-           we're not usually moving many slots. */
-        for (p = start; p > l; --p)
-            *p = *(p-1);
-        *l = pivot;
-        if (lo.values != NULL) {
-            Py_ssize_t offset = lo.values - lo.keys;
-            p = start + offset;
-            pivot = *p;
-            l += offset;
-            for ( ; p > l; --p)
-                *p = *(p-1);
-            *l = pivot;
+                L = M + 1;
+#else
+            /* Try to get compiler to generate conditional move instructions
+               instead. Works fine, but leaving it disabled for now because
+               it's not yielding consistently faster sorts. Needs more
+               investigation. More computation in the inner loop adds its own
+               costs, which can be significant when compares are fast. */
+            k = ISLT(pivot, a[M]);
+            if (k < 0)
+                goto fail;
+            Py_ssize_t Mp1 = M + 1;
+            R = k ? M : R;
+            L = k ? L : Mp1;
+#endif
+        } while (L < R);
+        assert(L == R);
+        /* a[:L] holds all elements from a[:ok] <= pivot now, so pivot belongs
+           at index L. Slide a[L:ok] to the right a slot to make room for it.
+           Caution: using memmove is much slower under MSVC 5; we're not
+           usually moving many slots. Years later: under Visual Studio 2022,
+           memmove seems just slightly slower than doing it "by hand". */
+        for (M = ok; M > L; --M)
+            a[M] = a[M - 1];
+        a[L] = pivot;
+        if (has_values) {
+            pivot = v[ok];
+            for (M = ok; M > L; --M)
+                v[M] = v[M - 1];
+            v[L] = pivot;
         }
     }
+#endif // pick binary or regular insertion sort
     return 0;

  fail:
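The slice invariants above are easier to see with the CPython machinery stripped away. A minimal standalone sketch of the same stable binary-insertion step, assuming plain C ints in place of PyObject pointers and omitting ISLT error handling and the lockstep values array (binary_insertion_sort is an illustrative name, not a CPython function):

    #include <stddef.h>
    #include <stdio.h>

    static void binary_insertion_sort(int *a, size_t n)  /* illustrative sketch */
    {
        for (size_t ok = 1; ok < n; ++ok) {
            int pivot = a[ok];
            size_t L = 0, R = ok;
            while (L < R) {
                size_t M = (L + R) >> 1;
                if (pivot < a[M])
                    R = M;          /* pivot belongs strictly left of M */
                else
                    L = M + 1;      /* equal keys push pivot to the right */
            }
            for (size_t i = ok; i > L; --i)  /* slide a[L:ok] right one slot */
                a[i] = a[i - 1];
            a[L] = pivot;
        }
    }

    int main(void)
    {
        int a[] = {5, 2, 9, 2, 1};
        binary_insertion_sort(a, sizeof a / sizeof a[0]);
        for (size_t i = 0; i < 5; ++i)
            printf("%d ", a[i]);    /* prints: 1 2 2 5 9 */
        printf("\n");
        return 0;
    }

Because the search narrows with "pivot < a[M]", L lands just past any keys equal to pivot, so equal keys keep their original order; that is the stability argument the invariant comments rely on.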
@@ -2559,10 +2633,10 @@ merge_force_collapse(MergeState *ms)
 /* Compute a good value for the minimum run length; natural runs shorter
  * than this are boosted artificially via binary insertion.
  *
- * If n < 64, return n (it's too small to bother with fancy stuff).
- * Else if n is an exact power of 2, return 32.
- * Else return an int k, 32 <= k <= 64, such that n/k is close to, but
- * strictly less than, an exact power of 2.
+ * If n < MAX_MINRUN, return n (it's too small to bother with fancy stuff).
+ * Else if n is an exact power of 2, return MAX_MINRUN / 2.
+ * Else return an int k, MAX_MINRUN / 2 <= k <= MAX_MINRUN, such that n/k is
+ * close to, but strictly less than, an exact power of 2.
  *
  * See listsort.txt for more info.
  */
@@ -2572,7 +2646,7 @@ merge_compute_minrun(Py_ssize_t n)
     Py_ssize_t r = 0;           /* becomes 1 if any 1 bits are shifted off */

     assert(n >= 0);
-    while (n >= 64) {
+    while (n >= MAX_MINRUN) {
         r |= n & 1;
         n >>= 1;
     }
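A worked example with MAX_MINRUN = 64: n = 2112 is binary 100001000000, so six shifts bring it down to 33 with no 1 bits lost, r stays 0, and n + r = 33 is returned; 2112/33 = 64 runs, an exact power of 2, which is the balanced shape the later merges want. A self-contained sketch of the same computation (a standalone copy for illustration, with Py_ssize_t replaced by ptrdiff_t so it builds outside CPython):

    #include <assert.h>
    #include <stddef.h>

    #define MAX_MINRUN 64

    static ptrdiff_t merge_compute_minrun(ptrdiff_t n)  /* standalone copy */
    {
        ptrdiff_t r = 0;        /* becomes 1 if any 1 bits are shifted off */
        assert(n >= 0);
        while (n >= MAX_MINRUN) {
            r |= n & 1;
            n >>= 1;
        }
        return n + r;
    }

    int main(void)
    {
        assert(merge_compute_minrun(50) == 50);    /* n < MAX_MINRUN: returned as-is */
        assert(merge_compute_minrun(2048) == 32);  /* exact power of 2: MAX_MINRUN / 2 */
        assert(merge_compute_minrun(2112) == 33);  /* 2112 / 33 == 64, a power of 2 */
        return 0;
    }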
@@ -2956,7 +3030,7 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse)
         if (n < minrun) {
             const Py_ssize_t force = nremaining <= minrun ?
                               nremaining : minrun;
-            if (binarysort(&ms, lo, lo.keys + force, lo.keys + n) < 0)
+            if (binarysort(&ms, &lo, force, n) < 0)
                 goto fail;
             n = force;
         }