timsort
diff --git a/‎README.md‎
Lines changed: 53 additions & 4 deletions b/‎README.md‎
Lines changed: 53 additions & 4 deletions
diff --git a/‎benchmarks/CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion b/‎benchmarks/CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎benchmarks/bench.cpp‎
Lines changed: 0 additions & 126 deletions b/‎benchmarks/bench.cpp‎
Lines changed: 0 additions & 126 deletions
diff --git a/‎benchmarks/bench_merge.cpp‎
Lines changed: 130 additions & 0 deletions b/‎benchmarks/bench_merge.cpp‎
Lines changed: 130 additions & 0 deletions
@@ -26,10 +26,19 @@ can't fallback to a O(n log² n) algorithm when there isn't enough extra heap me
   type such as `void`.
 
 
+Merging sorted ranges efficiently is an important part of the TimSort algorithm. This library exposes its merge
+algorithm in the public API. According to the benchmarks, `gfx::timmerge` is slower than `std::inplace_merge` on
+heavily/randomly overlapping subranges of simple elements, but it is faster for complex elements such as `std::string`
+and on sparsely overlapping subranges. `gfx::timmerge` should be usable as a drop-in replacement for
+`std::inplace_merge`, with the difference that it can't fall back to an O(n log n) algorithm when there isn't enough
+extra heap memory available. Like `gfx::timsort`, `gfx::timmerge` can take a projection function and avoids using the
+postfix `++` or `--` operators.
+
+
 The full list of available signatures is as follows (in namespace `gfx`):
 
 ```cpp
-// Overloads taking a pair of iterators
+// timsort overloads taking a pair of iterators
 
 template <typename RandomAccessIterator>
 void timsort(RandomAccessIterator const first, RandomAccessIterator const last);
@@ -42,7 +51,7 @@ template <typename RandomAccessIterator, typename Compare, typename Projection>
 void timsort(RandomAccessIterator const first, RandomAccessIterator const last,
              Compare compare, Projection projection);
 
-// Overloads taking a range
+// timsort overloads taking a range
 
 template <typename RandomAccessRange>
 void timsort(RandomAccessRange &range);
@@ -52,6 +61,20 @@ void timsort(RandomAccessRange &range, Compare compare);
 
 template <typename RandomAccessRange, typename Compare, typename Projection>
 void timsort(RandomAccessRange &range, Compare compare, Projection projection);
+
+// timmerge overloads
+
+template <typename RandomAccessIterator>
+void timmerge(RandomAccessIterator first, RandomAccessIterator middle,
+              RandomAccessIterator last);
+
+template <typename RandomAccessIterator, typename Compare>
+void timmerge(RandomAccessIterator first, RandomAccessIterator middle,
+              RandomAccessIterator last, Compare compare);
+
+template <typename RandomAccessIterator, typename Compare, typename Projection>
+void timmerge(RandomAccessIterator first, RandomAccessIterator middle,
+              RandomAccessIterator last, Compare compare, Projection projection);
 ```
 
 ## EXAMPLE
@@ -102,7 +125,7 @@ conan install timsort/2.0.2
 
 ## DIAGNOSTICS & INFORMATION
 
-A few configuration macros allow gfx::timsort to emit diagnostic, which might be helpful to diagnose issues:
+A few configuration macros allow `gfx::timsort` and `gfx::timmerge` to emit diagnostics, which might be helpful to diagnose issues:
 * Defining `GFX_TIMSORT_ENABLE_ASSERT` inserts assertions in key locations in the algorithm to avoid logic errors.
 * Defining `GFX_TIMSORT_ENABLE_LOG` inserts logs in key locations, which makes it possible to follow the flow of the algorithm more closely.
 
@@ -130,7 +153,7 @@ built with CMake:
 Benchmarks are available in the `benchmarks` subdirectory, and can be constructed directly by passing `BUILD_BENCHMARKS=ON`
 variable to CMake during the configuration step.
 
-Example output (timing scale: sec.):
+Example bench_sort output (timing scale: sec.):
 
     c++ -v
     Apple LLVM version 7.0.0 (clang-700.0.72)
@@ -171,3 +194,29 @@ Example output (timing scale: sec.):
     std::sort        0.402458
     std::stable_sort 2.436326
     timsort          0.298639
+
+Example bench_merge output (timing scale: milliseconds; omitted detailed results for different
+middle iterator positions, reformatted to improve readability):
+
+    c++ -v
+    Using built-in specs.
+    ...
+    Target: x86_64-pc-linux-gnu
+    ...
+    gcc version 10.2.0 (GCC)
+    c++ -I ../include -Wall -Wextra -g -DNDEBUG -O2 -std=c++11 bench_merge.cpp -o bench_merge
+    ./bench_merge
+    size	100000
+    element type\algorithm:      	std::inplace_merge	timmerge
+    RANDOMIZED SEQUENCE
+    [int] approx. average        	 33.404430        	 37.047990
+    [std::string] approx. average	324.964249        	210.297207
+    REVERSED SEQUENCE
+    [int] approx. average        	 11.441404        	  4.017482
+    [std::string] approx. average	305.649503        	114.773898
+    SORTED SEQUENCE
+    [int] approx. average        	  4.291098        	  0.105571
+    [std::string] approx. average	158.238114        	  0.273858
+
+Detailed bench_merge results for different middle iterator positions can be found at
+https://github.com/timsort/cpp-TimSort/wiki/Benchmark-results
@@ -1,5 +1,5 @@
 
-foreach(filename bench.cpp)
+foreach(filename bench_merge.cpp bench_sort.cpp)
     get_filename_component(name ${filename} NAME_WE)
     add_executable(${name} ${filename})
     target_link_libraries(${name} PRIVATE gfx::timsort)
 
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2021 Igor Kushnir <[email protected]>.
+ *
+ * SPDX-License-Identifier: MIT
+ */
+#include <cstdlib>
+#include <algorithm>
+#include <chrono>
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <valarray>
+#include <vector>
+#include <gfx/timsort.hpp>
+#include "benchmarker.hpp"
+
+namespace
+{
+    // Builds the list of split points ("middle" positions) at which a sequence of
+    // `size` elements is divided into two sorted runs before they are merged.
+    // The candidates cluster near both ends of the sequence and are spread over
+    // its interior.
+    //
+    // Returns the candidates that lie in [0, size], sorted in ascending order and
+    // without duplicates.
+    std::vector<int> generate_middle_positions(int size) {
+        // Computes numerator*size/denominator in 64-bit arithmetic: a product such as
+        // 90*size overflows int (undefined behavior) once size exceeds INT_MAX/90.
+        // Every fraction used below is smaller than 1, so the quotient fits in int.
+        const auto scaled = [size](long long numerator, long long denominator) {
+            return static_cast<int>(numerator * size / denominator);
+        };
+
+        std::vector<int> result = {
+            0, 1, 2, 5, 100, size/100, size/20, size/5, size/3, size/2, scaled(3, 4),
+            scaled(6, 7), scaled(24, 25), scaled(90, 91), size-85, size-8, size-2, size-1, size
+        };
+
+        // The code below can remove or reorder elements if size is small:
+        // e.g. size-85 is negative and the constant 100 exceeds size.
+
+        auto logical_end = std::remove_if(result.begin(), result.end(), [size](int middle) {
+            return middle < 0 || middle > size;
+        });
+        result.erase(logical_end, result.end());
+
+        std::sort(result.begin(), result.end());
+        logical_end = std::unique(result.begin(), result.end());
+        result.erase(logical_end, result.end());
+
+        return result;
+    }
+
+    // Timings of one measurement, one slot per benchmarked algorithm
+    // (Bench::run stores std::inplace_merge in [0] and timmerge in [1]).
+    using Result = std::valarray<double>;
+
+    // Returns a Result holding two zero-initialized slots.
+    Result zeroResult() { return Result(2); }
+}
+
+// Benchmark functor: for every middle position produced by
+// generate_middle_positions(), turns a copy of the input into two sorted runs
+// meeting at that position and times std::inplace_merge against gfx::timmerge.
+template <typename value_t>
+struct Bench {
+    // Writes one row of timings (milliseconds) per middle position to std::cerr,
+    // followed by an approximate average over all positions when `source` is
+    // not empty.
+    void operator()(const std::vector<value_t> &source) const {
+        constexpr int column_width = 10;
+        constexpr const char* column_gap = "        \t";
+
+        const int element_count = static_cast<int>(source.size());
+
+        std::cerr << "middle\\algorithm:\tstd::inplace_merge\ttimmerge" << std::endl;
+
+        // Running state for integrating the timings over the middle position.
+        auto integral = zeroResult();
+        auto last_timings = zeroResult();
+        int last_middle = 0;
+
+        std::vector<value_t> runs(source.size());
+        for (const int middle : generate_middle_positions(element_count)) {
+            // Rebuild the input: two independently sorted runs that meet at `middle`.
+            runs.assign(source.begin(), source.end());
+            std::sort(runs.begin(), runs.begin() + middle);
+            std::sort(runs.begin() + middle, runs.end());
+
+            const auto timings = run(runs, middle);
+
+            if (middle != last_middle) {
+                // Trapezoidal rule: area between the previous and the current
+                // sample, weighted by the distance between their positions.
+                integral += 0.5 * (timings + last_timings)
+                                * static_cast<double>(middle - last_middle);
+                last_middle = middle;
+            }
+            last_timings = timings;
+
+            std::cerr << std::setw(column_width) << middle
+                      << "       \t" << std::setw(column_width) << timings[0]
+                      << column_gap << std::setw(column_width) << timings[1]
+                      << std::endl;
+        }
+
+        if (element_count == 0) {
+            // Nothing to average over.
+            return;
+        }
+
+        // Dividing the integral by the length of the integration interval
+        // gives the approximate average.
+        integral /= static_cast<double>(element_count);
+        std::cerr << "approx. average"
+                  << "  \t" << std::setw(column_width) << integral[0]
+                  << column_gap << std::setw(column_width) << integral[1]
+                  << std::endl;
+    }
+
+private:
+    // Merges the sorted runs [0, middle) and [middle, a.size()) of a fresh copy
+    // of `a` 100 times with each algorithm, timing only the merge call itself.
+    // Returns the accumulated time in milliseconds: slot 0 for
+    // std::inplace_merge, slot 1 for gfx::timmerge.
+    // Aborts the program if a merge leaves the copy unsorted.
+    static Result run(const std::vector<value_t> &a, const int middle) {
+        using Clock = std::chrono::steady_clock;
+        constexpr int repetitions = 100;
+
+        std::vector<value_t> scratch(a.size());
+        auto totals_ms = zeroResult();
+
+        for (std::size_t algorithm = 0; algorithm < 2; ++algorithm) {
+            Clock::duration elapsed = Clock::duration::zero();
+
+            for (int repetition = 0; repetition != repetitions; ++repetition) {
+                // Restore the unmerged input outside of the timed region.
+                std::copy(a.begin(), a.end(), scratch.begin());
+
+                const auto started = Clock::now();
+                if (algorithm == 0) {
+                    std::inplace_merge(scratch.begin(), scratch.begin() + middle, scratch.end());
+                } else {
+                    gfx::timmerge(scratch.begin(), scratch.begin() + middle, scratch.end());
+                }
+                const auto finished = Clock::now();
+                elapsed += finished - started;
+
+                // Verifying that the merged range is sorted should prevent the
+                // compiler from optimizing anything out.
+                if (!std::is_sorted(scratch.cbegin(), scratch.cend())) {
+                    std::cerr << "Not sorted!" << std::endl;
+                    std::abort();
+                }
+            }
+
+            // Measured at microsecond granularity, reported in milliseconds.
+            const auto elapsed_us = std::chrono::duration_cast<std::chrono::microseconds>(elapsed);
+            totals_ms[algorithm] = elapsed_us.count() / 1000.0;
+        }
+        return totals_ms;
+    }
+};
+
+// Entry point. The optional first command-line argument is the number of
+// elements in each benchmarked sequence; it defaults to 100000.
+int main(int argc, const char *argv[]) {
+    int size = 100 * 1000;
+    if (argc > 1) {
+        size = std::stoi(argv[1]);
+    }
+
+    Benchmarker<Bench> benchmarker(size);
+    benchmarker.run();
+}