diff: track total cost of search and bail if high

kov · kov · commit 79e0bcc404e4 · 2024-10-31T21:37:03.000-03:00
This is the last piece of the puzzle to get somewhat comparable to GNU
diff performance without implementing all of its tricks - although this
one is also used by GNU diff, in its own way. It brings down a diff
which still takes over a minute with the previous commit to under a
second.

  Benchmark 1: diff test-data/b.cpp test-data/c.cpp
    Time (mean ± σ):      2.533 s ±  0.011 s    [User: 2.494 s, System: 0.027 s]
    Range (min … max):    2.519 s …  2.553 s    10 runs

    Warning: Ignoring non-zero exit code.

  Benchmark 2: ./target/release/diffutils.local-heuristics diff test-data/b.cpp test-data/c.cpp
    Time (mean ± σ):     65.798 s ±  1.080 s    [User: 65.367 s, System: 0.053 s]
    Range (min … max):   64.962 s … 68.137 s    10 runs

    Warning: Ignoring non-zero exit code.

  Benchmark 3: ./target/release/diffutils diff test-data/b.cpp test-data/c.cpp
    Time (mean ± σ):     580.6 ms ±   6.5 ms    [User: 521.9 ms, System: 38.8 ms]
    Range (min … max):   570.7 ms … 589.6 ms    10 runs

    Warning: Ignoring non-zero exit code.

  Summary
    ./target/release/diffutils diff test-data/b.cpp test-data/c.cpp ran
      4.36 ± 0.05 times faster than diff test-data/b.cpp test-data/c.cpp
    113.33 ± 2.26 times faster than ./target/release/diffutils.local-heuristics diff test-data/b.cpp test-data/c.cpp

This change keeps track of how much work we have done overall for a
diff job, and gives up completely on trying to find ideal split points
once the accumulated cost implies we have had to trigger the "too
expensive" heuristic too often.

From that point forward it only does naive splitting of the work, at
randomly chosen points. This should not generate diffs which are much
worse than doing the diagonal search, as it should only trigger in cases
in which the files are so different that the search would not find good
split points anyway.

This is another case in which GNU diff's additional work (hashing,
splitting large chunks of insertions / deletions out of the diff work,
and trying harder to find ideal splits) seems to cause it to perform
slightly worse, as the benchmark above shows.

That said, GNU diff probably still generates better diffs not due to
this, but due to its post-processing of the results, trying to create
more hunks with nearby changes staying close to each other, which we
do not do (but we didn't do that before anyway).
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -22,6 +22,7 @@ same-file = "1.0.6"
 unicode-width = "0.2.0"
 tracing = "0.1.40"
 tracing-subscriber = { version = "0.3", features = ["env-filter"] }
+rand = "0.8.5"
 
 [dev-dependencies]
 pretty_assertions = "1.4.0"
diff --git a/src/engine.rs b/src/engine.rs
@@ -6,6 +6,7 @@
 use std::fmt::Debug;
 use std::ops::{Index, IndexMut, RangeInclusive};
 
+use rand::Rng as _;
 use tracing::{info, instrument, trace, Level};
 
 #[derive(Debug, Default, PartialEq)]
@@ -43,12 +44,44 @@ impl Snake {
 fn find_split_point<T: Clone + Debug + PartialEq + Into<Vec<u8>>>(
     left: &[T],
     right: &[T],
+    total_cost: &mut usize,
 ) -> Snake {
     let left_length = left.len() as isize;
     let right_length = right.len() as isize;
 
     let max_cost = left_length + right_length;
 
+    // This constant is the value used by GNU diff; using it should give us
+    // more similar diffs.
+    const HIGH_COST: isize = 200;
+
+    // This magic number was borrowed from GNU diff - apparently this is a
+    // good number for modern CPUs.
+    let too_expensive: isize = ((max_cost as f64).sqrt() as isize).max(4096);
+    info!(too_expensive = too_expensive);
+
+    // We've been constantly hitting the too expensive heuristic, this means the
+    // files are too different for us to get a good diff in reasonable amount of
+    // time. Do naive splits from now on.
+    if *total_cost as isize > too_expensive * 10 {
+        info!(
+            total_cost = total_cost,
+            "hit too costly overall heuristic, creating naive split"
+        );
+        let mut rng = rand::thread_rng();
+        let x = if left_length == 0 {
+            0
+        } else {
+            rng.gen_range(0..left.len())
+        };
+        let y = if right_length == 0 {
+            0
+        } else {
+            rng.gen_range(0..right.len())
+        };
+        return Snake { x, y, length: 0 };
+    }
+
     // For collections of different sizes, the diagonals will not neatly balance. That means the
     // "middle" diagonal for the backwards search will be offset from the forward one, so we need
     // to keep track of that so we start at the right point.
@@ -87,20 +120,13 @@ fn find_split_point<T: Clone + Debug + PartialEq + Into<Vec<u8>>>(
         x >= offset && y >= offset && x < left_length + offset && y < right_length + offset
     };
 
-    // This constant is the value used by GNU diff; using it should give us
-    // more similar diffs.
-    const HIGH_COST: isize = 200;
-
-    // This magic number was borrowed from GNU diff - apparently this is a
-    // good number for modern CPUs.
-    let too_expensive: isize = ((max_cost as f64).sqrt() as isize).max(4096);
-    info!(too_expensive = too_expensive);
-
     let mut best_snake = Snake::default();
 
     let forward_span = tracing::span!(Level::TRACE, "forward");
     let backward_span = tracing::span!(Level::TRACE, "backward");
     'outer: for c in 1..max_cost {
+        *total_cost += 1;
+
         info!(c = c, snake_length = best_snake.length);
         // The files appear to be large and too different. Go for good enough
         if c > too_expensive {
@@ -253,7 +279,8 @@ pub fn diff<'a, T: Clone + Debug + PartialEq + Into<Vec<u8>>>(
 ) -> Vec<Edit<'a, T>> {
     trace!(left_length = left.len(), right_length = right.len());
     let mut edits = vec![];
-    do_diff(left, right, &mut edits);
+    let mut total_cost = 0;
+    do_diff(left, right, &mut edits, &mut total_cost);
     edits
 }
 
@@ -262,6 +289,7 @@ fn do_diff<'a, T: Clone + Debug + PartialEq + Into<Vec<u8>>>(
     left: &'a [T],
     right: &'a [T],
     edits: &mut Vec<Edit<'a, T>>,
+    total_cost: &mut usize,
 ) {
     if left.is_empty() {
         right.iter().for_each(|r| edits.push(Edit::Insert(r)));
@@ -296,7 +324,7 @@ fn do_diff<'a, T: Clone + Debug + PartialEq + Into<Vec<u8>>>(
     let left_remaining = &left[leading_matches..left.len() - trailing_matches];
     let right_remaining = &right[leading_matches..right.len() - trailing_matches];
 
-    let snake = find_split_point(left_remaining, right_remaining);
+    let snake = find_split_point(left_remaining, right_remaining, total_cost);
 
     trace!(x = snake.x, y = snake.y, length = snake.length, "snake");
 
@@ -321,8 +349,8 @@ fn do_diff<'a, T: Clone + Debug + PartialEq + Into<Vec<u8>>>(
             "split"
         );
 
-        do_diff(l1, r1, edits);
-        do_diff(l2, r2, edits);
+        do_diff(l1, r1, edits, total_cost);
+        do_diff(l2, r2, edits, total_cost);
     }
 
     // Finally add the trailing matches.