Skip to content

Commit 79e0bcc

Browse files
committed
diff: track total cost of search and bail if high
This is the last piece of the puzzle to get performance somewhat comparable to GNU diff without implementing all of its tricks - although this one is also used by GNU diff, in its own way. It brings down a diff which still takes over a minute with the previous commit to under a second. Benchmark 1: diff test-data/b.cpp test-data/c.cpp Time (mean ± σ): 2.533 s ± 0.011 s [User: 2.494 s, System: 0.027 s] Range (min … max): 2.519 s … 2.553 s 10 runs Warning: Ignoring non-zero exit code. Benchmark 2: ./target/release/diffutils.local-heuristics diff test-data/b.cpp test-data/c.cpp Time (mean ± σ): 65.798 s ± 1.080 s [User: 65.367 s, System: 0.053 s] Range (min … max): 64.962 s … 68.137 s 10 runs Warning: Ignoring non-zero exit code. Benchmark 3: ./target/release/diffutils diff test-data/b.cpp test-data/c.cpp Time (mean ± σ): 580.6 ms ± 6.5 ms [User: 521.9 ms, System: 38.8 ms] Range (min … max): 570.7 ms … 589.6 ms 10 runs Warning: Ignoring non-zero exit code. Summary ./target/release/diffutils diff test-data/b.cpp test-data/c.cpp ran 4.36 ± 0.05 times faster than diff test-data/b.cpp test-data/c.cpp 113.33 ± 2.26 times faster than ./target/release/diffutils.local-heuristics diff test-data/b.cpp test-data/c.cpp It basically keeps track of how much work we have done overall for a diff job and allows us to give up completely on trying to find ideal split points if the cost implies we had to trigger the "too expensive" heuristic too often. From that point forward it only does naive splitting of the work. This should not generate diffs which are much worse than doing the diagonal search, as it should only trigger in cases in which the files are so different it won't find good split points anyway. 
This is another case in which GNU diff's additional work - hashing, splitting large chunks of inclusion / deletion out of the diff work, and trying harder to find ideal splits - seems to cause it to perform slightly worse. That said, GNU diff probably still generates better diffs, not because of this but because of its post-processing of the results, which tries to create more hunks with nearby changes staying close to each other. We do not do that (but we didn't do it before either).
1 parent ed1bbfa commit 79e0bcc

File tree

3 files changed

+126
-13
lines changed

3 files changed

+126
-13
lines changed

Cargo.lock

Lines changed: 84 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ same-file = "1.0.6"
2222
unicode-width = "0.2.0"
2323
tracing = "0.1.40"
2424
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
25+
rand = "0.8.5"
2526

2627
[dev-dependencies]
2728
pretty_assertions = "1.4.0"

src/engine.rs

Lines changed: 41 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
use std::fmt::Debug;
77
use std::ops::{Index, IndexMut, RangeInclusive};
88

9+
use rand::Rng as _;
910
use tracing::{info, instrument, trace, Level};
1011

1112
#[derive(Debug, Default, PartialEq)]
@@ -43,12 +44,44 @@ impl Snake {
4344
fn find_split_point<T: Clone + Debug + PartialEq + Into<Vec<u8>>>(
4445
left: &[T],
4546
right: &[T],
47+
total_cost: &mut usize,
4648
) -> Snake {
4749
let left_length = left.len() as isize;
4850
let right_length = right.len() as isize;
4951

5052
let max_cost = left_length + right_length;
5153

54+
// This constant is the value used by GNU diff; using it should give us
55+
// more similar diffs.
56+
const HIGH_COST: isize = 200;
57+
58+
// This magic number was borrowed from GNU diff - apparently this is a
59+
// good number for modern CPUs.
60+
let too_expensive: isize = ((max_cost as f64).sqrt() as isize).max(4096);
61+
info!(too_expensive = too_expensive);
62+
63+
// We've been constantly hitting the too expensive heuristic, this means the
64+
// files are too different for us to get a good diff in a reasonable amount of
65+
// time. Do naive splits from now on.
66+
if *total_cost as isize > too_expensive * 10 {
67+
info!(
68+
total_cost = total_cost,
69+
"hit too costly overall heuristic, creating naive split"
70+
);
71+
let mut rng = rand::thread_rng();
72+
let x = if left_length == 0 {
73+
0
74+
} else {
75+
rng.gen_range(0..left.len())
76+
};
77+
let y = if right_length == 0 {
78+
0
79+
} else {
80+
rng.gen_range(0..right.len())
81+
};
82+
return Snake { x, y, length: 0 };
83+
}
84+
5285
// For collections of different sizes, the diagonals will not neatly balance. That means the
5386
// "middle" diagonal for the backwards search will be offset from the forward one, so we need
5487
// to keep track of that so we start at the right point.
@@ -87,20 +120,13 @@ fn find_split_point<T: Clone + Debug + PartialEq + Into<Vec<u8>>>(
87120
x >= offset && y >= offset && x < left_length + offset && y < right_length + offset
88121
};
89122

90-
// This constant is the value used by GNU diff; using it should give us
91-
// more similar diffs.
92-
const HIGH_COST: isize = 200;
93-
94-
// This magic number was borrowed from GNU diff - apparently this is a
95-
// good number for modern CPUs.
96-
let too_expensive: isize = ((max_cost as f64).sqrt() as isize).max(4096);
97-
info!(too_expensive = too_expensive);
98-
99123
let mut best_snake = Snake::default();
100124

101125
let forward_span = tracing::span!(Level::TRACE, "forward");
102126
let backward_span = tracing::span!(Level::TRACE, "backward");
103127
'outer: for c in 1..max_cost {
128+
*total_cost += 1;
129+
104130
info!(c = c, snake_length = best_snake.length);
105131
// The files appear to be large and too different. Go for good enough
106132
if c > too_expensive {
@@ -253,7 +279,8 @@ pub fn diff<'a, T: Clone + Debug + PartialEq + Into<Vec<u8>>>(
253279
) -> Vec<Edit<'a, T>> {
254280
trace!(left_length = left.len(), right_length = right.len());
255281
let mut edits = vec![];
256-
do_diff(left, right, &mut edits);
282+
let mut total_cost = 0;
283+
do_diff(left, right, &mut edits, &mut total_cost);
257284
edits
258285
}
259286

@@ -262,6 +289,7 @@ fn do_diff<'a, T: Clone + Debug + PartialEq + Into<Vec<u8>>>(
262289
left: &'a [T],
263290
right: &'a [T],
264291
edits: &mut Vec<Edit<'a, T>>,
292+
total_cost: &mut usize,
265293
) {
266294
if left.is_empty() {
267295
right.iter().for_each(|r| edits.push(Edit::Insert(r)));
@@ -296,7 +324,7 @@ fn do_diff<'a, T: Clone + Debug + PartialEq + Into<Vec<u8>>>(
296324
let left_remaining = &left[leading_matches..left.len() - trailing_matches];
297325
let right_remaining = &right[leading_matches..right.len() - trailing_matches];
298326

299-
let snake = find_split_point(left_remaining, right_remaining);
327+
let snake = find_split_point(left_remaining, right_remaining, total_cost);
300328

301329
trace!(x = snake.x, y = snake.y, length = snake.length, "snake");
302330

@@ -321,8 +349,8 @@ fn do_diff<'a, T: Clone + Debug + PartialEq + Into<Vec<u8>>>(
321349
"split"
322350
);
323351

324-
do_diff(l1, r1, edits);
325-
do_diff(l2, r2, edits);
352+
do_diff(l1, r1, edits, total_cost);
353+
do_diff(l2, r2, edits, total_cost);
326354
}
327355

328356
// Finally add the trailing matches.

0 commit comments

Comments
 (0)