Skip to content

Commit ed1bbfa

Browse files
committed
diff: apply heuristics borrowed from GNU diff for "good enough"
This change adds some checks to decide that the search for the best place to split the diffing process has gone on for too long, or for long enough while having found a good chunk of matches. They are based on similar heuristics that GNU diff applies and will help in cases in which files are very long and have few common sequences. This brings the time to compare some large files (~36MB) that are very different down from ~1 hour to ~8 seconds, but it will still hit some pathological cases, such as some very large cpp files I created for some benchmarking that still take 1 minute. Benchmark 1: diff test-data/huge-base test-data/huge-very-different Time (mean ± σ): 2.790 s ± 0.005 s [User: 2.714 s, System: 0.063 s] Range (min … max): 2.781 s … 2.798 s 10 runs Warning: Ignoring non-zero exit code. Benchmark 2: ./target/release/diffutils.no-heuristics diff test-data/huge-base test-data/huge-very-different Time (mean ± σ): 4755.084 s ± 172.607 s [User: 4727.169 s, System: 0.330 s] Range (min … max): 4607.522 s … 5121.135 s 10 runs Warning: Ignoring non-zero exit code. Benchmark 3: ./target/release/diffutils diff test-data/huge-base test-data/huge-very-different Time (mean ± σ): 7.197 s ± 0.099 s [User: 7.055 s, System: 0.094 s] Range (min … max): 7.143 s … 7.416 s 10 runs Warning: Ignoring non-zero exit code. Warning: Statistical outliers were detected. Consider re-running this benchmark on a quiet system without any interferences from other programs. It might help to use the '--warmup' or '--prepare' options. Summary diff test-data/huge-base test-data/huge-very-different ran 2.58 ± 0.04 times faster than ./target/release/diffutils diff test-data/huge-base test-data/huge-very-different 1704.04 ± 61.93 times faster than ./target/release/diffutils.no-heuristics diff test-data/huge-base test-data/huge-very-different Note that the worst that should happen when the heuristics cause the search to end early is a suboptimal diff, but the diff will still be correct and usable with patch.
1 parent 74d2fad commit ed1bbfa

File tree

1 file changed

+50
-1
lines changed

1 file changed

+50
-1
lines changed

src/engine.rs

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,11 @@ struct Snake {
1616
}
1717

1818
impl Snake {
19+
/// Reports whether this snake is long enough to count as "good" for the
/// early-exit heuristic: returns `true` only when `self.length` is strictly
/// greater than 20.
fn is_good(&self) -> bool {
20+
// This magic number comes from GNU diff.
21+
self.length > 20
22+
}
23+
1924
fn maybe_update(&mut self, x: isize, y: isize, length: isize) {
2025
let length = length as usize;
2126
if length > self.length {
@@ -82,11 +87,26 @@ fn find_split_point<T: Clone + Debug + PartialEq + Into<Vec<u8>>>(
8287
x >= offset && y >= offset && x < left_length + offset && y < right_length + offset
8388
};
8489

90+
// This constant is the value used by GNU diff; using it should give us
91+
// more similar diffs.
92+
const HIGH_COST: isize = 200;
93+
94+
// This magic number was borrowed from GNU diff - apparently this is a
95+
// good number for modern CPUs.
96+
let too_expensive: isize = ((max_cost as f64).sqrt() as isize).max(4096);
97+
info!(too_expensive = too_expensive);
98+
8599
let mut best_snake = Snake::default();
86100

87101
let forward_span = tracing::span!(Level::TRACE, "forward");
88102
let backward_span = tracing::span!(Level::TRACE, "backward");
89-
'outer: for _ in 1..max_cost {
103+
'outer: for c in 1..max_cost {
104+
info!(c = c, snake_length = best_snake.length);
105+
// The files appear to be large and too different. Go for good enough
106+
if c > too_expensive {
107+
break 'outer;
108+
}
109+
90110
// Forwards search
91111
forward_diagonals.expand_search();
92112
let fwd = forward_span.enter();
@@ -192,7 +212,21 @@ fn find_split_point<T: Clone + Debug + PartialEq + Into<Vec<u8>>>(
192212
}
193213
}
194214
drop(bwd);
215+
216+
if c > HIGH_COST && best_snake.is_good() {
217+
info!("met criteria for high cost with good snake heuristic");
218+
break 'outer;
219+
}
220+
}
221+
222+
// If we hit this condition, the search ran too long and found 0 matches.
223+
// Get the best we can do as a split point - furthest diagonal.
224+
if best_snake.length == 0 {
225+
let (x, y) = forward_diagonals.get_furthest_progress();
226+
best_snake.x = x;
227+
best_snake.y = y;
195228
}
229+
196230
info!(
197231
x = best_snake.x,
198232
y = best_snake.y,
@@ -355,6 +389,21 @@ impl Diagonals {
355389
actual >= 0 && (actual as usize) < self.data.len()
356390
}
357391

392+
/// Scans `self.data` and returns an `(x, y)` pair derived from the entry
/// holding the largest stored value.
///
/// Each stored value `x` is paired with its position `d` in `self.data`.
/// Entries for which `x - d` would be negative are ignored. Among the
/// remaining entries the one with the greatest `x` is selected and `y` is
/// computed as `x - d`. When no entry qualifies, `(0, 0)` is returned.
///
/// NOTE(review): `d` is the raw index produced by `enumerate()`. Another
/// method in this impl bounds-checks an `actual` index against
/// `self.data.len()`, which suggests diagonal numbers are translated before
/// indexing - confirm that the raw index is the intended diagonal here.
fn get_furthest_progress(&self) -> (usize, usize) {
393+
let (d, x) = self
394+
.data
395+
.iter()
396+
.enumerate()
397+
// Keep only entries whose derived `y` (`x - d`) is non-negative.
.filter(|(d, &x)| x - (*d as isize) >= 0)
398+
// Pick the entry with the largest `x`; on ties `max_by_key` keeps the last one.
.max_by_key(|(_, &x)| x)
399+
.map(|(i, x)| (i as isize, *x))
400+
// No qualifying entry: fall back to the origin.
.unwrap_or((0isize, 0isize));
401+
let y = x - d;
402+
// Both coordinates are cast to `usize` below, so they must be non-negative.
debug_assert!(x >= 0);
403+
debug_assert!(y >= 0);
404+
(x as usize, y as usize)
405+
}
406+
358407
fn expand_search(&mut self) {
359408
let upper = if *self.search_range.end() == self.max_diag {
360409
self.search_range.end() - 1

0 commit comments

Comments
 (0)