Skip to content

Commit 64e1ae2

Browse files
committed
diff: remove redundant hash map lookup of uncommon shared words
1 parent 5c52b4e commit 64e1ae2

File tree

1 file changed

+23
-31
lines changed

1 file changed

+23
-31
lines changed

lib/src/diff.rs

Lines changed: 23 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -122,12 +122,13 @@ impl<'input> Histogram<'input> {
122122
Histogram { word_to_positions }
123123
}
124124

125-
fn build_count_to_words(&self) -> BTreeMap<usize, Vec<&'input BStr>> {
126-
let mut count_to_words: BTreeMap<usize, Vec<&BStr>> = BTreeMap::new();
127-
for (word, ranges) in &self.word_to_positions {
128-
count_to_words.entry(ranges.len()).or_default().push(word);
125+
fn build_count_to_entries(&self) -> BTreeMap<usize, Vec<(&'input BStr, &Vec<WordPosition>)>> {
126+
let mut count_to_entries: BTreeMap<usize, Vec<_>> = BTreeMap::new();
127+
for (word, positions) in &self.word_to_positions {
128+
let entries = count_to_entries.entry(positions.len()).or_default();
129+
entries.push((*word, positions));
129130
}
130-
count_to_words
131+
count_to_entries
131132
}
132133
}
133134

@@ -233,47 +234,38 @@ fn unchanged_ranges_lcs(
233234
) -> Vec<(Range<usize>, Range<usize>)> {
234235
let max_occurrences = 100;
235236
let left_histogram = Histogram::calculate(left, max_occurrences);
236-
let left_count_to_words = left_histogram.build_count_to_words();
237-
if *left_count_to_words.keys().next().unwrap() > max_occurrences {
237+
let left_count_to_entries = left_histogram.build_count_to_entries();
238+
if *left_count_to_entries.keys().next().unwrap() > max_occurrences {
238239
// If there are very many occurrences of all words, then we just give up.
239240
return vec![];
240241
}
241242
let right_histogram = Histogram::calculate(right, max_occurrences);
242243
// Look for words with few occurrences in `left` (could equally well have picked
243244
// `right`?). If any of them also occur in `right`, then we add the words to
244245
// the LCS.
245-
let Some(uncommon_shared_words) = left_count_to_words
246-
.iter()
247-
.map(|(left_count, left_words)| -> Vec<&BStr> {
248-
left_words
246+
let Some(uncommon_shared_word_positions) =
247+
left_count_to_entries.values().find_map(|left_entries| {
248+
let mut both_positions = left_entries
249249
.iter()
250-
.copied()
251-
.filter(|left_word| {
252-
let right_count = right_histogram
253-
.word_to_positions
254-
.get(left_word)
255-
.map_or(0, |right_positions| right_positions.len());
256-
*left_count == right_count
250+
.filter_map(|&(word, left_positions)| {
251+
let right_positions = right_histogram.word_to_positions.get(word)?;
252+
(left_positions.len() == right_positions.len())
253+
.then_some((left_positions, right_positions))
257254
})
258-
.collect()
255+
.peekable();
256+
both_positions.peek().is_some().then_some(both_positions)
259257
})
260-
.find(|words| !words.is_empty())
261258
else {
262259
return vec![];
263260
};
264261

265262
// [(index into ranges, serial to identify {word, occurrence #})]
266-
let (mut left_positions, mut right_positions): (Vec<_>, Vec<_>) = uncommon_shared_words
267-
.iter()
268-
.flat_map(|word| {
269-
let left_occurrences = &left_histogram.word_to_positions[word];
270-
let right_occurrences = &right_histogram.word_to_positions[word];
271-
assert_eq!(left_occurrences.len(), right_occurrences.len());
272-
iter::zip(left_occurrences, right_occurrences)
273-
})
274-
.enumerate()
275-
.map(|(serial, (&left_pos, &right_pos))| ((left_pos, serial), (right_pos, serial)))
276-
.unzip();
263+
let (mut left_positions, mut right_positions): (Vec<_>, Vec<_>) =
264+
uncommon_shared_word_positions
265+
.flat_map(|(lefts, rights)| iter::zip(lefts, rights))
266+
.enumerate()
267+
.map(|(serial, (&left_pos, &right_pos))| ((left_pos, serial), (right_pos, serial)))
268+
.unzip();
277269
left_positions.sort_unstable_by_key(|&(pos, _serial)| pos);
278270
right_positions.sort_unstable_by_key(|&(pos, _serial)| pos);
279271
let left_index_by_right_index: Vec<usize> = {

0 commit comments

Comments
 (0)