Skip to content

Commit 707b1ff

Browse files
committed
feat(view): use word-level tokenization for intra-line diff
Replace the character-level Myers diff tokenizer in `intra_line_changes` with a word-level one. Alphanumeric+underscore runs are one token; whitespace and punctuation stay as single-char tokens. With character-level diff, substituting one word marks several small fragments as changed. When the theme defines `diff.delta.text` with an explicit background, that overlay covers most of the line and hides the line-level diff color. Word granularity limits the overlay to the changed word(s) only. `word_token_char_starts` precomputes char offsets per token so token-index hunks from imara-diff can be mapped back to char-column ranges.
1 parent d3887c3 commit 707b1ff

1 file changed

Lines changed: 149 additions & 24 deletions

File tree

helix-view/src/diff_session.rs

Lines changed: 149 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -23,21 +23,85 @@ impl<'a> imara_diff::TokenSource for RopeLines<'a> {
2323
}
2424
}
2525

26-
/// A `TokenSource` that yields individual chars, giving char-level token indices
27-
/// in imara-diff hunks. Used by `intra_line_changes` for per-character diff.
28-
struct CharSlice<'a>(&'a [char]);
26+
/// Iterator that yields word-level tokens from a string slice.
///
/// Alphanumeric + underscore runs are emitted as a single token; all other chars
/// (whitespace, punctuation, newlines) are emitted individually. This gives
/// coarser intra-line diffs than char-level: only whole words are marked as changed.
struct WordTokenIter<'a> {
    // Full hunk text being tokenized.
    text: &'a str,
    // Byte offset of the next token; always lies on a char boundary because it
    // only ever advances by whole tokens.
    byte_pos: usize,
}

impl<'a> Iterator for WordTokenIter<'a> {
    type Item = &'a str;

    fn next(&mut self) -> Option<Self::Item> {
        // `get` yields `Some("")` at end-of-text; the empty remainder then has
        // no first char, so iteration terminates via the second `?`.
        let rest = self.text.get(self.byte_pos..)?;
        let first = rest.chars().next()?;
        // A word token is the maximal leading run of alphanumeric/underscore
        // chars; any other char is a single-char token. `find` returns the
        // byte index of the first non-word char, i.e. the token's end.
        let end_byte = if first.is_alphanumeric() || first == '_' {
            rest.find(|c: char| !c.is_alphanumeric() && c != '_')
                .unwrap_or(rest.len())
        } else {
            first.len_utf8()
        };
        self.byte_pos += end_byte;
        Some(&rest[..end_byte])
    }
}
2958

30-
impl<'a> TokenSource for CharSlice<'a> {
31-
type Token = char;
32-
type Tokenizer = std::iter::Copied<std::slice::Iter<'a, char>>;
59+
/// A `TokenSource` that yields word-level tokens from a `&str`.
/// Used by `intra_line_changes` so the Myers diff operates on whole words
/// rather than individual characters.
struct WordSlice<'a>(&'a str);

impl<'a> TokenSource for WordSlice<'a> {
    type Token = &'a str;
    type Tokenizer = WordTokenIter<'a>;

    /// Tokenize the whole slice from the start; `WordTokenIter` yields word
    /// runs and single non-word chars until the text is exhausted.
    fn tokenize(&self) -> Self::Tokenizer {
        WordTokenIter {
            text: self.0,
            byte_pos: 0,
        }
    }

    /// Rough token-count hint (presumably used by imara-diff to preallocate
    /// its interner): assume ~4 bytes per token, never reporting zero.
    // NOTE(review): the `usize` -> `u32` cast truncates for inputs longer than
    // ~16 GiB; harmless for an estimate over hunk-sized text, but worth noting.
    fn estimate_tokens(&self) -> u32 {
        (self.0.len() / 4).max(1) as u32
    }
}
79+
80+
/// Returns the char-index (within `text`) at which each word token starts.
/// `result[i]` is the char start of token `i`; `result.len()` equals the token count.
/// If `tok < result.len()`, the token starts at `result[tok]` and ends at
/// `result[tok + 1]` (or at `text.chars().count()` for the last token).
///
/// Must stay in lockstep with `WordTokenIter`'s rules: alphanumeric/underscore
/// runs form one token, every other char is its own token.
fn word_token_char_starts(text: &str) -> Vec<usize> {
    let mut starts = Vec::new();
    let mut char_pos = 0usize;
    let mut byte_pos = 0usize;
    while byte_pos < text.len() {
        starts.push(char_pos);
        let rest = &text[byte_pos..];
        // `byte_pos` only advances by whole tokens, so it is always on a char
        // boundary and the remainder is non-empty here.
        let first = rest.chars().next().expect("byte_pos lies on a char boundary");
        let end_byte = if first.is_alphanumeric() || first == '_' {
            // Word token: extends to the first non-word char (or end of text).
            rest.find(|c: char| !c.is_alphanumeric() && c != '_')
                .unwrap_or(rest.len())
        } else {
            // Non-word chars are single-char tokens.
            first.len_utf8()
        };
        char_pos += rest[..end_byte].chars().count();
        byte_pos += end_byte;
    }
    starts
}
42106

43107
/// A diff session pairs two views for side-by-side diff comparison.
@@ -52,7 +116,7 @@ pub struct DiffSession {
52116
/// Shared hunk list. Stored as Arc so callers can take a cheap reference-counted
53117
/// snapshot for closures and annotations without cloning the full Vec each frame.
54118
hunks: Arc<Vec<Hunk>>,
55-
/// Character-level diff results cached per hunk, parallel to `hunks`.
119+
/// Word-level intra-line diff results cached per hunk, parallel to `hunks`.
56120
/// Stored as Arc so render closures can take a cheap snapshot without cloning.
57121
/// Populated in `compute_hunks` and replaced on each recomputation.
58122
/// Pure insertions/deletions store empty vecs (no character diff needed).
@@ -324,8 +388,10 @@ impl InlineChange {
324388
}
325389
}
326390

327-
/// Compute character-level diff for a single hunk.
328-
/// Returns per-line column ranges for each side indicating which characters changed.
391+
/// Compute word-level intra-line diff for a single hunk.
392+
/// Returns per-line column ranges for each side indicating which words (or
393+
/// non-word chars) changed. Whole alphanumeric+underscore runs are treated as
394+
/// one token, so only the changed words are highlighted rather than individual chars.
329395
pub fn intra_line_changes(
330396
rope_a: &Rope,
331397
rope_b: &Rope,
@@ -339,14 +405,24 @@ pub fn intra_line_changes(
339405
let text_a: String = rope_a.slice(a_start..a_end).into();
340406
let text_b: String = rope_b.slice(b_start..b_end).into();
341407

342-
// Use char-level tokenization so hunk offsets are char indices, not line indices.
343-
let chars_a: Vec<char> = text_a.chars().collect();
344-
let chars_b: Vec<char> = text_b.chars().collect();
345-
let input = InternedInput::new(CharSlice(&chars_a), CharSlice(&chars_b));
408+
// Precompute char-start of each word token so token indices can be mapped back
409+
// to char-column ranges after the Myers diff produces token-level hunks.
410+
let tok_starts_a = word_token_char_starts(&text_a);
411+
let tok_starts_b = word_token_char_starts(&text_b);
412+
let total_chars_a = text_a.chars().count();
413+
let total_chars_b = text_b.chars().count();
414+
415+
let input = InternedInput::new(WordSlice(&text_a), WordSlice(&text_b));
346416
let diff = Diff::compute(Algorithm::Myers, &input);
347417

348-
let char_to_line_col = |base_char: usize, offset: u32, rope: &Rope| -> (usize, usize) {
349-
let char_idx = base_char + offset as usize;
418+
// Convert a token index into a char offset within the hunk text.
419+
// `tok` == number of tokens means "end of text".
420+
let tok_to_char = |starts: &[usize], total: usize, tok: u32| -> usize {
421+
*starts.get(tok as usize).unwrap_or(&total)
422+
};
423+
424+
let char_to_line_col = |base_char: usize, char_offset: usize, rope: &Rope| -> (usize, usize) {
425+
let char_idx = base_char + char_offset;
350426
let line = rope.char_to_line(char_idx);
351427
let line_start = rope.line_to_char(line);
352428
(line, char_idx - line_start)
@@ -355,10 +431,12 @@ pub fn intra_line_changes(
355431
let mut changes_a = Vec::new();
356432
let mut changes_b = Vec::new();
357433

358-
for char_hunk in diff.hunks() {
359-
if !char_hunk.before.is_empty() {
360-
let (start_line, start_col) = char_to_line_col(a_start, char_hunk.before.start, rope_a);
361-
let (end_line, end_col) = char_to_line_col(a_start, char_hunk.before.end, rope_a);
434+
for tok_hunk in diff.hunks() {
435+
if !tok_hunk.before.is_empty() {
436+
let sc = tok_to_char(&tok_starts_a, total_chars_a, tok_hunk.before.start);
437+
let ec = tok_to_char(&tok_starts_a, total_chars_a, tok_hunk.before.end);
438+
let (start_line, start_col) = char_to_line_col(a_start, sc, rope_a);
439+
let (end_line, end_col) = char_to_line_col(a_start, ec, rope_a);
362440
// Split across lines if the change spans multiple lines
363441
for line in start_line..=end_line {
364442
let cs = if line == start_line { start_col } else { 0 };
@@ -374,9 +452,11 @@ pub fn intra_line_changes(
374452
});
375453
}
376454
}
377-
if !char_hunk.after.is_empty() {
378-
let (start_line, start_col) = char_to_line_col(b_start, char_hunk.after.start, rope_b);
379-
let (end_line, end_col) = char_to_line_col(b_start, char_hunk.after.end, rope_b);
455+
if !tok_hunk.after.is_empty() {
456+
let sc = tok_to_char(&tok_starts_b, total_chars_b, tok_hunk.after.start);
457+
let ec = tok_to_char(&tok_starts_b, total_chars_b, tok_hunk.after.end);
458+
let (start_line, start_col) = char_to_line_col(b_start, sc, rope_b);
459+
let (end_line, end_col) = char_to_line_col(b_start, ec, rope_b);
380460
for line in start_line..=end_line {
381461
let cs = if line == start_line { start_col } else { 0 };
382462
let ce = if line == end_line {
@@ -948,4 +1028,49 @@ mod tests {
9481028
"cache must reflect updated content"
9491029
);
9501030
}
1031+
1032+
#[test]
fn intra_line_diff_uses_word_granularity() {
    // "hello world\n" vs "hello earth\n": only the final word differs, so a
    // word-level diff must report exactly one change per side spanning cols
    // 6..11 — not the several sub-word fragments a char-level diff would emit.
    let rope_a = Rope::from("hello world\n");
    let rope_b = Rope::from("hello earth\n");
    let hunk = helix_vcs::Hunk { before: 0..1, after: 0..1 };

    let (changes_a, changes_b) = intra_line_changes(&rope_a, &rope_b, &hunk);

    assert_eq!(changes_a.len(), 1, "expected one word-level change on A side");
    assert_eq!(changes_b.len(), 1, "expected one word-level change on B side");

    // A side: the span covers exactly "world".
    let a = &changes_a[0];
    assert_eq!(a.doc_line, 0);
    assert_eq!(a.col_start, 6, "change should start at 'w' of 'world'");
    assert_eq!(a.col_end, 11, "change should end after 'd' of 'world'");

    // B side: the span covers exactly "earth".
    let b = &changes_b[0];
    assert_eq!(b.doc_line, 0);
    assert_eq!(b.col_start, 6, "change should start at 'e' of 'earth'");
    assert_eq!(b.col_end, 11, "change should end after 'h' of 'earth'");
}
9511076
}

0 commit comments

Comments
 (0)