@@ -23,21 +23,85 @@ impl<'a> imara_diff::TokenSource for RopeLines<'a> {
2323 }
2424}
2525
/// Iterator that yields word-level tokens from a string slice.
///
/// Alphanumeric + underscore runs are emitted as a single token; every other
/// char (whitespace, punctuation, newlines) is emitted individually. This
/// gives coarser intra-line diffs than char-level tokenization: only whole
/// words are marked as changed.
struct WordTokenIter<'a> {
    // Full text being tokenized.
    text: &'a str,
    // Byte offset of the next token to emit; always on a char boundary.
    byte_pos: usize,
}

impl<'a> Iterator for WordTokenIter<'a> {
    type Item = &'a str;

    fn next(&mut self) -> Option<Self::Item> {
        let rest = &self.text[self.byte_pos..];
        // `chars().next()` is `None` exactly when the input is exhausted,
        // so no separate length guard is needed.
        let first = rest.chars().next()?;
        let is_word = |c: char| c.is_alphanumeric() || c == '_';
        let end_byte = if is_word(first) {
            // Word token: extend to the first non-word char, or to the end
            // of the text if the word runs to the end.
            rest.find(|c: char| !is_word(c)).unwrap_or(rest.len())
        } else {
            // Any non-word char is a single-char token.
            first.len_utf8()
        };
        self.byte_pos += end_byte;
        Some(&rest[..end_byte])
    }
}
2958
30- impl < ' a > TokenSource for CharSlice < ' a > {
31- type Token = char ;
32- type Tokenizer = std:: iter:: Copied < std:: slice:: Iter < ' a , char > > ;
59+ /// A `TokenSource` that yields word-level tokens from a `&str`.
60+ /// Used by `intra_line_changes` so the Myers diff operates on whole words
61+ /// rather than individual characters.
62+ struct WordSlice < ' a > ( & ' a str ) ;
63+
64+ impl < ' a > TokenSource for WordSlice < ' a > {
65+ type Token = & ' a str ;
66+ type Tokenizer = WordTokenIter < ' a > ;
3367
3468 fn tokenize ( & self ) -> Self :: Tokenizer {
35- self . 0 . iter ( ) . copied ( )
69+ WordTokenIter {
70+ text : self . 0 ,
71+ byte_pos : 0 ,
72+ }
3673 }
3774
3875 fn estimate_tokens ( & self ) -> u32 {
39- self . 0 . len ( ) as u32
76+ ( self . 0 . len ( ) / 4 ) . max ( 1 ) as u32
77+ }
78+ }
79+
/// Returns the char-index (within `text`) at which each word token starts.
///
/// `result[i]` is the char start of token `i`; `result.len()` equals the token
/// count. If `tok < result.len()`, the token starts at `result[tok]` and ends
/// at `result[tok + 1]` (or at `text.chars().count()` for the last token).
///
/// Must stay in sync with `WordTokenIter`'s boundary rule: alphanumeric +
/// underscore runs form one token, any other char is a token by itself.
fn word_token_char_starts(text: &str) -> Vec<usize> {
    let is_word = |c: char| c.is_alphanumeric() || c == '_';
    let mut starts = Vec::new();
    let mut char_pos = 0usize;
    let mut byte_pos = 0usize;
    while byte_pos < text.len() {
        let rest = &text[byte_pos..];
        starts.push(char_pos);
        // Safe: `byte_pos < text.len()` guarantees at least one char remains.
        let first = rest.chars().next().unwrap();
        let end_byte = if is_word(first) {
            // Word token runs to the first non-word char (or end of text).
            rest.find(|c: char| !is_word(c)).unwrap_or(rest.len())
        } else {
            // Non-word chars are single-char tokens.
            first.len_utf8()
        };
        char_pos += rest[..end_byte].chars().count();
        byte_pos += end_byte;
    }
    starts
}
42106
43107/// A diff session pairs two views for side-by-side diff comparison.
@@ -52,7 +116,7 @@ pub struct DiffSession {
52116 /// Shared hunk list. Stored as Arc so callers can take a cheap reference-counted
53117 /// snapshot for closures and annotations without cloning the full Vec each frame.
54118 hunks : Arc < Vec < Hunk > > ,
55- /// Character -level diff results cached per hunk, parallel to `hunks`.
119+ /// Word -level intra-line diff results cached per hunk, parallel to `hunks`.
56120 /// Stored as Arc so render closures can take a cheap snapshot without cloning.
57121 /// Populated in `compute_hunks` and replaced on each recomputation.
58122 /// Pure insertions/deletions store empty vecs (no intra-line diff needed).
@@ -324,8 +388,10 @@ impl InlineChange {
324388 }
325389}
326390
327- /// Compute character-level diff for a single hunk.
328- /// Returns per-line column ranges for each side indicating which characters changed.
391+ /// Compute word-level intra-line diff for a single hunk.
392+ /// Returns per-line column ranges for each side indicating which words (or
393+ /// non-word chars) changed. Whole alphanumeric+underscore runs are treated as
394+ /// one token, so only the changed words are highlighted rather than individual chars.
329395pub fn intra_line_changes (
330396 rope_a : & Rope ,
331397 rope_b : & Rope ,
@@ -339,14 +405,24 @@ pub fn intra_line_changes(
339405 let text_a: String = rope_a. slice ( a_start..a_end) . into ( ) ;
340406 let text_b: String = rope_b. slice ( b_start..b_end) . into ( ) ;
341407
342- // Use char-level tokenization so hunk offsets are char indices, not line indices.
343- let chars_a: Vec < char > = text_a. chars ( ) . collect ( ) ;
344- let chars_b: Vec < char > = text_b. chars ( ) . collect ( ) ;
345- let input = InternedInput :: new ( CharSlice ( & chars_a) , CharSlice ( & chars_b) ) ;
408+ // Precompute char-start of each word token so token indices can be mapped back
409+ // to char-column ranges after the Myers diff produces token-level hunks.
410+ let tok_starts_a = word_token_char_starts ( & text_a) ;
411+ let tok_starts_b = word_token_char_starts ( & text_b) ;
412+ let total_chars_a = text_a. chars ( ) . count ( ) ;
413+ let total_chars_b = text_b. chars ( ) . count ( ) ;
414+
415+ let input = InternedInput :: new ( WordSlice ( & text_a) , WordSlice ( & text_b) ) ;
346416 let diff = Diff :: compute ( Algorithm :: Myers , & input) ;
347417
348- let char_to_line_col = |base_char : usize , offset : u32 , rope : & Rope | -> ( usize , usize ) {
349- let char_idx = base_char + offset as usize ;
418+ // Convert a token index into a char offset within the hunk text.
419+ // `tok` == number of tokens means "end of text".
420+ let tok_to_char = |starts : & [ usize ] , total : usize , tok : u32 | -> usize {
421+ * starts. get ( tok as usize ) . unwrap_or ( & total)
422+ } ;
423+
424+ let char_to_line_col = |base_char : usize , char_offset : usize , rope : & Rope | -> ( usize , usize ) {
425+ let char_idx = base_char + char_offset;
350426 let line = rope. char_to_line ( char_idx) ;
351427 let line_start = rope. line_to_char ( line) ;
352428 ( line, char_idx - line_start)
@@ -355,10 +431,12 @@ pub fn intra_line_changes(
355431 let mut changes_a = Vec :: new ( ) ;
356432 let mut changes_b = Vec :: new ( ) ;
357433
358- for char_hunk in diff. hunks ( ) {
359- if !char_hunk. before . is_empty ( ) {
360- let ( start_line, start_col) = char_to_line_col ( a_start, char_hunk. before . start , rope_a) ;
361- let ( end_line, end_col) = char_to_line_col ( a_start, char_hunk. before . end , rope_a) ;
434+ for tok_hunk in diff. hunks ( ) {
435+ if !tok_hunk. before . is_empty ( ) {
436+ let sc = tok_to_char ( & tok_starts_a, total_chars_a, tok_hunk. before . start ) ;
437+ let ec = tok_to_char ( & tok_starts_a, total_chars_a, tok_hunk. before . end ) ;
438+ let ( start_line, start_col) = char_to_line_col ( a_start, sc, rope_a) ;
439+ let ( end_line, end_col) = char_to_line_col ( a_start, ec, rope_a) ;
362440 // Split across lines if the change spans multiple lines
363441 for line in start_line..=end_line {
364442 let cs = if line == start_line { start_col } else { 0 } ;
@@ -374,9 +452,11 @@ pub fn intra_line_changes(
374452 } ) ;
375453 }
376454 }
377- if !char_hunk. after . is_empty ( ) {
378- let ( start_line, start_col) = char_to_line_col ( b_start, char_hunk. after . start , rope_b) ;
379- let ( end_line, end_col) = char_to_line_col ( b_start, char_hunk. after . end , rope_b) ;
455+ if !tok_hunk. after . is_empty ( ) {
456+ let sc = tok_to_char ( & tok_starts_b, total_chars_b, tok_hunk. after . start ) ;
457+ let ec = tok_to_char ( & tok_starts_b, total_chars_b, tok_hunk. after . end ) ;
458+ let ( start_line, start_col) = char_to_line_col ( b_start, sc, rope_b) ;
459+ let ( end_line, end_col) = char_to_line_col ( b_start, ec, rope_b) ;
380460 for line in start_line..=end_line {
381461 let cs = if line == start_line { start_col } else { 0 } ;
382462 let ce = if line == end_line {
@@ -948,4 +1028,49 @@ mod tests {
9481028 "cache must reflect updated content"
9491029 ) ;
9501030 }
1031+
1032+ #[ test]
1033+ fn intra_line_diff_uses_word_granularity ( ) {
1034+ // "hello world\n" vs "hello earth\n": only the word "world"/"earth" differs.
1035+ // Word-level diff should highlight exactly col 6..11 on each side,
1036+ // not multiple sub-word spans as character-level diff would produce.
1037+ let rope_a = Rope :: from ( "hello world\n " ) ;
1038+ let rope_b = Rope :: from ( "hello earth\n " ) ;
1039+ let hunk = helix_vcs:: Hunk {
1040+ before : 0 ..1 ,
1041+ after : 0 ..1 ,
1042+ } ;
1043+ let ( changes_a, changes_b) = intra_line_changes ( & rope_a, & rope_b, & hunk) ;
1044+
1045+ assert_eq ! (
1046+ changes_a. len( ) ,
1047+ 1 ,
1048+ "expected one word-level change on A side"
1049+ ) ;
1050+ assert_eq ! (
1051+ changes_b. len( ) ,
1052+ 1 ,
1053+ "expected one word-level change on B side"
1054+ ) ;
1055+
1056+ assert_eq ! ( changes_a[ 0 ] . doc_line, 0 ) ;
1057+ assert_eq ! (
1058+ changes_a[ 0 ] . col_start, 6 ,
1059+ "change should start at 'w' of 'world'"
1060+ ) ;
1061+ assert_eq ! (
1062+ changes_a[ 0 ] . col_end, 11 ,
1063+ "change should end after 'd' of 'world'"
1064+ ) ;
1065+
1066+ assert_eq ! ( changes_b[ 0 ] . doc_line, 0 ) ;
1067+ assert_eq ! (
1068+ changes_b[ 0 ] . col_start, 6 ,
1069+ "change should start at 'e' of 'earth'"
1070+ ) ;
1071+ assert_eq ! (
1072+ changes_b[ 0 ] . col_end, 11 ,
1073+ "change should end after 'h' of 'earth'"
1074+ ) ;
1075+ }
9511076}
0 commit comments