Skip to content

Commit 7ab36ca

Browse files
authored
Merge pull request #2168 from Urgau/gh-range-diff-word-highlighting
Add a simple word-highlighting to our range-diff
2 parents 3cb59ce + 6085762 commit 7ab36ca

File tree

3 files changed

+171
-36
lines changed

3 files changed

+171
-36
lines changed

Cargo.lock

Lines changed: 8 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ sha2 = "0.10.9"
5151
imara-diff = "0.2.0"
5252
pulldown-cmark-escape = "0.11.0"
5353
axum-extra = { version = "0.10.1", default-features = false }
54+
unicode-segmentation = "1.12.0"
5455

5556
[dependencies.serde]
5657
version = "1"

src/gh_range_diff.rs

Lines changed: 162 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ use imara_diff::{
1919
};
2020
use pulldown_cmark_escape::FmtWriter;
2121
use regex::Regex;
22+
use unicode_segmentation::UnicodeSegmentation;
2223

2324
use crate::github::GithubCompare;
2425
use crate::utils::is_repo_autorized;
@@ -249,18 +250,34 @@ fn process_old_new(
249250
background-color: rgba(150, 255, 150, 1);
250251
white-space: pre;
251252
}}
252-
.removed-line-after {{
253+
.line-removed-after {{
253254
color: rgb(220, 0, 0)
254255
}}
255-
.added-line-after {{
256+
.line-added-after {{
256257
color: rgb(0, 73, 0)
257258
}}
258-
.removed-line-before {{
259+
.line-removed-before {{
259260
color: rgb(192, 78, 76)
260261
}}
261-
.added-line-before {{
262+
.line-added-before {{
262263
color: rgb(63, 128, 94)
263264
}}
265+
.word-removed-after {{
266+
color: white;
267+
background-color: rgb(220, 0, 0);
268+
}}
269+
.word-added-after {{
270+
color: white;
271+
background-color: rgb(0, 73, 0);
272+
}}
273+
.word-removed-before {{
274+
color: white;
275+
background-color: rgb(192, 78, 76);
276+
}}
277+
.word-added-before {{
278+
color: white;
279+
background-color: rgb(63, 128, 94);
280+
}}
264281
@media (prefers-color-scheme: dark) {{
265282
body {{
266283
background: #0C0C0C;
@@ -277,18 +294,34 @@ fn process_old_new(
277294
background-color: rgba(70, 120, 70, 1);
278295
white-space: pre;
279296
}}
280-
.removed-line-after {{
297+
.line-removed-after {{
281298
color: rgba(255, 0, 0, 1);
282299
}}
283-
.added-line-after {{
300+
.line-added-after {{
284301
color: rgba(0, 255, 0, 1);
285302
}}
286-
.removed-line-before {{
303+
.line-removed-before {{
287304
color: rgba(100, 0, 0, 1);
288305
}}
289-
.added-line-before {{
306+
.line-added-before {{
290307
color: rgba(0, 100, 0, 1);
291308
}}
309+
.word-removed-after {{
310+
color: black;
311+
background-color: rgba(255, 0, 0, 1);
312+
}}
313+
.word-added-after {{
314+
color: black;
315+
background-color: rgba(0, 255, 0, 1);
316+
}}
317+
.word-removed-before {{
318+
color: black;
319+
background-color: rgba(100, 0, 0, 1);
320+
}}
321+
.word-added-before {{
322+
color: black;
323+
background-color: rgba(0, 100, 0, 1);
324+
}}
292325
}}
293326
</style>
294327
</head>
@@ -400,6 +433,7 @@ fn process_old_new(
400433
const REMOVED_BLOCK_SIGN: &str = r#"<span class="removed-block"> - </span>"#;
401434
const ADDED_BLOCK_SIGN: &str = r#"<span class="added-block"> + </span>"#;
402435

436+
#[derive(Copy, Clone)]
403437
enum HunkTokenStatus {
404438
Added,
405439
Removed,
@@ -408,39 +442,56 @@ enum HunkTokenStatus {
408442
struct HtmlDiffPrinter<'a>(pub &'a Interner<&'a str>);
409443

410444
impl HtmlDiffPrinter<'_> {
411-
fn handle_hunk_token(
445+
fn handle_hunk_line<'a>(
412446
&self,
413447
mut f: impl fmt::Write,
414448
hunk_token_status: HunkTokenStatus,
415-
token: &str,
449+
words: impl Iterator<Item = (&'a str, bool)>,
416450
) -> fmt::Result {
417451
// Show the hunk status
418452
match hunk_token_status {
419453
HunkTokenStatus::Added => write!(f, "{ADDED_BLOCK_SIGN} ")?,
420454
HunkTokenStatus::Removed => write!(f, "{REMOVED_BLOCK_SIGN} ")?,
421455
};
422456

423-
let is_add = token.starts_with('+');
424-
let is_remove = token.starts_with('-');
457+
let mut words = words.peekable();
458+
459+
let first_word = words.peek();
460+
let is_add = first_word.map(|w| w.0.starts_with('+')).unwrap_or_default();
461+
let is_remove = first_word.map(|w| w.0.starts_with('-')).unwrap_or_default();
425462

426463
// Highlight in the same way as `git range-diff` does for diff-lines
427-
// that changed. (Contrary to `git range-diff` we don't color unchanged
464+
// that changed. In addition we also do word highlighting.
465+
//
466+
// (Contrary to `git range-diff` we don't color unchanged
428467
// diff lines though, since then the coloring distracts from what is
429468
// relevant.)
430469
if is_add || is_remove {
431-
let class = match (hunk_token_status, is_add) {
432-
(HunkTokenStatus::Removed, true) => "added-line-before",
433-
(HunkTokenStatus::Removed, false) => "removed-line-before",
434-
(HunkTokenStatus::Added, true) => "added-line-after",
435-
(HunkTokenStatus::Added, false) => "removed-line-after",
470+
let prefix_class = match (hunk_token_status, is_add) {
471+
(HunkTokenStatus::Removed, true) => "added-before",
472+
(HunkTokenStatus::Removed, false) => "removed-before",
473+
(HunkTokenStatus::Added, true) => "added-after",
474+
(HunkTokenStatus::Added, false) => "removed-after",
436475
};
476+
write!(f, r#"<span class="line-{prefix_class}">"#)?;
477+
478+
for (word, changed) in words {
479+
if changed {
480+
write!(f, r#"<span class="word-{prefix_class}">"#)?;
481+
pulldown_cmark_escape::escape_html(FmtWriter(&mut f), word)?;
482+
write!(f, "</span>")?;
483+
} else {
484+
pulldown_cmark_escape::escape_html(FmtWriter(&mut f), word)?;
485+
}
486+
}
437487

438-
write!(f, r#"<span class="{class}">"#)?;
439-
pulldown_cmark_escape::escape_html(FmtWriter(&mut f), token)?;
440488
write!(f, "</span>")?;
441489
} else {
442-
pulldown_cmark_escape::escape_html(FmtWriter(&mut f), token)?;
490+
for (word, _status) in words {
491+
pulldown_cmark_escape::escape_html(FmtWriter(&mut f), word)?;
492+
}
443493
}
494+
444495
Ok(())
445496
}
446497
}
@@ -474,23 +525,82 @@ impl UnifiedDiffPrinter for HtmlDiffPrinter<'_> {
474525
before: &[Token],
475526
after: &[Token],
476527
) -> fmt::Result {
477-
if let Some(&last) = before.last() {
478-
for &token in before {
479-
let token = self.0[token];
480-
self.handle_hunk_token(&mut f, HunkTokenStatus::Removed, token)?;
528+
// To improve on the line-by-line diff we also want to do a sort of `git diff --word-diff`
529+
// (aka word highlighting). To achieve word highlighting, we only consider hunks that
530+
// have the same number of lines removed and added, otherwise it's much more complex
531+
// to link the changes together.
532+
533+
if before.len() == after.len() {
534+
// Same number of lines before and after, can do word-highlighting.
535+
536+
// Diff the individual lines together.
537+
let diffs_and_inputs: Vec<_> = before
538+
.into_iter()
539+
.zip(after.into_iter())
540+
.map(|(b_token, a_token)| {
541+
// Split both lines by words and intern them.
542+
let input: InternedInput<&str> = InternedInput::new(
543+
SplitWordBoundaries(self.0[*b_token]),
544+
SplitWordBoundaries(self.0[*a_token]),
545+
);
546+
547+
// Compute the (word) diff
548+
let diff = Diff::compute(Algorithm::Histogram, &input);
549+
550+
(diff, input)
551+
})
552+
.collect();
553+
554+
// Process all before lines first
555+
for (diff, input) in diffs_and_inputs.iter() {
556+
self.handle_hunk_line(
557+
&mut f,
558+
HunkTokenStatus::Removed,
559+
input.before.iter().enumerate().map(|(b_pos, b_token)| {
560+
(input.interner[*b_token], diff.is_removed(b_pos as u32))
561+
}),
562+
)?;
481563
}
482-
if !self.0[last].ends_with('\n') {
483-
writeln!(f)?;
484-
}
485-
}
486564

487-
if let Some(&last) = after.last() {
488-
for &token in after {
489-
let token = self.0[token];
490-
self.handle_hunk_token(&mut f, HunkTokenStatus::Added, token)?;
565+
// Then process all after lines
566+
for (diff, input) in diffs_and_inputs.iter() {
567+
self.handle_hunk_line(
568+
&mut f,
569+
HunkTokenStatus::Added,
570+
input.after.iter().enumerate().map(|(a_pos, a_token)| {
571+
(input.interner[*a_token], diff.is_added(a_pos as u32))
572+
}),
573+
)?;
574+
}
575+
} else {
576+
// Can't do word-highlighting, simply print each line.
577+
578+
if let Some(&last) = before.last() {
579+
for &token in before {
580+
let token = self.0[token];
581+
self.handle_hunk_line(
582+
&mut f,
583+
HunkTokenStatus::Removed,
584+
std::iter::once((token, false)),
585+
)?;
586+
}
587+
if !self.0[last].ends_with('\n') {
588+
writeln!(f)?;
589+
}
491590
}
492-
if !self.0[last].ends_with('\n') {
493-
writeln!(f)?;
591+
592+
if let Some(&last) = after.last() {
593+
for &token in after {
594+
let token = self.0[token];
595+
self.handle_hunk_line(
596+
&mut f,
597+
HunkTokenStatus::Added,
598+
std::iter::once((token, false)),
599+
)?;
600+
}
601+
if !self.0[last].ends_with('\n') {
602+
writeln!(f)?;
603+
}
494604
}
495605
}
496606
Ok(())
@@ -514,3 +624,20 @@ fn bookmarklet(host: &str) -> String {
514624
}})();"
515625
)
516626
}
627+
628+
// Simple adapter exposing `unicode_segmentation::UnicodeSegmentation::split_word_bounds` as an `imara_diff::TokenSource`
629+
struct SplitWordBoundaries<'a>(&'a str);
630+
631+
// Lets `imara_diff` tokenize the wrapped string slice with `split_word_bounds`,
// so that two lines can be diffed segment by segment instead of as a whole.
impl<'a> imara_diff::TokenSource for SplitWordBoundaries<'a> {
632+
type Token = &'a str;
633+
type Tokenizer = unicode_segmentation::UWordBounds<'a>;
634+
635+
// Tokens are the sub-slices yielded by `split_word_bounds` on the wrapped string.
fn tokenize(&self) -> Self::Tokenizer {
636+
self.0.split_word_bounds()
637+
}
638+
639+
// Rough token-count estimate: the byte length of the string divided by 4.7,
// truncated to `u32` by the final cast.
fn estimate_tokens(&self) -> u32 {
640+
// https://www.wyliecomm.com/2021/11/whats-the-best-length-of-a-word-online/
// NOTE(review): 4.7 is presumably the average word length taken from the
// article linked above — confirm.
641+
(self.0.len() as f32 / 4.7f32) as u32
642+
}
643+
}

0 commit comments

Comments
 (0)