@@ -19,6 +19,7 @@ use imara_diff::{
19
19
} ;
20
20
use pulldown_cmark_escape:: FmtWriter ;
21
21
use regex:: Regex ;
22
+ use unicode_segmentation:: UnicodeSegmentation ;
22
23
23
24
use crate :: github:: GithubCompare ;
24
25
use crate :: utils:: is_repo_autorized;
@@ -249,18 +250,34 @@ fn process_old_new(
249
250
background-color: rgba(150, 255, 150, 1);
250
251
white-space: pre;
251
252
}}
252
- .removed- line-after {{
253
+ .line-removed -after {{
253
254
color: rgb(220, 0, 0)
254
255
}}
255
- .added- line-after {{
256
+ .line-added -after {{
256
257
color: rgb(0, 73, 0)
257
258
}}
258
- .removed- line-before {{
259
+ .line-removed -before {{
259
260
color: rgb(192, 78, 76)
260
261
}}
261
- .added- line-before {{
262
+ .line-added -before {{
262
263
color: rgb(63, 128, 94)
263
264
}}
265
+ .word-removed-after {{
266
+ color: white;
267
+ background-color: rgb(220, 0, 0);
268
+ }}
269
+ .word-added-after {{
270
+ color: white;
271
+ background-color: rgb(0, 73, 0);
272
+ }}
273
+ .word-removed-before {{
274
+ color: white;
275
+ background-color: rgb(192, 78, 76);
276
+ }}
277
+ .word-added-before {{
278
+ color: white;
279
+ background-color: rgb(63, 128, 94);
280
+ }}
264
281
@media (prefers-color-scheme: dark) {{
265
282
body {{
266
283
background: #0C0C0C;
@@ -277,18 +294,34 @@ fn process_old_new(
277
294
background-color: rgba(70, 120, 70, 1);
278
295
white-space: pre;
279
296
}}
280
- .removed- line-after {{
297
+ .line-removed -after {{
281
298
color: rgba(255, 0, 0, 1);
282
299
}}
283
- .added- line-after {{
300
+ .line-added -after {{
284
301
color: rgba(0, 255, 0, 1);
285
302
}}
286
- .removed- line-before {{
303
+ .line-removed -before {{
287
304
color: rgba(100, 0, 0, 1);
288
305
}}
289
- .added- line-before {{
306
+ .line-added -before {{
290
307
color: rgba(0, 100, 0, 1);
291
308
}}
309
+ .word-removed-after {{
310
+ color: black;
311
+ background-color: rgba(255, 0, 0, 1);
312
+ }}
313
+ .word-added-after {{
314
+ color: black;
315
+ background-color: rgba(0, 255, 0, 1);
316
+ }}
317
+ .word-removed-before {{
318
+ color: black;
319
+ background-color: rgba(100, 0, 0, 1);
320
+ }}
321
+ .word-added-before {{
322
+ color: black;
323
+ background-color: rgba(0, 100, 0, 1);
324
+ }}
292
325
}}
293
326
</style>
294
327
</head>
@@ -400,6 +433,7 @@ fn process_old_new(
400
433
// HTML markers that `handle_hunk_line` writes at the start of each hunk line it renders:
// the " - " span is used for `HunkTokenStatus::Removed` lines and the " + " span for
// `HunkTokenStatus::Added` lines.
const REMOVED_BLOCK_SIGN : & str = r#"<span class="removed-block"> - </span>"# ;
401
434
const ADDED_BLOCK_SIGN : & str = r#"<span class="added-block"> + </span>"# ;
402
435
436
+ #[ derive( Copy , Clone ) ]
403
437
enum HunkTokenStatus {
404
438
Added ,
405
439
Removed ,
@@ -408,39 +442,56 @@ enum HunkTokenStatus {
408
442
struct HtmlDiffPrinter < ' a > ( pub & ' a Interner < & ' a str > ) ;
409
443
410
444
impl HtmlDiffPrinter < ' _ > {
411
- fn handle_hunk_token (
445
+ fn handle_hunk_line < ' a > (
412
446
& self ,
413
447
mut f : impl fmt:: Write ,
414
448
hunk_token_status : HunkTokenStatus ,
415
- token : & str ,
449
+ words : impl Iterator < Item = ( & ' a str , bool ) > ,
416
450
) -> fmt:: Result {
417
451
// Show the hunk status
418
452
match hunk_token_status {
419
453
HunkTokenStatus :: Added => write ! ( f, "{ADDED_BLOCK_SIGN} " ) ?,
420
454
HunkTokenStatus :: Removed => write ! ( f, "{REMOVED_BLOCK_SIGN} " ) ?,
421
455
} ;
422
456
423
- let is_add = token. starts_with ( '+' ) ;
424
- let is_remove = token. starts_with ( '-' ) ;
457
+ let mut words = words. peekable ( ) ;
458
+
459
+ let first_word = words. peek ( ) ;
460
+ let is_add = first_word. map ( |w| w. 0 . starts_with ( '+' ) ) . unwrap_or_default ( ) ;
461
+ let is_remove = first_word. map ( |w| w. 0 . starts_with ( '-' ) ) . unwrap_or_default ( ) ;
425
462
426
463
// Highlight in the same was as `git range-diff` does for diff-lines
427
- // that changed. (Contrary to `git range-diff` we don't color unchanged
464
+ // that changed. In addition we also do word highlighting.
465
+ //
466
+ // (Contrary to `git range-diff` we don't color unchanged
428
467
// diff lines though, since then the coloring distracts from what is
429
468
// relevant.)
430
469
if is_add || is_remove {
431
- let class = match ( hunk_token_status, is_add) {
432
- ( HunkTokenStatus :: Removed , true ) => "added-line- before" ,
433
- ( HunkTokenStatus :: Removed , false ) => "removed-line- before" ,
434
- ( HunkTokenStatus :: Added , true ) => "added-line- after" ,
435
- ( HunkTokenStatus :: Added , false ) => "removed-line- after" ,
470
+ let prefix_class = match ( hunk_token_status, is_add) {
471
+ ( HunkTokenStatus :: Removed , true ) => "added-before" ,
472
+ ( HunkTokenStatus :: Removed , false ) => "removed-before" ,
473
+ ( HunkTokenStatus :: Added , true ) => "added-after" ,
474
+ ( HunkTokenStatus :: Added , false ) => "removed-after" ,
436
475
} ;
476
+ write ! ( f, r#"<span class="line-{prefix_class}">"# ) ?;
477
+
478
+ for ( word, changed) in words {
479
+ if changed {
480
+ write ! ( f, r#"<span class="word-{prefix_class}">"# ) ?;
481
+ pulldown_cmark_escape:: escape_html ( FmtWriter ( & mut f) , word) ?;
482
+ write ! ( f, "</span>" ) ?;
483
+ } else {
484
+ pulldown_cmark_escape:: escape_html ( FmtWriter ( & mut f) , word) ?;
485
+ }
486
+ }
437
487
438
- write ! ( f, r#"<span class="{class}">"# ) ?;
439
- pulldown_cmark_escape:: escape_html ( FmtWriter ( & mut f) , token) ?;
440
488
write ! ( f, "</span>" ) ?;
441
489
} else {
442
- pulldown_cmark_escape:: escape_html ( FmtWriter ( & mut f) , token) ?;
490
+ for ( word, _status) in words {
491
+ pulldown_cmark_escape:: escape_html ( FmtWriter ( & mut f) , word) ?;
492
+ }
443
493
}
494
+
444
495
Ok ( ( ) )
445
496
}
446
497
}
@@ -474,23 +525,82 @@ impl UnifiedDiffPrinter for HtmlDiffPrinter<'_> {
474
525
before : & [ Token ] ,
475
526
after : & [ Token ] ,
476
527
) -> fmt:: Result {
477
- if let Some ( & last) = before. last ( ) {
478
- for & token in before {
479
- let token = self . 0 [ token] ;
480
- self . handle_hunk_token ( & mut f, HunkTokenStatus :: Removed , token) ?;
528
+ // To improve on the line-by-line diff we also want to do a sort of `git diff --word-diff`
529
+ // (aka word highlighting). To achieve word highlighting, we only consider hunks that
530
+ // have the same number of lines removed and added, otherwise it's much more complex
531
+ // to link the changes together.
532
+
533
+ if before. len ( ) == after. len ( ) {
534
+ // Same number of lines before and after, can do word-highlighting.
535
+
536
+ // Diff the individual lines together.
537
+ let diffs_and_inputs: Vec < _ > = before
538
+ . into_iter ( )
539
+ . zip ( after. into_iter ( ) )
540
+ . map ( |( b_token, a_token) | {
541
+ // Split both lines by words and intern them.
542
+ let input: InternedInput < & str > = InternedInput :: new (
543
+ SplitWordBoundaries ( self . 0 [ * b_token] ) ,
544
+ SplitWordBoundaries ( self . 0 [ * a_token] ) ,
545
+ ) ;
546
+
547
+ // Compute the (word) diff
548
+ let diff = Diff :: compute ( Algorithm :: Histogram , & input) ;
549
+
550
+ ( diff, input)
551
+ } )
552
+ . collect ( ) ;
553
+
554
+ // Process all before lines first
555
+ for ( diff, input) in diffs_and_inputs. iter ( ) {
556
+ self . handle_hunk_line (
557
+ & mut f,
558
+ HunkTokenStatus :: Removed ,
559
+ input. before . iter ( ) . enumerate ( ) . map ( |( b_pos, b_token) | {
560
+ ( input. interner [ * b_token] , diff. is_removed ( b_pos as u32 ) )
561
+ } ) ,
562
+ ) ?;
481
563
}
482
- if !self . 0 [ last] . ends_with ( '\n' ) {
483
- writeln ! ( f) ?;
484
- }
485
- }
486
564
487
- if let Some ( & last) = after. last ( ) {
488
- for & token in after {
489
- let token = self . 0 [ token] ;
490
- self . handle_hunk_token ( & mut f, HunkTokenStatus :: Added , token) ?;
565
+ // Then process all after lines
566
+ for ( diff, input) in diffs_and_inputs. iter ( ) {
567
+ self . handle_hunk_line (
568
+ & mut f,
569
+ HunkTokenStatus :: Added ,
570
+ input. after . iter ( ) . enumerate ( ) . map ( |( a_pos, a_token) | {
571
+ ( input. interner [ * a_token] , diff. is_added ( a_pos as u32 ) )
572
+ } ) ,
573
+ ) ?;
574
+ }
575
+ } else {
576
+ // Can't do word-highlighting, simply print each line.
577
+
578
+ if let Some ( & last) = before. last ( ) {
579
+ for & token in before {
580
+ let token = self . 0 [ token] ;
581
+ self . handle_hunk_line (
582
+ & mut f,
583
+ HunkTokenStatus :: Removed ,
584
+ std:: iter:: once ( ( token, false ) ) ,
585
+ ) ?;
586
+ }
587
+ if !self . 0 [ last] . ends_with ( '\n' ) {
588
+ writeln ! ( f) ?;
589
+ }
491
590
}
492
- if !self . 0 [ last] . ends_with ( '\n' ) {
493
- writeln ! ( f) ?;
591
+
592
+ if let Some ( & last) = after. last ( ) {
593
+ for & token in after {
594
+ let token = self . 0 [ token] ;
595
+ self . handle_hunk_line (
596
+ & mut f,
597
+ HunkTokenStatus :: Added ,
598
+ std:: iter:: once ( ( token, false ) ) ,
599
+ ) ?;
600
+ }
601
+ if !self . 0 [ last] . ends_with ( '\n' ) {
602
+ writeln ! ( f) ?;
603
+ }
494
604
}
495
605
}
496
606
Ok ( ( ) )
@@ -514,3 +624,20 @@ fn bookmarklet(host: &str) -> String {
514
624
}})();"
515
625
)
516
626
}
627
+
628
+ // Simple abstraction over `unicode_segmentation::split_word_bounds` for `imara_diff::TokenSource`
629
+ struct SplitWordBoundaries < ' a > ( & ' a str ) ;
630
+
631
+ impl < ' a > imara_diff:: TokenSource for SplitWordBoundaries < ' a > {
632
+ type Token = & ' a str ;
633
+ type Tokenizer = unicode_segmentation:: UWordBounds < ' a > ;
634
+
635
+ fn tokenize ( & self ) -> Self :: Tokenizer {
636
+ self . 0 . split_word_bounds ( )
637
+ }
638
+
639
+ fn estimate_tokens ( & self ) -> u32 {
640
+ // https://www.wyliecomm.com/2021/11/whats-the-best-length-of-a-word-online/
641
+ ( self . 0 . len ( ) as f32 / 4.7f32 ) as u32
642
+ }
643
+ }
0 commit comments