Skip to content

Commit 26cc100

Browse files
committed
Whitespace handling in textnodes
1 parent 2636d39 commit 26cc100

File tree

10 files changed

+225
-94
lines changed

10 files changed

+225
-94
lines changed

README.md

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -133,11 +133,15 @@ var diffs = DiffBuilder
133133
#### CSS selector-cross tree matcher (node, attr)
134134

135135
### Compare options
136-
137-
#### Name/Type matcher (node, attr)
138-
#### Content matcher (text, attr)
139-
#### Content regex matcher (text, attr)
140-
#### IgnoreCase content matcher (text, attr)
136+
#### Name/Type comparer (node, attr)
137+
#### Content comparer (text, attr)
138+
#### Content regex comparer (text, attr)
139+
#### IgnoreCase content comparer (text, attr)
140+
#### Class attribute comparer (attr)
141+
#### Boolean-attribute comparer (attr)
142+
See rules at https://html.spec.whatwg.org/multipage/common-microsyntaxes.html#boolean-attributes
143+
https://www.w3.org/TR/html52/infrastructure.html#sec-boolean-attributes
144+
https://gist.github.com/ArjanSchouten/0b8574a6ad7f5065a5e7
141145

142146
### Ignoring special `diff:` attributes
143147
Any attributes that start with `diff:` are automatically filtered out before matching/comparing happens. E.g. `diff:whitespace="..."` does not show up as a missing diff when added to a control element.

src/CompareResult.cs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,13 @@ public enum CompareResult
1919
/// </summary>
2020
DifferentAndBreak
2121
}
22+
23+
public static class CompareResultExtensions
24+
{
25+
public static bool IsSame(this CompareResult compareResult) => compareResult == CompareResult.Same;
26+
public static bool IsSameAndBreak(this CompareResult compareResult) => compareResult == CompareResult.SameAndBreak;
27+
public static bool IsDifferent(this CompareResult compareResult) => compareResult == CompareResult.Different;
28+
public static bool IsDifferentAndBreak(this CompareResult compareResult) => compareResult == CompareResult.DifferentAndBreak;
29+
}
2230
}
31+
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
using System;
2+
using System.Text.RegularExpressions;
3+
using AngleSharp.Dom;
4+
using Egil.AngleSharp.Diffing.Core;
5+
6+
namespace Egil.AngleSharp.Diffing.Strategies.TextNodeStrategies
7+
{
8+
public class TextNodeComparer
9+
{
10+
private static readonly Regex WhitespaceReplace = new Regex(@"\s+", RegexOptions.Compiled | RegexOptions.CultureInvariant, TimeSpan.FromSeconds(5));
11+
12+
public WhitespaceOption Option { get; }
13+
14+
public TextNodeComparer(WhitespaceOption option)
15+
{
16+
Option = option;
17+
}
18+
19+
public CompareResult Compare(in Comparison comparison, CompareResult currentDecision)
20+
{
21+
if (currentDecision.IsSame() || currentDecision.IsSameAndBreak()) return currentDecision;
22+
if (Option != WhitespaceOption.Normalize) return currentDecision;
23+
24+
if (comparison.Control.Node is IText controlTextNode && comparison.Test.Node is IText testTextNode)
25+
{
26+
var controlText = WhitespaceReplace.Replace(controlTextNode.Data.Trim(), " ");
27+
var testText = WhitespaceReplace.Replace(testTextNode.Data.Trim(), " ");
28+
29+
if (controlText.Equals(testText, StringComparison.Ordinal)) return CompareResult.Same;
30+
}
31+
32+
return currentDecision;
33+
}
34+
}
35+
}

src/Strategies/WhitespaceStrategy.cs renamed to src/Strategies/TextNodeStrategies/TextNodeFilter.cs

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,20 +6,13 @@
66
using AngleSharp.Dom;
77
using Egil.AngleSharp.Diffing.Core;
88

9-
namespace Egil.AngleSharp.Diffing.Strategies
9+
namespace Egil.AngleSharp.Diffing.Strategies.TextNodeStrategies
1010
{
11-
public enum WhitespaceOption
12-
{
13-
Preserve = 0,
14-
RemoveWhitespaceNodes,
15-
Normalize
16-
}
17-
18-
public class WhitespaceStrategy
11+
public class TextNodeFilter
1912
{
2013
public WhitespaceOption Option { get; }
2114

22-
public WhitespaceStrategy(WhitespaceOption option)
15+
public TextNodeFilter(WhitespaceOption option)
2316
{
2417
Option = option;
2518
}
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
namespace Egil.AngleSharp.Diffing.Strategies.TextNodeStrategies
2+
{
3+
public enum WhitespaceOption
4+
{
5+
Preserve = 0,
6+
RemoveWhitespaceNodes,
7+
Normalize
8+
}
9+
}

tests/Core/DiffingTestBase.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,9 @@ protected INode ToNode(string htmlsnippet)
3737
return fragment[0];
3838
}
3939

40-
protected ComparisonSource ToComparisonSource(string html)
40+
protected ComparisonSource ToComparisonSource(string html, ComparisonSourceType sourceType = ComparisonSourceType.Control)
4141
{
42-
return ToNode(html).ToComparisonSource(0, ComparisonSourceType.Control);
42+
return ToNode(html).ToComparisonSource(0, sourceType);
4343
}
4444

4545
protected static HtmlDifferenceEngine CreateHtmlDiffEngine(
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
using Egil.AngleSharp.Diffing.Core;
2+
using Shouldly;
3+
using Xunit;
4+
5+
namespace Egil.AngleSharp.Diffing.Strategies.TextNodeStrategies
6+
{
7+
public class TextNodeComparerTest : TextnodeStrategyTestBase
8+
{
9+
[Theory(DisplayName = "When option is Preserve or RemoveWhitespaceNodes, comparer does not run nor change the current decision")]
10+
[InlineData(WhitespaceOption.Preserve)]
11+
[InlineData(WhitespaceOption.RemoveWhitespaceNodes)]
12+
public void Test5(WhitespaceOption whitespaceOption)
13+
{
14+
var comparison = new Comparison(ToComparisonSource("hello world", ComparisonSourceType.Control), ToComparisonSource(" hello world ", ComparisonSourceType.Test));
15+
var sut = new TextNodeComparer(whitespaceOption);
16+
17+
sut.Compare(comparison, CompareResult.Different).ShouldBe(CompareResult.Different);
18+
sut.Compare(comparison, CompareResult.DifferentAndBreak).ShouldBe(CompareResult.DifferentAndBreak);
19+
sut.Compare(comparison, CompareResult.Same).ShouldBe(CompareResult.Same);
20+
sut.Compare(comparison, CompareResult.SameAndBreak).ShouldBe(CompareResult.SameAndBreak);
21+
}
22+
23+
[Fact(DisplayName = "When option is Normalize and current decision is Same or SameAndBreak, compare uses the current decision")]
24+
public void Test55()
25+
{
26+
var comparison = new Comparison();
27+
var sut = new TextNodeComparer(WhitespaceOption.Normalize);
28+
sut.Compare(comparison, CompareResult.Same).ShouldBe(CompareResult.Same);
29+
sut.Compare(comparison, CompareResult.SameAndBreak).ShouldBe(CompareResult.SameAndBreak);
30+
}
31+
32+
[Theory(DisplayName = "When option is Normalize, any whitespace before and after a text node is removed before comparison")]
33+
[MemberData(nameof(WhitespaceCharStrings))]
34+
public void Test7(string whitespace)
35+
{
36+
var sut = new TextNodeComparer(WhitespaceOption.Normalize);
37+
var normalText = "text";
38+
var whitespaceText = $"{whitespace}text{whitespace}";
39+
var c1 = new Comparison(ToComparisonSource(normalText, ComparisonSourceType.Control), ToComparisonSource(normalText, ComparisonSourceType.Test));
40+
var c2 = new Comparison(ToComparisonSource(normalText, ComparisonSourceType.Control), ToComparisonSource(whitespaceText, ComparisonSourceType.Test));
41+
var c3 = new Comparison(ToComparisonSource(whitespaceText, ComparisonSourceType.Control), ToComparisonSource(normalText, ComparisonSourceType.Test));
42+
var c4 = new Comparison(ToComparisonSource(whitespaceText, ComparisonSourceType.Control), ToComparisonSource(whitespaceText, ComparisonSourceType.Test));
43+
44+
sut.Compare(c1, CompareResult.Different).ShouldBe(CompareResult.Same);
45+
sut.Compare(c2, CompareResult.Different).ShouldBe(CompareResult.Same);
46+
sut.Compare(c3, CompareResult.Different).ShouldBe(CompareResult.Same);
47+
sut.Compare(c4, CompareResult.Different).ShouldBe(CompareResult.Same);
48+
}
49+
50+
[Theory(DisplayName = "When option is Normalize, any consecutive whitespace characters are collapsed into one before comparison")]
51+
[MemberData(nameof(WhitespaceCharStrings))]
52+
public void Test9(string whitespace)
53+
{
54+
var sut = new TextNodeComparer(WhitespaceOption.Normalize);
55+
var normalText = "hello world";
56+
var whitespaceText = $"{whitespace}hello{whitespace}{whitespace}world{whitespace}";
57+
var c1 = new Comparison(ToComparisonSource(normalText, ComparisonSourceType.Control), ToComparisonSource(normalText, ComparisonSourceType.Test));
58+
var c2 = new Comparison(ToComparisonSource(normalText, ComparisonSourceType.Control), ToComparisonSource(whitespaceText, ComparisonSourceType.Test));
59+
var c3 = new Comparison(ToComparisonSource(whitespaceText, ComparisonSourceType.Control), ToComparisonSource(normalText, ComparisonSourceType.Test));
60+
var c4 = new Comparison(ToComparisonSource(whitespaceText, ComparisonSourceType.Control), ToComparisonSource(whitespaceText, ComparisonSourceType.Test));
61+
62+
sut.Compare(c1, CompareResult.Different).ShouldBe(CompareResult.Same);
63+
sut.Compare(c2, CompareResult.Different).ShouldBe(CompareResult.Same);
64+
sut.Compare(c3, CompareResult.Different).ShouldBe(CompareResult.Same);
65+
sut.Compare(c4, CompareResult.Different).ShouldBe(CompareResult.Same);
66+
}
67+
68+
// When a parent node has overridden the global whitespace option, that overridden option is used
69+
// When whitespace option is Preserve or RemoveWhitespaceNodes, a string ordinal comparison is performed
70+
// When whitespace option is Preserve or RemoveWhitespaceNodes and IgnoreCase is true, a string ordinal ignore case comparison is performed
71+
// When IgnoreCase is true, a case-insensitive comparison is performed
72+
// When the parent element is <pre>, the whitespace option is implicitly set to Preserve, unless explicitly overridden on the element
73+
// When a diff:regex attribute is found on the containing element, the control text is expected to be a regex, and that regex is used when comparing to the test text node.
74+
}
75+
}
76+
77+
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
using Shouldly;
2+
using Xunit;
3+
4+
namespace Egil.AngleSharp.Diffing.Strategies.TextNodeStrategies
5+
{
6+
public class TextNodeFilterTest : TextnodeStrategyTestBase
7+
{
8+
[Theory(DisplayName = "When whitespace option is Preserve, the provided decision is not changed by the filter for whitespace only text nodes")]
9+
[MemberData(nameof(WhitespaceCharStrings))]
10+
public void Test1(string whitespace)
11+
{
12+
var sut = new TextNodeFilter(WhitespaceOption.Preserve);
13+
var source = ToComparisonSource(whitespace);
14+
15+
sut.Filter(source, FilterDecision.Keep).ShouldBe(FilterDecision.Keep);
16+
sut.Filter(source, FilterDecision.Exclude).ShouldBe(FilterDecision.Exclude);
17+
}
18+
19+
[Theory(DisplayName = "When whitespace option is RemoveWhitespaceNodes, whitespace only text nodes are excluded during filtering")]
20+
[MemberData(nameof(WhitespaceCharStrings))]
21+
public void Test2(string whitespace)
22+
{
23+
var sut = new TextNodeFilter(WhitespaceOption.RemoveWhitespaceNodes);
24+
var source = ToComparisonSource(whitespace);
25+
26+
sut.Filter(source, FilterDecision.Keep).ShouldBe(FilterDecision.Exclude);
27+
}
28+
29+
[Theory(DisplayName = "When whitespace option is Normalize, whitespace only text nodes are excluded during filtering")]
30+
[MemberData(nameof(WhitespaceCharStrings))]
31+
public void Test3(string whitespace)
32+
{
33+
var sut = new TextNodeFilter(WhitespaceOption.Normalize);
34+
var source = ToComparisonSource(whitespace);
35+
36+
sut.Filter(source, FilterDecision.Keep).ShouldBe(FilterDecision.Exclude);
37+
}
38+
39+
[Theory(DisplayName = "Filter method doesn't change the decision of non-whitespace nodes or non-text nodes")]
40+
[InlineData("hello world")]
41+
[InlineData("<p>hello world</p>")]
42+
public void Test4x(string html)
43+
{
44+
var sut = new TextNodeFilter(WhitespaceOption.Normalize);
45+
var source = ToComparisonSource(html);
46+
47+
sut.Filter(source, FilterDecision.Keep).ShouldBe(FilterDecision.Keep);
48+
sut.Filter(source, FilterDecision.Exclude).ShouldBe(FilterDecision.Exclude);
49+
}
50+
51+
// When a parent node has overridden the global whitespace option, that overridden option is used
52+
}
53+
}
54+
55+
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
using Egil.AngleSharp.Diffing.Core;
2+
using System.Collections.Generic;
3+
using System.Globalization;
4+
using System.Linq;
5+
6+
namespace Egil.AngleSharp.Diffing.Strategies.TextNodeStrategies
7+
{
8+
public abstract class TextnodeStrategyTestBase : DiffingTestBase
9+
{
10+
public static readonly char[] AllWhitespaceCharacters = new[]
11+
{
12+
// SpaceSeparator category
13+
'\u0020', '\u00A0', '\u1680', '\u2000', '\u2001', '\u2002', '\u2003', '\u2004', '\u2005', '\u2006', '\u2007', '\u2008', '\u2009', '\u200A', '\u202F', '\u205F', '\u3000',
14+
// LineSeparator category
15+
'\u2028',
16+
//ParagraphSeparator category
17+
'\u2029',
18+
// Control characters: CHARACTER TABULATION, LINE FEED, LINE TABULATION, FORM FEED, CARRIAGE RETURN, NEXT LINE
19+
'\u0009','\u000A','\u000B','\u000C','\u000D','\u0085'
20+
};
21+
22+
public static readonly IEnumerable<object[]> WhitespaceCharStrings = AllWhitespaceCharacters.Select(c => new string[] { c.ToString(CultureInfo.InvariantCulture) }).ToArray();
23+
}
24+
}
25+
26+

tests/Strategies/WhitespaceStrategyTest.cs

Lines changed: 0 additions & 77 deletions
This file was deleted.

0 commit comments

Comments
 (0)