Skip to content

Commit 1f22d9f

Browse files
authored
Merge pull request #649 from tneotia/bugfix/leading-whitespaces
Fix leading whitespaces issues
2 parents 1ee8a8e + 12b48b1 commit 1f22d9f

File tree

2 files changed, with 98 additions and 21 deletions.

lib/html_parser.dart

Lines changed: 64 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -538,8 +538,7 @@ class HtmlParser extends StatelessWidget {
538 538
/// between and among inline elements. It does so by creating a boolean [Context]
539 539
/// and passing it to the [_processInlineWhitespaceRecursive] function.
540 540
static StyledElement _processInlineWhitespace(StyledElement tree) {
541-
final whitespaceParsingContext = Context(false);
542-
tree = _processInlineWhitespaceRecursive(tree, whitespaceParsingContext);
541+
tree = _processInlineWhitespaceRecursive(tree, Context(false));
543 542
return tree;
544 543
}
545 544

@@ -548,33 +547,77 @@ class HtmlParser extends StatelessWidget {
548 547
/// to the w3's HTML whitespace processing specification linked to above.
549 548
static StyledElement _processInlineWhitespaceRecursive(
550 549
StyledElement tree,
551-
Context<bool> wpc,
550+
Context<bool> keepLeadingSpace,
552 551
) {
553-
if (tree.style.display == Display.BLOCK) {
554-
wpc.data = false;
555-
}
556-
557-
if (tree is ImageContentElement || tree is SvgContentElement) {
558-
wpc.data = false;
559-
}
560-
561 552
if (tree is TextContentElement) {
562-
int index = -1;
563-
if ((tree.element?.nodes.length ?? 0) > 1) {
564-
index = tree.element?.nodes.indexWhere((element) => element == tree.node) ?? -1;
553+
/// initialize indices to negative numbers to make conditionals a little easier
554+
int textIndex = -1;
555+
int elementIndex = -1;
556+
/// initialize parent after to a whitespace to account for elements that are
557+
/// the last child in the list of elements
558+
String parentAfterText = " ";
559+
/// find the index of the text in the current tree
560+
if ((tree.element?.nodes.length ?? 0) >= 1) {
561+
textIndex = tree.element?.nodes.indexWhere((element) => element == tree.node) ?? -1;
562+
}
563+
/// get the parent nodes
564+
dom.NodeList? parentNodes = tree.element?.parent?.nodes;
565+
/// find the index of the tree itself in the parent nodes
566+
if ((parentNodes?.length ?? 0) >= 1) {
567+
elementIndex = parentNodes?.indexWhere((element) => element == tree.element) ?? -1;
565 568
}
566-
if (index < 1 && tree.text!.startsWith(' ')
567-
&& tree.element?.localName != "br") {
569+
/// if the tree is any node except the last node in the node list and the
570+
/// next node in the node list is a text node, then get its text. Otherwise
571+
/// the next node will be a [dom.Element], so keep unwrapping that until
572+
/// we get the underlying text node, and finally get its text.
573+
if (elementIndex < (parentNodes?.length ?? 1) - 1 && parentNodes?[elementIndex + 1] is dom.Text) {
574+
parentAfterText = parentNodes?[elementIndex + 1].text ?? " ";
575+
} else if (elementIndex < (parentNodes?.length ?? 1) - 1) {
576+
var parentAfter = parentNodes?[elementIndex + 1];
577+
while (parentAfter is dom.Element) {
578+
if (parentAfter.nodes.isNotEmpty) {
579+
parentAfter = parentAfter.nodes.first;
580+
} else {
581+
break;
582+
}
583+
}
584+
parentAfterText = parentAfter?.text ?? " ";
585+
}
586+
/// If the text is the first element in the current tree node list, it
587+
/// starts with a whitespace, it isn't a line break, and either the
588+
/// whitespace is unnecessary or it is a block element, delete it.
589+
///
590+
/// We should also delete the whitespace at any point in the node list
591+
/// if the previous element is a <br> because that tag makes the element
592+
/// act like a block element.
593+
if (textIndex < 1
594+
&& tree.text!.startsWith(' ')
595+
&& tree.element?.localName != "br"
596+
&& (!keepLeadingSpace.data
597+
|| BLOCK_ELEMENTS.contains(tree.element?.localName ?? ""))
598+
) {
599+
tree.text = tree.text!.replaceFirst(' ', '');
600+
} else if (textIndex >= 1
601+
&& tree.text!.startsWith(' ')
602+
&& tree.element?.nodes[textIndex - 1] is dom.Element
603+
&& (tree.element?.nodes[textIndex - 1] as dom.Element).localName == "br"
604+
) {
568 605
tree.text = tree.text!.replaceFirst(' ', '');
569 606
}
570-
if (index == (tree.element?.nodes.length ?? 1) - 1
571-
&& (tree.text!.endsWith(' ') || tree.text!.endsWith('\n'))
572-
&& tree.element?.localName != "br") {
573-
tree.text = tree.text!.trimRight();
607+
/// If the text is the last element in the current tree node list, it isn't
608+
/// a line break, and the next text node starts with a whitespace,
609+
/// update the [Context] to signify to that next text node whether it should
610+
/// keep its whitespace. This is based on whether the current text ends with a
611+
/// whitespace.
612+
if (textIndex == (tree.element?.nodes.length ?? 1) - 1
613+
&& tree.element?.localName != "br"
614+
&& parentAfterText.startsWith(' ')
615+
) {
616+
keepLeadingSpace.data = !tree.text!.endsWith(' ');
574 617
}
575 618
}
576 619

577-
tree.children.forEach((e) => _processInlineWhitespaceRecursive(e, wpc));
620+
tree.children.forEach((e) => _processInlineWhitespaceRecursive(e, keepLeadingSpace));
578 621

579 622
return tree;
580 623
}

lib/src/html_elements.dart

Lines changed: 34 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -70,6 +70,40 @@ const STYLED_ELEMENTS = [
70 70
"ul",
71 71
];
72 72

73+
const BLOCK_ELEMENTS = [
74+
"article",
75+
"aside",
76+
"blockquote",
77+
"body",
78+
"center",
79+
"dd",
80+
"div",
81+
"dl",
82+
"dt",
83+
"figcaption",
84+
"figure",
85+
"footer",
86+
"h1",
87+
"h2",
88+
"h3",
89+
"h4",
90+
"h5",
91+
"h6",
92+
"header",
93+
"hr",
94+
"html",
95+
"li",
96+
"main",
97+
"nav",
98+
"noscript",
99+
"ol",
100+
"p",
101+
"pre",
102+
"section",
103+
"summary",
104+
"ul",
105+
];
106+
73 107
const INTERACTABLE_ELEMENTS = [
74 108
"a",
75 109
];

0 commit comments

Comments
 (0)