Skip to content

Commit cee7563

Browse files
authored
Reimplement the adoption agency algorithm (#2278)
Fixes #2267 The implementation of the adoption agency algorithm vs the HTML5 spec had drifted over time. This fixes that. The spec for table fostering when in adoption is not presently implemented, marked as a todo. Given the complexity of the algo and how it has changed over time, I opted to include it inline so that hopefully later changes are easier to adopt.
1 parent 1b36b06 commit cee7563

File tree

5 files changed

+208
-67
lines changed

5 files changed

+208
-67
lines changed

CHANGES.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
* When converting parsed HTML to XML or the W3C DOM, element names containing `<` are normalized to `_` to ensure valid
4545
XML. For example, `<foo<bar>` becomes `<foo_bar>`, as XML does not allow `<` in element names, but HTML5
4646
does. [2276](https://github.com/jhy/jsoup/pull/2276)
47+
* Reimplemented the HTML5 Adoption Agency Algorithm to the current spec. This handles mis-nested formating / structural elements. [2278](https://github.com/jhy/jsoup/pull/2278)
4748

4849
### Bug Fixes
4950

src/main/java/org/jsoup/parser/HtmlTreeBuilder.java

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ public class HtmlTreeBuilder extends TreeBuilder {
5959
private @Nullable Element headElement; // the current head element
6060
private @Nullable FormElement formElement; // the current form element
6161
private @Nullable Element contextElement; // fragment parse root; name only copy of context. could be null even if fragment parsing
62-
private ArrayList<Element> formattingElements; // active (open) formatting elements
62+
ArrayList<Element> formattingElements; // active (open) formatting elements
6363
private ArrayList<HtmlTreeBuilderState> tmplInsertMode; // stack of Template Insertion modes
6464
private List<Token.Character> pendingTableCharacters; // chars in table to be shifted out
6565
private Token.EndTag emptyEnd; // reused empty end tag
@@ -539,6 +539,13 @@ private void clearStackToContext(String... nodeNames) {
539539
}
540540
}
541541

542+
/**
543+
Gets the Element immediately above the supplied element on the stack. Which due to adoption, may not necessarily be
544+
its parent.
545+
546+
@param el
547+
@return the Element immediately above the supplied element, or null if there is no such element.
548+
*/
542549
@Nullable Element aboveOnStack(Element el) {
543550
assert onStack(el);
544551
for (int pos = stack.size() -1; pos >= 0; pos--) {

src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java

Lines changed: 119 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,13 @@
77
import org.jsoup.nodes.Document;
88
import org.jsoup.nodes.DocumentType;
99
import org.jsoup.nodes.Element;
10+
import org.jsoup.nodes.Node;
1011
import org.jsoup.nodes.Range;
1112

1213
import java.util.ArrayList;
1314

1415
import static org.jsoup.internal.StringUtil.inSorted;
16+
import static org.jsoup.parser.HtmlTreeBuilder.isSpecial;
1517
import static org.jsoup.parser.HtmlTreeBuilderState.Constants.*;
1618

1719
/**
@@ -360,7 +362,7 @@ private boolean inBodyStartTag(Token t, HtmlTreeBuilder tb) {
360362
tb.processEndTag("li");
361363
break;
362364
}
363-
if (HtmlTreeBuilder.isSpecial(el) && !inSorted(el.normalName(), Constants.InBodyStartLiBreakers))
365+
if (isSpecial(el) && !inSorted(el.normalName(), Constants.InBodyStartLiBreakers))
364366
break;
365367
}
366368
if (tb.inButtonScope("p")) {
@@ -560,7 +562,7 @@ private boolean inBodyStartTag(Token t, HtmlTreeBuilder tb) {
560562
tb.processEndTag(el.normalName());
561563
break;
562564
}
563-
if (HtmlTreeBuilder.isSpecial(el) && !inSorted(el.normalName(), Constants.InBodyStartLiBreakers))
565+
if (isSpecial(el) && !inSorted(el.normalName(), Constants.InBodyStartLiBreakers))
564566
break;
565567
}
566568
if (tb.inButtonScope("p")) {
@@ -821,7 +823,7 @@ boolean anyOtherEndTag(Token t, HtmlTreeBuilder tb) {
821823
tb.popStackToClose(name);
822824
break;
823825
} else {
824-
if (HtmlTreeBuilder.isSpecial(node)) {
826+
if (isSpecial(node)) {
825827
tb.error(this);
826828
return false;
827829
}
@@ -830,104 +832,157 @@ boolean anyOtherEndTag(Token t, HtmlTreeBuilder tb) {
830832
return true;
831833
}
832834

833-
// Adoption Agency Algorithm.
834835
private boolean inBodyEndTagAdoption(Token t, HtmlTreeBuilder tb) {
836+
// https://html.spec.whatwg.org/multipage/parsing.html#adoption-agency-algorithm
837+
// JH: Including the spec notes here to simplify tracking / correcting. It's a bit gnarly and there may still be some nuances I haven't caught. But test cases and comparisons to browsers check out.
838+
839+
// The adoption agency algorithm, which takes as its only argument a token token for which the algorithm is being run, consists of the following steps:
835840
final Token.EndTag endTag = t.asEndTag();
836-
final String name = endTag.normalName();
841+
final String subject = endTag.normalName; // 1. Let subject be token's tag name.
837842

838-
final ArrayList<Element> stack = tb.getStack();
839-
Element el;
840-
for (int i = 0; i < 8; i++) {
841-
Element formatEl = tb.getActiveFormattingElement(name);
842-
if (formatEl == null)
843+
// 2. If the [current node] is an [HTML element] whose tag name is subject, and the [current node] is not in the [list of active formatting elements], then pop the [current node] off the [stack of open elements] and return.
844+
if (tb.currentElement().normalName().equals(subject) && !tb.isInActiveFormattingElements(tb.currentElement())) {
845+
tb.pop();
846+
return true;
847+
}
848+
int outer = 0; // 3. Let outerLoopCounter be 0.
849+
while (true) { // 4. While true:
850+
if (outer >= 8) { // 1. If outerLoopCounter is greater than or equal to 8, then return.
851+
return true;
852+
}
853+
outer++; // 2. Increment outerLoopCounter by 1.
854+
855+
// 3. Let formattingElement be the last element in the [list of active formatting elements] that:
856+
// - is between the end of the list and the last [marker] in the list, if any, or the start of the list otherwise, and
857+
// - has the tag name subject.
858+
// If there is no such element, then return and instead act as described in the "any other end tag" entry above.
859+
Element formatEl = null;
860+
for (int i = tb.formattingElements.size() - 1; i >= 0; i--) {
861+
Element next = tb.formattingElements.get(i);
862+
if (next == null) // marker
863+
break;
864+
if (next.normalName().equals(subject)) {
865+
formatEl = next;
866+
break;
867+
}
868+
}
869+
if (formatEl == null) {
843870
return anyOtherEndTag(t, tb);
844-
else if (!tb.onStack(formatEl)) {
871+
}
872+
873+
// 4. If formattingElement is not in the [stack of open elements], then this is a [parse error]; remove the element from the list, and return.
874+
if (!tb.onStack(formatEl)) {
845875
tb.error(this);
846876
tb.removeFromActiveFormattingElements(formatEl);
847877
return true;
848-
} else if (!tb.inScope(formatEl.normalName())) {
878+
}
879+
880+
// 5. If formattingElement is in the [stack of open elements], but the element is not [in scope], then this is a [parse error]; return.
881+
if (!tb.inScope(formatEl.normalName())) {
849882
tb.error(this);
850883
return false;
851-
} else if (tb.currentElement() != formatEl)
884+
} else if (tb.currentElement() != formatEl) { // 6. If formattingElement is not the [current node], this is a [parse error].
852885
tb.error(this);
886+
}
853887

888+
// 7. Let furthestBlock be the topmost node in the [stack of open elements] that is lower in the stack than formattingElement, and is an element in the [special]category. There might not be one.
854889
Element furthestBlock = null;
855-
Element commonAncestor = null;
856-
boolean seenFormattingElement = false;
857-
// the spec doesn't limit to < 64, but in degenerate cases (9000+ stack depth) this prevents run-aways
858-
final int stackSize = stack.size();
859-
int bookmark = -1;
860-
for (int si = 1; si < stackSize && si < 64; si++) {
861-
// TODO: this no longer matches the current spec at https://html.spec.whatwg.org/#adoption-agency-algorithm and should be updated
862-
el = stack.get(si);
863-
if (el == formatEl) {
864-
commonAncestor = stack.get(si - 1);
865-
seenFormattingElement = true;
866-
// Let a bookmark note the position of the formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
867-
bookmark = tb.positionOfElement(el);
868-
} else if (seenFormattingElement && HtmlTreeBuilder.isSpecial(el)) {
869-
furthestBlock = el;
870-
break;
890+
ArrayList<Element> stack = tb.getStack();
891+
int fei = stack.lastIndexOf(formatEl);
892+
if (fei != -1) { // look down the stack
893+
for (int i = fei + 1; i < stack.size(); i++) {
894+
Element el = stack.get(i);
895+
if (isSpecial(el)) {
896+
furthestBlock = el;
897+
break;
898+
}
871899
}
872900
}
901+
902+
// 8. If there is no furthestBlock, then the UA must first pop all the nodes from the bottom of the [stack of open elements], from the [current node] up to and including formattingElement, then remove formattingElement from the [list of active formatting elements], and finally return.
873903
if (furthestBlock == null) {
874-
tb.popStackToClose(formatEl.normalName());
904+
while (tb.currentElement() != formatEl) {
905+
tb.pop();
906+
}
907+
tb.pop();
875908
tb.removeFromActiveFormattingElements(formatEl);
876909
return true;
877910
}
878911

879-
Element node = furthestBlock;
912+
Element commonAncestor = tb.aboveOnStack(formatEl); // 9. Let commonAncestor be the element immediately above formattingElement in the [stack of open elements].
913+
if (commonAncestor == null) { tb.error(this); return true; } // Would be a WTF
914+
915+
// 10. Let a bookmark note the position of formattingElement in the [list of active formatting elements] relative to the elements on either side of it in the list.
916+
// JH - I think this means its index? Or do we need a linked list?
917+
int bookmark = tb.positionOfElement(formatEl);
918+
919+
Element node = furthestBlock; // 11. Let node and lastNode be furthestBlock.
880920
Element lastNode = furthestBlock;
881-
for (int j = 0; j < 3; j++) {
882-
if (tb.onStack(node))
921+
int inner = 0; // 12. Let innerLoopCounter be 0.
922+
923+
while (true) { // 13. While true:
924+
inner++; // 1. Increment innerLoopCounter by 1.
925+
// 2. Let node be the element immediately above node in the [stack of open elements], or if node is no longer in the [stack of open elements] , the element that was immediately above node in the [stack of open elements] before node was removed.
926+
if (!tb.onStack(node)) {
927+
// if node was removed from stack, use the element that was above it
928+
node = node.parent(); // JH - is there a situation where it's not the parent?
929+
} else {
883930
node = tb.aboveOnStack(node);
884-
if (!tb.isInActiveFormattingElements(node)) { // note no bookmark check
931+
}
932+
if (node == null) {
933+
tb.error(this); // shouldn't be able to hit
934+
break;
935+
}
936+
// 3. If node is formattingElement, then [break].
937+
if (node == formatEl) {
938+
break;
939+
}
940+
941+
// 4. If innerLoopCounter is greater than 3 and node is in the [list of active formatting elements], then remove node from the [list of active formatting elements].
942+
if (inner > 3 && tb.isInActiveFormattingElements(node)) {
943+
tb.removeFromActiveFormattingElements(node);
944+
break;
945+
}
946+
// 5. If node is not in the [list of active formatting elements], then remove node from the [stack of open elements] and [continue].
947+
if (!tb.isInActiveFormattingElements(node)) {
885948
tb.removeFromStack(node);
886949
continue;
887-
} else if (node == formatEl)
888-
break;
950+
}
889951

952+
// 6. [Create an element for the token] for which the element node was created, in the [HTML namespace], with commonAncestor as the intended parent; replace the entry for node in the [list of active formatting elements] with an entry for the new element, replace the entry for node in the [stack of open elements] with an entry for the new element, and let node be the new element.
890953
Element replacement = new Element(tb.tagFor(node.nodeName(), ParseSettings.preserveCase), tb.getBaseUri());
891-
// case will follow the original node (so honours ParseSettings)
892954
tb.replaceActiveFormattingElement(node, replacement);
893955
tb.replaceOnStack(node, replacement);
894956
node = replacement;
895957

958+
// 7. If lastNode is furthestBlock, then move the aforementioned bookmark to be immediately after the new node in the [list of active formatting elements].
896959
if (lastNode == furthestBlock) {
897-
// move the aforementioned bookmark to be immediately after the new node in the list of active formatting elements.
898-
// not getting how this bookmark both straddles the element above, but is inbetween here...
899960
bookmark = tb.positionOfElement(node) + 1;
900961
}
901-
if (lastNode.parent() != null)
902-
lastNode.remove();
903-
node.appendChild(lastNode);
962+
node.appendChild(lastNode); // 8. [Append] lastNode to node.
963+
lastNode = node; // 9. Set lastNode to node.
964+
} // end inner loop # 13
904965

905-
lastNode = node;
966+
// 14. Insert whatever lastNode ended up being in the previous step at the [appropriate place for inserting a node], but using commonAncestor as the _override target_.
967+
// todo - impl https://html.spec.whatwg.org/multipage/parsing.html#appropriate-place-for-inserting-a-node fostering
968+
// just use commonAncestor as target:
969+
commonAncestor.appendChild(lastNode);
970+
// 15. [Create an element for the token] for which formattingElement was created, in the [HTML namespace], with furthestBlock as the intended parent.
971+
Element adoptor = new Element(formatEl.tag(), tb.getBaseUri());
972+
adoptor.attributes().addAll(formatEl.attributes()); // also attributes
973+
// 16. Take all of the child nodes of furthestBlock and append them to the element created in the last step.
974+
for (Node child : furthestBlock.childNodes()) {
975+
adoptor.appendChild(child);
906976
}
907977

908-
if (commonAncestor != null) { // safety check, but would be an error if null
909-
if (inSorted(commonAncestor.normalName(), Constants.InBodyEndTableFosters)) {
910-
if (lastNode.parent() != null)
911-
lastNode.remove();
912-
tb.insertInFosterParent(lastNode);
913-
} else {
914-
if (lastNode.parent() != null)
915-
lastNode.remove();
916-
commonAncestor.appendChild(lastNode);
917-
}
918-
}
919-
920-
Element adopter = new Element(formatEl.tag(), tb.getBaseUri());
921-
adopter.attributes().addAll(formatEl.attributes());
922-
adopter.appendChildren(furthestBlock.childNodes());
923-
furthestBlock.appendChild(adopter);
978+
furthestBlock.appendChild(adoptor); // 17. Append that new element to furthestBlock.
979+
// 18. Remove formattingElement from the [list of active formatting elements], and insert the new element into the [list of active formatting elements] at the position of the aforementioned bookmark.
924980
tb.removeFromActiveFormattingElements(formatEl);
925-
// insert the new element into the list of active formatting elements at the position of the aforementioned bookmark.
926-
tb.pushWithBookmark(adopter, bookmark);
981+
tb.pushWithBookmark(adoptor, bookmark);
982+
// 19. Remove formattingElement from the [stack of open elements], and insert the new element into the [stack of open elements] immediately below the position of furthestBlock in that stack.
927983
tb.removeFromStack(formatEl);
928-
tb.insertOnStackAfter(furthestBlock, adopter);
929-
}
930-
return true;
984+
tb.insertOnStackAfter(furthestBlock, adoptor);
985+
} // end of outer loop # 4
931986
}
932987
},
933988
Text {

src/test/java/org/jsoup/parser/HtmlParserTest.java

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -673,8 +673,10 @@ private static Stream<Arguments> dupeAttributeData() {
673673
}
674674

675675
@Test public void handlesMisnestedAInDivs() {
676-
String h = "<a href='#1'><div><div><a href='#2'>child</a></div</div></a>";
677-
String w = "<a href=\"#1\"></a> <div> <a href=\"#1\"></a> <div> <a href=\"#1\"></a><a href=\"#2\">child</a> </div> </div>";
676+
String h = "<a 1><div 2><div 3><a 4>child</a></div></div></a>";
677+
String w = "<a 1></a> <div 2> <a 1=\"\"></a> <div 3> <a 1=\"\"></a><a 4>child</a> </div> </div>"; // chrome checked
678+
// todo - come back to how we copy the attributes, to keep boolean setting (not ="")
679+
678680
Document doc = Jsoup.parse(h);
679681
assertEquals(
680682
StringUtil.normaliseWhitespace(w),
@@ -1509,6 +1511,13 @@ private boolean didAddElements(String input) {
15091511
assertEquals("<a> <b> </b></a><b><div><a> </a><a>test</a></div></b>", TextUtil.stripNewlines(doc.body().html()));
15101512
}
15111513

1514+
@Test public void adoption() throws IOException {
1515+
// https://github.com/jhy/jsoup/issues/2267
1516+
File input = ParseTest.getFile("/htmltests/adopt-1.html");
1517+
Document doc = Jsoup.parse(input);
1518+
assertEquals("TEXT-AAA TEXT-BBB TEXT-CCC TEXT-DDD", doc.text());
1519+
}
1520+
15121521
@Test public void tagsMustStartWithAscii() {
15131522
// https://github.com/jhy/jsoup/issues/1006
15141523
String[] valid = {"a一", "a会员挂单金额5", "table(╯°□°)╯"};

0 commit comments

Comments
 (0)