You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Fixes #2267
The implementation of the adoption agency algorithm had drifted from the HTML5 spec over time. This change brings it back in line with the current spec.
The spec's table fostering step during adoption is not yet implemented; it is marked as a todo.
Given the complexity of the algorithm and how it has changed over time, I opted to include the spec steps inline as comments, so that later spec changes are hopefully easier to adopt.
* Reimplemented the HTML5 Adoption Agency Algorithm to the current spec. This handles mis-nested formatting / structural elements. [2278](https://github.com/jhy/jsoup/pull/2278)
// JH: Including the spec notes here to simplify tracking / correcting. It's a bit gnarly and there may still be some nuances I haven't caught. But test cases and comparisons to browsers check out.
838
+
839
+
// The adoption agency algorithm, which takes as its only argument a token _token_ for which the algorithm is being run, consists of the following steps:
835
840
finalToken.EndTagendTag = t.asEndTag();
836
-
finalStringname = endTag.normalName();
841
+
finalStringsubject = endTag.normalName; // 1. Let subject be token's tag name.
// 2. If the [current node] is an [HTML element] whose tag name is subject, and the [current node] is not in the [list of active formatting elements], then pop the [current node] off the [stack of open elements] and return.
844
+
if (tb.currentElement().normalName().equals(subject) && !tb.isInActiveFormattingElements(tb.currentElement())) {
845
+
tb.pop();
846
+
returntrue;
847
+
}
848
+
intouter = 0; // 3. Let outerLoopCounter be 0.
849
+
while (true) { // 4. While true:
850
+
if (outer >= 8) { // 1. If outerLoopCounter is greater than or equal to 8, then return.
851
+
returntrue;
852
+
}
853
+
outer++; // 2. Increment outerLoopCounter by 1.
854
+
855
+
// 3. Let formattingElement be the last element in the [list of active formatting elements] that:
856
+
// - is between the end of the list and the last [marker] in the list, if any, or the start of the list otherwise, and
857
+
// - has the tag name subject.
858
+
// If there is no such element, then return and instead act as described in the "any other end tag" entry above.
859
+
ElementformatEl = null;
860
+
for (inti = tb.formattingElements.size() - 1; i >= 0; i--) {
861
+
Elementnext = tb.formattingElements.get(i);
862
+
if (next == null) // marker
863
+
break;
864
+
if (next.normalName().equals(subject)) {
865
+
formatEl = next;
866
+
break;
867
+
}
868
+
}
869
+
if (formatEl == null) {
843
870
returnanyOtherEndTag(t, tb);
844
-
elseif (!tb.onStack(formatEl)) {
871
+
}
872
+
873
+
// 4. If formattingElement is not in the [stack of open elements], then this is a [parse error]; remove the element from the list, and return.
874
+
if (!tb.onStack(formatEl)) {
845
875
tb.error(this);
846
876
tb.removeFromActiveFormattingElements(formatEl);
847
877
returntrue;
848
-
} elseif (!tb.inScope(formatEl.normalName())) {
878
+
}
879
+
880
+
// 5. If formattingElement is in the [stack of open elements], but the element is not [in scope], then this is a [parse error]; return.
881
+
if (!tb.inScope(formatEl.normalName())) {
849
882
tb.error(this);
850
883
returnfalse;
851
-
} elseif (tb.currentElement() != formatEl)
884
+
} elseif (tb.currentElement() != formatEl) { // 6. If formattingElement is not the [current node], this is a [parse error].
852
885
tb.error(this);
886
+
}
853
887
888
+
// 7. Let furthestBlock be the topmost node in the [stack of open elements] that is lower in the stack than formattingElement, and is an element in the [special] category. There might not be one.
854
889
ElementfurthestBlock = null;
855
-
ElementcommonAncestor = null;
856
-
booleanseenFormattingElement = false;
857
-
// the spec doesn't limit to < 64, but in degenerate cases (9000+ stack depth) this prevents run-aways
858
-
finalintstackSize = stack.size();
859
-
intbookmark = -1;
860
-
for (intsi = 1; si < stackSize && si < 64; si++) {
861
-
// TODO: this no longer matches the current spec at https://html.spec.whatwg.org/#adoption-agency-algorithm and should be updated
862
-
el = stack.get(si);
863
-
if (el == formatEl) {
864
-
commonAncestor = stack.get(si - 1);
865
-
seenFormattingElement = true;
866
-
// Let a bookmark note the position of the formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
// 8. If there is no furthestBlock, then the UA must first pop all the nodes from the bottom of the [stack of open elements], from the [current node] up to and including formattingElement, then remove formattingElement from the [list of active formatting elements], and finally return.
873
903
if (furthestBlock == null) {
874
-
tb.popStackToClose(formatEl.normalName());
904
+
while (tb.currentElement() != formatEl) {
905
+
tb.pop();
906
+
}
907
+
tb.pop();
875
908
tb.removeFromActiveFormattingElements(formatEl);
876
909
returntrue;
877
910
}
878
911
879
-
Elementnode = furthestBlock;
912
+
ElementcommonAncestor = tb.aboveOnStack(formatEl); // 9. Let commonAncestor be the element immediately above formattingElement in the [stack of open elements].
913
+
if (commonAncestor == null) { tb.error(this); returntrue; } // Would be a WTF
914
+
915
+
// 10. Let a bookmark note the position of formattingElement in the [list of active formatting elements] relative to the elements on either side of it in the list.
916
+
// JH - I think this means its index? Or do we need a linked list?
917
+
intbookmark = tb.positionOfElement(formatEl);
918
+
919
+
Elementnode = furthestBlock; // 11. Let node and lastNode be furthestBlock.
880
920
ElementlastNode = furthestBlock;
881
-
for (intj = 0; j < 3; j++) {
882
-
if (tb.onStack(node))
921
+
intinner = 0; // 12. Let innerLoopCounter be 0.
922
+
923
+
while (true) { // 13. While true:
924
+
inner++; // 1. Increment innerLoopCounter by 1.
925
+
// 2. Let node be the element immediately above node in the [stack of open elements], or if node is no longer in the [stack of open elements], the element that was immediately above node in the [stack of open elements] before node was removed.
926
+
if (!tb.onStack(node)) {
927
+
// if node was removed from stack, use the element that was above it
928
+
node = node.parent(); // JH - is there a situation where it's not the parent?
929
+
} else {
883
930
node = tb.aboveOnStack(node);
884
-
if (!tb.isInActiveFormattingElements(node)) { // note no bookmark check
931
+
}
932
+
if (node == null) {
933
+
tb.error(this); // shouldn't be able to hit
934
+
break;
935
+
}
936
+
// 3. If node is formattingElement, then [break].
937
+
if (node == formatEl) {
938
+
break;
939
+
}
940
+
941
+
// 4. If innerLoopCounter is greater than 3 and node is in the [list of active formatting elements], then remove node from the [list of active formatting elements].
942
+
if (inner > 3 && tb.isInActiveFormattingElements(node)) {
943
+
tb.removeFromActiveFormattingElements(node);
944
+
break;
945
+
}
946
+
// 5. If node is not in the [list of active formatting elements], then remove node from the [stack of open elements] and [continue].
947
+
if (!tb.isInActiveFormattingElements(node)) {
885
948
tb.removeFromStack(node);
886
949
continue;
887
-
} elseif (node == formatEl)
888
-
break;
950
+
}
889
951
952
+
// 6. [Create an element for the token] for which the element node was created, in the [HTML namespace], with commonAncestor as the intended parent; replace the entry for node in the [list of active formatting elements] with an entry for the new element, replace the entry for node in the [stack of open elements] with an entry for the new element, and let node be the new element.
// 7. If lastNode is furthestBlock, then move the aforementioned bookmark to be immediately after the new node in the [list of active formatting elements].
896
959
if (lastNode == furthestBlock) {
897
-
// move the aforementioned bookmark to be immediately after the new node in the list of active formatting elements.
898
-
// not getting how this bookmark both straddles the element above, but is inbetween here...
899
960
bookmark = tb.positionOfElement(node) + 1;
900
961
}
901
-
if (lastNode.parent() != null)
902
-
lastNode.remove();
903
-
node.appendChild(lastNode);
962
+
node.appendChild(lastNode); // 8. [Append] lastNode to node.
963
+
lastNode = node; // 9. Set lastNode to node.
964
+
} // end inner loop # 13
904
965
905
-
lastNode = node;
966
+
// 14. Insert whatever lastNode ended up being in the previous step at the [appropriate place for inserting a node], but using commonAncestor as the _override target_.
967
+
// todo - impl https://html.spec.whatwg.org/multipage/parsing.html#appropriate-place-for-inserting-a-node fostering
968
+
// just use commonAncestor as target:
969
+
commonAncestor.appendChild(lastNode);
970
+
// 15. [Create an element for the token] for which formattingElement was created, in the [HTML namespace], with furthestBlock as the intended parent.
furthestBlock.appendChild(adoptor); // 17. Append that new element to furthestBlock.
979
+
// 18. Remove formattingElement from the [list of active formatting elements], and insert the new element into the [list of active formatting elements] at the position of the aforementioned bookmark.
924
980
tb.removeFromActiveFormattingElements(formatEl);
925
-
// insert the new element into the list of active formatting elements at the position of the aforementioned bookmark.
926
-
tb.pushWithBookmark(adopter, bookmark);
981
+
tb.pushWithBookmark(adoptor, bookmark);
982
+
// 19. Remove formattingElement from the [stack of open elements], and insert the new element into the [stack of open elements] immediately below the position of furthestBlock in that stack.
0 commit comments