Skip to content

Commit cbe934a

Browse files
committed
Isolate TagSet copies to prevent shared mutation
TagSet copies no longer pull through from the mutable template; they copy existing tags and only use the template’s fallback (the HTML defaults) for lazy lookups. This keeps parser/session tagsets independent and prevents cross-thread mutation that can lead to ConcurrentModificationException. This keeps the use of the HTML default tagset sparse in memory.
1 parent 90019cb commit cbe934a

File tree

5 files changed

+119
-9
lines changed

5 files changed

+119
-9
lines changed

CHANGES.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
* Custom tags marked as `Tag.Void` now parse and serialize like the built-in void elements: they no longer consume following content, and the XML serializer emits the expected self-closing form. [#2425](https://github.com/jhy/jsoup/issues/2425)
3434
* The `<br>` element is once again classified as an inline tag (`Tag.isBlock() == false`), matching common developer expectations and its role as phrasing content in HTML, while pretty-printing and text extraction continue to treat it as a line break in the rendered output. [#2387](https://github.com/jhy/jsoup/issues/2387), [#2439](https://github.com/jhy/jsoup/issues/2439)
3535
* Fixed an intermittent truncation when fetching and parsing remote documents via `Jsoup.connect(url).get()`. On responses without a charset header, the initial charset sniff could sometimes (depending on buffering / `available()` behavior) be mistaken for end-of-stream and a partial parse reused, dropping trailing content. [#2448](https://github.com/jhy/jsoup/issues/2448)
36+
* TagSet copies no longer mutate their template during lazy lookups, preventing cross-thread `ConcurrentModificationException` when parsing with shared sessions.
3637

3738

3839
### Internal Changes

src/main/java/org/jsoup/parser/TagSet.java

Lines changed: 31 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,24 +25,48 @@ public class TagSet {
2525
static final TagSet HtmlTagSet = initHtmlDefault();
2626

2727
private final Map<String, Map<String, Tag>> tags = new HashMap<>(); // namespace -> tag name -> Tag
28-
private final @Nullable TagSet source; // source to pull tags from on demand
28+
private final @Nullable TagSet source; // internal fallback for lazy tag copies
2929
private @Nullable ArrayList<Consumer<Tag>> customizers; // optional onNewTag tag customizer
3030

3131
/**
3232
Returns a mutable copy of the default HTML tag set.
3333
*/
3434
public static TagSet Html() {
35-
return new TagSet(HtmlTagSet);
35+
return new TagSet(HtmlTagSet, null);
36+
}
37+
38+
private TagSet(@Nullable TagSet source, @Nullable ArrayList<Consumer<Tag>> customizers) {
39+
this.source = source;
40+
this.customizers = customizers;
3641
}
3742

3843
public TagSet() {
39-
source = null;
44+
this(null, null);
45+
}
46+
47+
/**
48+
Creates a new TagSet by copying the current tags and customizers from the provided source TagSet. Changes made to
49+
one TagSet will not affect the other.
50+
@param template the TagSet to copy
51+
*/
52+
public TagSet(TagSet template) {
53+
this(template.source, copyCustomizers(template));
54+
// copy tags eagerly; any lazy pull-through should come only from the root source (which would be the HTML defaults), not the template itself.
55+
// that way the template tagset is not mutated when we do read through
56+
if (template.tags.isEmpty()) return;
57+
58+
for (Map.Entry<String, Map<String, Tag>> namespaceEntry : template.tags.entrySet()) {
59+
Map<String, Tag> nsTags = new HashMap<>(namespaceEntry.getValue().size());
60+
for (Map.Entry<String, Tag> tagEntry : namespaceEntry.getValue().entrySet()) {
61+
nsTags.put(tagEntry.getKey(), tagEntry.getValue().clone());
62+
}
63+
tags.put(namespaceEntry.getKey(), nsTags);
64+
}
4065
}
4166

42-
public TagSet(TagSet original) {
43-
this.source = original;
44-
if (original.customizers != null)
45-
this.customizers = new ArrayList<>(original.customizers);
67+
private static @Nullable ArrayList<Consumer<Tag>> copyCustomizers(TagSet base) {
68+
if (base.customizers == null) return null;
69+
return new ArrayList<>(base.customizers);
4670
}
4771

4872
/**

src/test/java/org/jsoup/integration/SessionTest.java

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,14 @@
77
import org.jsoup.integration.servlets.FileServlet;
88
import org.jsoup.nodes.Document;
99
import org.jsoup.parser.Parser;
10+
import org.jsoup.parser.Tag;
11+
import org.jsoup.parser.TagSet;
1012
import org.jsoup.select.Elements;
1113
import org.junit.jupiter.api.BeforeAll;
1214
import org.junit.jupiter.api.Test;
1315

1416
import java.io.IOException;
17+
import java.lang.reflect.Field;
1518
import java.util.Map;
1619

1720
import static org.junit.jupiter.api.Assertions.assertEquals;
@@ -134,4 +137,50 @@ public void testCanChangeParsers() throws IOException {
134137
Document doc3 = session.newRequest().url(xmlUrl).get();
135138
assertEquals(xmlVal, doc3.html()); // did not blow away xml default
136139
}
140+
141+
@Test
142+
public void sessionTagSetDoesNotMutateRoot() {
143+
Connection session = Jsoup.newSession();
144+
TagSet rootTags = session.request().parser().tagSet();
145+
146+
int rootNamespacesBefore = tagSetNamespaceCount(rootTags);
147+
148+
Connection request = session.newRequest();
149+
Parser parser = request.request().parser();
150+
parser.parseInput("<custom>One <b>Two</b></custom>", "http://example.com/");
151+
152+
int rootNamespacesAfter = tagSetNamespaceCount(rootTags);
153+
assertEquals(rootNamespacesBefore, rootNamespacesAfter);
154+
}
155+
156+
@Test
157+
public void sessionTagSetCustomizerDoesNotMutateRoot() {
158+
Connection session = Jsoup.newSession();
159+
TagSet rootTags = session.request().parser().tagSet();
160+
rootTags.onNewTag(tag -> {
161+
if (!tag.isKnownTag())
162+
tag.set(Tag.RcData);
163+
});
164+
165+
int rootNamespacesBefore = tagSetNamespaceCount(rootTags);
166+
167+
Connection request = session.newRequest();
168+
Parser parser = request.request().parser();
169+
Document doc = parser.parseInput("<custom>One <b>Two</b></custom>", "https://example.com/");
170+
assertEquals(0, doc.select("custom b").size());
171+
172+
int rootNamespacesAfter = tagSetNamespaceCount(rootTags);
173+
assertEquals(rootNamespacesBefore, rootNamespacesAfter);
174+
}
175+
176+
private static int tagSetNamespaceCount(TagSet tagSet) {
177+
try {
178+
Field tagsField = TagSet.class.getDeclaredField("tags");
179+
tagsField.setAccessible(true);
180+
Map<?, ?> tags = (Map<?, ?>) tagsField.get(tagSet);
181+
return tags.size();
182+
} catch (ReflectiveOperationException e) {
183+
throw new RuntimeException(e);
184+
}
185+
}
137186
}

src/test/java/org/jsoup/parser/ParserTest.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,9 +94,9 @@ public void testCloneCopyTagSet() {
9494
// Ensure onNewTag customizers are retained
9595
Tag custom = clone.tagSet().valueOf("qux", Parser.NamespaceHtml);
9696
assertTrue(custom.isSelfClosing());
97-
// Check that cloned tagset uses the original tag as source when original is modified
97+
// Check that cloned tagset does not observe modifications made to the original
9898
assertNull(clone.tagSet().get("bar", Parser.NamespaceHtml));
9999
parser.tagSet().add(new Tag("bar"));
100-
assertNotNull(clone.tagSet().get("bar", Parser.NamespaceHtml));
100+
assertNull(clone.tagSet().get("bar", Parser.NamespaceHtml));
101101
}
102102
}

src/test/java/org/jsoup/parser/TagSetTest.java

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@
44
import org.jsoup.nodes.Element;
55
import org.junit.jupiter.api.Test;
66

7+
import java.lang.reflect.Field;
8+
import java.util.Map;
9+
import java.util.concurrent.atomic.AtomicInteger;
10+
711
import static org.jsoup.parser.Parser.NamespaceHtml;
812
import static org.junit.jupiter.api.Assertions.*;
913

@@ -182,4 +186,36 @@ public class TagSetTest {
182186
assertTrue(copy.valueOf("custom-tag", NamespaceHtml).is(Tag.Void));
183187
assertFalse(source.valueOf("custom-tag", NamespaceHtml).is(Tag.Void));
184188
}
189+
190+
@Test void copyPullThroughDoesNotMutateSource() {
191+
TagSet source = TagSet.Html();
192+
TagSet copy = new TagSet(source);
193+
194+
int sourceNamespacesBefore = tagSetNamespaceCount(source);
195+
assertNotNull(copy.get("div", NamespaceHtml));
196+
int sourceNamespacesAfter = tagSetNamespaceCount(source);
197+
assertEquals(sourceNamespacesBefore, sourceNamespacesAfter);
198+
}
199+
200+
@Test void copyPullWithCustomizerThroughDoesNotMutateSource() {
201+
TagSet source = TagSet.Html();
202+
TagSet copy = new TagSet(source);
203+
204+
AtomicInteger sourceAdds = new AtomicInteger();
205+
source.onNewTag(tag -> sourceAdds.incrementAndGet());
206+
207+
assertNotNull(copy.get("div", NamespaceHtml));
208+
assertEquals(0, sourceAdds.get());
209+
}
210+
211+
private static int tagSetNamespaceCount(TagSet tagSet) {
212+
try {
213+
Field tagsField = TagSet.class.getDeclaredField("tags");
214+
tagsField.setAccessible(true);
215+
Map<?, ?> tags = (Map<?, ?>) tagsField.get(tagSet);
216+
return tags.size();
217+
} catch (ReflectiveOperationException e) {
218+
throw new RuntimeException(e);
219+
}
220+
}
185221
}

0 commit comments

Comments
 (0)