Skip to content

Commit 19f64bd

Browse files
committed
Added TagSet#onNewTag(Consumer<Tag> customizer)
Fixes #2330
1 parent c4cdf13 commit 19f64bd

File tree

3 files changed

+118
-2
lines changed

3 files changed

+118
-2
lines changed

CHANGES.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
* Removed previously deprecated methods. [#2317](https://github.com/jhy/jsoup/pull/2317)
88

99
### Improvements
10+
* Added `TagSet#onNewTag(Consumer<Tag> customizer)`: register a callback that’s invoked for each new or cloned Tag when it’s inserted into the set. Enables dynamic tweaks of tag options (for example, marking all custom tags as self-closing, or everything in a given namespace as preserving whitespace).
1011
* Made `TokenQueue` and `CharacterReader` autocloseable, to ensure that they will release their buffers back to the buffer pool, for later reuse.
1112
* Added `Selector#evaluatorOf(String css)`, as a clearer way to obtain an Evaluator from a CSS query. An alias of `QueryParser.parse(String css)`.
1213
* Custom tags (defined via the `TagSet`) in a foreign namespace (e.g. SVG) can be configured to parse as data tags.

src/main/java/org/jsoup/parser/TagSet.java

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import org.jsoup.internal.SharedConstants;
55
import org.jspecify.annotations.Nullable;
66

7+
import java.util.ArrayList;
78
import java.util.HashMap;
89
import java.util.Map;
910
import java.util.Objects;
@@ -23,8 +24,9 @@
2324
public class TagSet {
2425
static final TagSet HtmlTagSet = initHtmlDefault();
2526

26-
final Map<String, Map<String, Tag>> tags = new HashMap<>(); // namespace -> tag name -> Tag
27-
final @Nullable TagSet source; // source to pull tags from on demand
27+
private final Map<String, Map<String, Tag>> tags = new HashMap<>(); // namespace -> tag name -> Tag
28+
private final @Nullable TagSet source; // source to pull tags from on demand
29+
private @Nullable ArrayList<Consumer<Tag>> customizers; // optional onNewTag tag customizer
2830

2931
/**
3032
Returns a mutable copy of the default HTML tag set.
@@ -57,6 +59,12 @@ public TagSet add(Tag tag) {
5759

5860
/** Adds the tag, but does not set defined. Used in .valueOf */
5961
private void doAdd(Tag tag) {
62+
if (customizers != null) {
63+
for (Consumer<Tag> customizer : customizers) {
64+
customizer.accept(tag);
65+
}
66+
}
67+
6068
tags.computeIfAbsent(tag.namespace, ns -> new HashMap<>())
6169
.put(tag.tagName, tag);
6270
}
@@ -149,6 +157,34 @@ public Tag valueOf(String tagName, String namespace) {
149157
return valueOf(tagName, namespace, ParseSettings.preserveCase);
150158
}
151159

160+
/**
161+
Register a callback to customize each {@link Tag} as it's added to this TagSet.
162+
<p>Each customizer is invoked once per Tag, when that Tag is added to this TagSet (explicitly or via the valueOf methods).</p>
163+
164+
<p>For example, to allow all unknown tags to be self-closing when parsing as HTML:</p>
165+
<pre><code>
166+
Parser parser = Parser.htmlParser();
167+
parser.tagSet().onNewTag(tag -> {
168+
if (!tag.isKnownTag())
169+
tag.set(Tag.SelfClose);
170+
});
171+
172+
Document doc = Jsoup.parse(html, parser);
173+
</code></pre>
174+
175+
@param customizer a {@code Consumer<Tag>} that will be called for each newly added or cloned Tag; callers can
176+
inspect and modify the Tag's state (e.g. set options)
177+
@return this TagSet, to allow method chaining
178+
@since 1.21.0
179+
*/
180+
public TagSet onNewTag(Consumer<Tag> customizer) {
181+
Validate.notNull(customizer);
182+
if (customizers == null)
183+
customizers = new ArrayList<>();
184+
customizers.add(customizer);
185+
return this;
186+
}
187+
152188
@Override
153189
public boolean equals(Object o) {
154190
if (!(o instanceof TagSet)) return false;

src/test/java/org/jsoup/parser/TagSetTest.java

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
import org.jsoup.nodes.Element;
55
import org.junit.jupiter.api.Test;
66

7+
import java.util.concurrent.atomic.AtomicInteger;
8+
79
import static org.jsoup.parser.Parser.NamespaceHtml;
810
import static org.junit.jupiter.api.Assertions.*;
911

@@ -105,4 +107,81 @@ public class TagSetTest {
105107
c1.clear(Tag.Known);
106108
assertFalse(c1.isKnownTag());
107109
}
110+
111+
@Test void canCustomizeAll() {
112+
TagSet tags = TagSet.Html();
113+
tags.onNewTag(tag -> tag.set(Tag.SelfClose));
114+
assertTrue(tags.get("script", NamespaceHtml).is(Tag.SelfClose));
115+
assertTrue(tags.valueOf("SCRIPT", NamespaceHtml).is(Tag.SelfClose));
116+
assertTrue(tags.valueOf("custom", NamespaceHtml).is(Tag.SelfClose));
117+
118+
Tag foo = new Tag("foo", NamespaceHtml);
119+
assertFalse(foo.is(Tag.SelfClose));
120+
tags.add(foo);
121+
assertTrue(foo.is(Tag.SelfClose));
122+
}
123+
124+
@Test void canCustomizeSome() {
125+
TagSet tags = TagSet.Html();
126+
tags.onNewTag(tag -> {
127+
if (!tag.isKnownTag()) {
128+
tag.set(Tag.SelfClose);
129+
}
130+
});
131+
assertFalse(tags.valueOf("script", NamespaceHtml).is(Tag.SelfClose));
132+
assertFalse(tags.valueOf("SCRIPT", NamespaceHtml).is(Tag.SelfClose));
133+
assertTrue(tags.valueOf("custom-tag", NamespaceHtml).is(Tag.SelfClose));
134+
}
135+
136+
@Test void canParseWithCustomization() {
137+
// really would use tag.valueOf("script"); just a test example here
138+
Parser parser = Parser.htmlParser();
139+
parser.tagSet().onNewTag(tag -> {
140+
if (tag.normalName().equals("script"))
141+
tag.set(Tag.SelfClose);
142+
});
143+
144+
Document doc = Jsoup.parse("<script />Text", parser);
145+
assertEquals("<html>\n <head>\n <script></script>\n </head>\n <body>Text</body>\n</html>", doc.html());
146+
// self closing bit still produces valid HTML
147+
}
148+
149+
@Test void canParseWithGeneralCustomization() {
150+
Parser parser = Parser.htmlParser();
151+
parser.tagSet().onNewTag(tag -> {
152+
if (!tag.isKnownTag())
153+
tag.set(Tag.SelfClose);
154+
});
155+
156+
Document doc = Jsoup.parse("<custom-data />Bar <script />Text", parser);
157+
assertEquals("<custom-data></custom-data>Bar\n<script>Text</script>", doc.body().html());
158+
}
159+
160+
@Test void supportsMultipleCustomizers() {
161+
TagSet tags = TagSet.Html();
162+
tags.onNewTag(tag -> {
163+
if (tag.normalName().equals("script"))
164+
tag.set(Tag.SelfClose);
165+
});
166+
tags.onNewTag(tag -> {
167+
if (!tag.isKnownTag())
168+
tag.set(Tag.RcData);
169+
});
170+
171+
assertTrue(tags.valueOf("script", NamespaceHtml).is(Tag.SelfClose));
172+
assertFalse(tags.valueOf("script", NamespaceHtml).is(Tag.RcData));
173+
assertTrue(tags.valueOf("custom-tag", NamespaceHtml).is(Tag.RcData));
174+
}
175+
176+
@Test void customizersArePreservedInSource() {
177+
TagSet source = TagSet.Html();
178+
source.onNewTag(tag -> tag.set(Tag.RcData));
179+
TagSet copy = new TagSet(source);
180+
assertTrue(copy.valueOf("script", NamespaceHtml).is(Tag.RcData));
181+
assertTrue(source.valueOf("script", NamespaceHtml).is(Tag.RcData));
182+
183+
copy.onNewTag(tag -> tag.set(Tag.Void));
184+
assertTrue(copy.valueOf("custom-tag", NamespaceHtml).is(Tag.Void));
185+
assertFalse(source.valueOf("custom-tag", NamespaceHtml).is(Tag.Void));
186+
}
108187
}

0 commit comments

Comments
 (0)