Skip to content

Commit be7ca53

Browse files
committed
Document Cleaner and Safelist thread safety
And add a guard test Fixes #2473
1 parent 99bbdee commit be7ca53

File tree

3 files changed

+66
-6
lines changed

3 files changed

+66
-6
lines changed

src/main/java/org/jsoup/safety/Cleaner.java

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,19 +17,24 @@
1717
import static org.jsoup.internal.SharedConstants.DummyUri;
1818

1919
/**
20-
The safelist based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes
20+
The {@link Safelist}-based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes
2121
that you are expecting; no junk, and no cross-site scripting attacks!
2222
<p>
23-
The HTML cleaner parses the input as HTML and then runs it through a safe-list, so the output HTML can only contain
23+
The HTML cleaner parses the input as HTML and then runs it through a safelist, so the output HTML can only contain
2424
HTML that is allowed by the safelist.
2525
</p>
2626
<p>
2727
It is assumed that the input HTML is a body fragment; the clean methods only pull from the source's body, and the
28-
canned safe-lists only allow body contained tags.
28+
canned safelists only allow body-contained tags.
2929
</p>
3030
<p>
3131
Rather than interacting directly with a Cleaner object, generally see the {@code clean} methods in {@link org.jsoup.Jsoup}.
3232
</p>
33+
<p>
34+
A Cleaner may be reused across multiple documents and shared across concurrent threads once its {@link Safelist} has
35+
been configured. The cleaner uses the supplied safelist directly, so later safelist changes affect later cleaning
36+
calls. If you need a variant of an existing configuration, use {@link Safelist#Safelist(Safelist)} to make a copy.
37+
</p>
3338
*/
3439
public class Cleaner {
3540
private final Safelist safelist;

src/main/java/org/jsoup/safety/Safelist.java

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ Thank you to Ryan Grove (wonko.com) for the Ruby HTML cleaner http://github.com/
2222

2323

2424
/**
25-
Safe-lists define what HTML (elements and attributes) to allow through the cleaner. Everything else is removed.
25+
Safelists define what HTML (elements and attributes) to allow through a {@link Cleaner}. Everything else is removed.
2626
<p>
2727
Start with one of the defaults:
2828
</p>
@@ -53,15 +53,20 @@ If you need to allow more through (please be careful!), tweak a base safelist wi
5353
</ul>
5454
5555
<p>
56-
The cleaner and these safelists assume that you want to clean a <code>body</code> fragment of HTML (to add user
56+
The {@link Cleaner} and these safelists assume that you want to clean a <code>body</code> fragment of HTML (to add user
5757
supplied HTML into a templated page), and not to clean a full HTML document. If the latter is the case, you could wrap
5858
the templated document HTML around the cleaned body HTML.
5959
</p>
6060
<p>
61+
Safelists are mutable. A {@link Cleaner} uses the supplied safelist directly, so later changes affect later cleaning
62+
calls. If you want to share a safelist across threads, finish configuring it first and do not mutate it while it is in
63+
use. To build a variant from an existing configuration, use {@link #Safelist(Safelist)} to make a copy.
64+
</p>
65+
<p>
6166
If you are going to extend a safelist, please be very careful. Make sure you understand what attributes may lead to
6267
XSS attack vectors. URL attributes are particularly vulnerable and require careful validation. See
6368
the <a href="https://owasp.org/www-community/xss-filter-evasion-cheatsheet">XSS Filter Evasion Cheat Sheet</a> for some
64-
XSS attack examples (that jsoup will safegaurd against the default Cleaner and Safelist configuration).
69+
XSS attack examples (that jsoup will safeguard against with the default Cleaner and Safelist configuration).
6570
</p>
6671
*/
6772
public class Safelist {

src/test/java/org/jsoup/safety/CleanerTest.java

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@
1616

1717
import java.util.Arrays;
1818
import java.util.Locale;
19+
import java.util.concurrent.CountDownLatch;
20+
import java.util.concurrent.atomic.AtomicInteger;
21+
import java.util.concurrent.atomic.AtomicReference;
1922

2023
import static org.junit.jupiter.api.Assertions.*;
2124

@@ -226,6 +229,53 @@ public void safeListedProtocolShouldBeRetained(Locale locale) {
226229
assertFalse(new Cleaner(Safelist.none()).isValid(okDoc));
227230
}
228231

232+
@Test void configuredCleanerMayBeSharedAcrossThreads() throws InterruptedException {
233+
// https://github.com/jhy/jsoup/issues/2473
234+
String html = "<a href='/foo'>Link</a><img src='/bar' alt='Q'>";
235+
String baseUri = "https://example.com/";
236+
String expected = "<a href=\"https://example.com/foo\">Link</a><img src=\"https://example.com/bar\" alt=\"Q\">";
237+
Cleaner cleaner = new Cleaner(Safelist.basicWithImages());
238+
239+
int numThreads = 10;
240+
int numLoops = 20;
241+
String[] cleaned = new String[numThreads * numLoops];
242+
AtomicInteger next = new AtomicInteger();
243+
AtomicReference<Throwable> failure = new AtomicReference<>();
244+
CountDownLatch start = new CountDownLatch(1);
245+
CountDownLatch done = new CountDownLatch(numThreads);
246+
Thread[] threads = new Thread[numThreads];
247+
248+
for (int i = 0; i < numThreads; i++) {
249+
Thread thread = new Thread(() -> {
250+
try {
251+
start.await();
252+
for (int j = 0; j < numLoops; j++) {
253+
Document dirty = Jsoup.parseBodyFragment(html, baseUri);
254+
cleaned[next.getAndIncrement()] = cleaner.clean(dirty).body().html();
255+
}
256+
} catch (Throwable t) {
257+
failure.compareAndSet(null, t);
258+
if (t instanceof InterruptedException) Thread.currentThread().interrupt();
259+
} finally {
260+
done.countDown();
261+
}
262+
});
263+
threads[i] = thread;
264+
thread.start();
265+
}
266+
267+
start.countDown();
268+
done.await();
269+
270+
if (failure.get() != null)
271+
throw new AssertionError("Concurrent cleaner use failed", failure.get());
272+
273+
assertEquals(cleaned.length, next.get());
274+
for (String clean : cleaned) {
275+
assertEquals(expected, clean);
276+
}
277+
}
278+
229279
@Test public void resolvesRelativeLinks() {
230280
String html = "<a href='/foo'>Link</a><img src='/bar'>";
231281
String clean = Jsoup.clean(html, "http://example.com/", Safelist.basicWithImages());

0 commit comments

Comments
 (0)