Skip to content

Commit 823709f

Browse files
committed
Don't reuse a fully read sniffed doc for StreamParser
Fixes #2483
1 parent e1b0df5 commit 823709f

File tree

5 files changed

+47
-9
lines changed

5 files changed

+47
-9
lines changed

CHANGES.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
* `Cleaner` no longer makes relative URL attributes in the input document absolute when cleaning or validating a `Document`. URL normalization now applies only to the cleaned output, and `Safelist.isSafeAttribute()` is side effect free. [#2475](https://github.com/jhy/jsoup/issues/2475)
1414
* `Cleaner` no longer duplicates enforced attributes when the input `Document` preserves attribute case. A case-variant source attribute is now replaced by the enforced attribute in the cleaned output. [#2476](https://github.com/jhy/jsoup/issues/2476)
1515
* If a per-request SOCKS proxy is configured, jsoup now avoids using the JDK `HttpClient`, because the JDK would silently ignore that proxy and attempt to connect directly. Those requests now fall back to the legacy `HttpURLConnection` transport instead, which does support SOCKS. [#2468](https://github.com/jhy/jsoup/issues/2468)
16+
* `Connection.Response.streamParser()` and `DataUtil.streamParser(Path, ...)` no longer fail on small inputs that have no declared charset. Previously, if the initial 5 KB charset sniff fully consumed the input, the input was closed before the stream parse began. [#2483](https://github.com/jhy/jsoup/issues/2483)
1617

1718
## 1.22.1 (2026-Jan-01)
1819

src/main/java/org/jsoup/helper/DataUtil.java

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ public static StreamParser streamParser(Path path, @Nullable Charset charset, St
140140
StreamParser streamer = new StreamParser(parser);
141141
String charsetName = charset != null? charset.name() : null;
142142
try {
143-
DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharset(openStream(path), charsetName, baseUri, parser);
143+
DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharsetForStreamParser(openStream(path), charsetName, baseUri, parser);
144144
Reader reader = new SimpleStreamReader(charsetDoc.input, charsetDoc.charset);
145145
streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it
146146
} catch (IOException e) {
@@ -237,7 +237,18 @@ static Document parseInputStream(@Nullable ControllableInputStream input, @Nulla
237237

238238
private static final Evaluator metaCharset = Selector.evaluatorOf("meta[http-equiv=content-type], meta[charset]");
239239

240+
/** Detects charset for a regular parse, and may reuse a fully sniffed document. */
240241
static CharsetDoc detectCharset(ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
242+
return detectCharset(input, charsetName, baseUri, parser, true);
243+
}
244+
245+
/** Detects charset for a stream parse, and leaves the input readable for subsequent parsing. */
246+
static CharsetDoc detectCharsetForStreamParser(ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
247+
return detectCharset(input, charsetName, baseUri, parser, false);
248+
}
249+
250+
/** Shared charset detection worker; regular parse can reuse a fully sniffed doc, stream parse cannot. */
251+
private static CharsetDoc detectCharset(ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser, boolean reuseDocIfFullyRead) throws IOException {
241252
Document doc = null;
242253
// read the start of the stream and look for a BOM or meta charset:
243254
// look for BOM - overrides any other header or input
@@ -293,7 +304,7 @@ else if (first instanceof Comment) {
293304
foundCharset = foundCharset.trim().replaceAll("[\"']", "");
294305
charsetName = foundCharset;
295306
doc = null;
296-
} else if (input.baseReadFully()) { // if we have read fully, and the charset was correct, keep that current parse
307+
} else if (reuseDocIfFullyRead && input.baseReadFully()) { // keep the current parse if the caller can use a fully read doc
297308
input.close(); // the parser tried to close it
298309
} else {
299310
doc = null;

src/main/java/org/jsoup/helper/HttpConnection.java

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1012,12 +1012,11 @@ private ControllableInputStream prepareParse() {
10121012
@Override public StreamParser streamParser() throws IOException {
10131013
ControllableInputStream stream = prepareParse();
10141014
String baseUri = url.toExternalForm();
1015-
DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharset(stream, charset, baseUri, req.parser());
1016-
// note that there may be a document in CharsetDoc as a result of scanning meta-data -- but as requires a stream parse, it is not used here. todo - revisit.
1015+
DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharsetForStreamParser(stream, charset, baseUri, req.parser());
10171016

10181017
// set up the stream parser and rig this connection up to the parsed doc:
10191018
StreamParser streamer = new StreamParser(req.parser());
1020-
BufferedReader reader = new BufferedReader(new InputStreamReader(stream, charsetDoc.charset));
1019+
BufferedReader reader = new BufferedReader(new InputStreamReader(charsetDoc.input, charsetDoc.charset));
10211020
streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it
10221021
streamer.document().connection(new HttpConnection(req, this));
10231022
charset = charsetDoc.charset.name();

src/test/java/org/jsoup/helper/DataUtilTest.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,14 @@ public void streamerSupportsBOMinFiles() throws IOException {
199199
assertTrue(doc.text().contains("가각갂갃간갅"));
200200
}
201201

202+
@Test
203+
public void streamParserWorksWhenCharsetDetectionFullyReadsFile() throws IOException {
204+
// small enough to be fully consumed during charset sniffing
205+
Path in = getPath("/htmltests/charset-base.html");
206+
Document doc = DataUtil.streamParser(in, null, "http://example.com", Parser.htmlParser()).complete();
207+
assertEquals("http://example.com/foo.jpg", doc.select("img").first().absUrl("src"));
208+
}
209+
202210
@Test
203211
public void supportsUTF8BOM() throws IOException {
204212
File in = getFile("/bomtests/bom_utf8.html");

src/test/java/org/jsoup/integration/ConnectTest.java

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -405,21 +405,40 @@ public void multipleParsesOkAfterReadFully() throws IOException {
405405
Connection.Response res = Jsoup.connect(echoUrl).execute().readFully();
406406

407407
Document doc = res.parse();
408-
assertTrue(doc.title().contains("Environment"));
408+
assertEquals("Webserver Environment Variables", doc.title());
409409

410410
Document doc2 = res.parse();
411-
assertTrue(doc2.title().contains("Environment"));
411+
assertEquals("Webserver Environment Variables", doc2.title());
412412
}
413413

414414
@Test
415415
public void multipleParsesOkAfterBufferUp() throws IOException {
416416
Connection.Response res = Jsoup.connect(echoUrl).execute().bufferUp();
417417

418418
Document doc = res.parse();
419-
assertTrue(doc.title().contains("Environment"));
419+
assertEquals("Webserver Environment Variables", doc.title());
420420

421421
Document doc2 = res.parse();
422-
assertTrue(doc2.title().contains("Environment"));
422+
assertEquals("Webserver Environment Variables", doc2.title());
423+
}
424+
425+
@Test
426+
public void bufferedParseWorksWhenCharsetDetectionFullyReadsResponse() throws IOException {
427+
Connection.Response res = Jsoup.connect(FileServlet.urlTo("/htmltests/charset-base.html")).execute().bufferUp();
428+
429+
Document doc = res.parse();
430+
assertEquals("UTF-8", res.charset());
431+
assertEquals("http://example.com/foo.jpg", doc.expectFirst("img").absUrl("src"));
432+
}
433+
434+
@Test
435+
public void bufferedStreamParserWorksWhenCharsetDetectionFullyReadsResponse() throws IOException {
436+
// https://github.com/jhy/jsoup/issues/2483
437+
Connection.Response res = Jsoup.connect(FileServlet.urlTo("/htmltests/charset-base.html")).execute().bufferUp();
438+
439+
Document doc = res.streamParser().complete();
440+
assertEquals("UTF-8", res.charset());
441+
assertEquals("http://example.com/foo.jpg", doc.expectFirst("img").absUrl("src"));
423442
}
424443

425444
@Test

0 commit comments

Comments
 (0)