Skip to content

Commit bd92bca

Browse files
committed
Control the remaining cap correctly for reuse
Fixes jhy#2448 And refactored mark / fill to simplify control flow
1 parent b5c774d commit bd92bca

File tree

6 files changed

+233
-49
lines changed

6 files changed

+233
-49
lines changed

CHANGES.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232
* Cloning a `Parser` now preserves any custom `TagSet` applied to the parser. [#2422](https://github.com/jhy/jsoup/issues/2422), [#2423](https://github.com/jhy/jsoup/pull/2423)
3333
* Custom tags marked as `Tag.Void` now parse and serialize like the built-in void elements: they no longer consume following content, and the XML serializer emits the expected self-closing form. [#2425](https://github.com/jhy/jsoup/issues/2425)
3434
* The `<br>` element is once again classified as an inline tag (`Tag.isBlock() == false`), matching common developer expectations and its role as phrasing content in HTML, while pretty-printing and text extraction continue to treat it as a line break in the rendered output. [#2387](https://github.com/jhy/jsoup/issues/2387), [#2439](https://github.com/jhy/jsoup/issues/2439)
35+
* Fixed an intermittent truncation when fetching and parsing remote documents via `Jsoup.connect(url).get()`. On responses without a charset header, the initial charset sniff could sometimes (depending on buffering / `available()` behavior) be mistaken for end-of-stream and a partial parse reused, dropping trailing content. [#2448](https://github.com/jhy/jsoup/issues/2448)
36+
3537

3638
### Internal Changes
3739
* Deprecated internal helper `org.jsoup.internal.Functions` (for removal in v1.23.1). This was previously used to support older Android API levels without full `java.util.function` coverage; jsoup now requires core library desugaring so this indirection is no longer necessary. [#2412](https://github.com/jhy/jsoup/pull/2412)

src/main/java/org/jsoup/helper/DataUtil.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,7 @@ static CharsetDoc detectCharset(ControllableInputStream input, @Nullable String
248248
if (charsetName == null) { // read ahead and determine from meta. safe first parse as UTF-8
249249
int origMax = input.max();
250250
input.max(firstReadBufferSize);
251+
input.resetFullyRead(); // clear any pre-read (e.g., BOM) state before capped sniff
251252
input.mark(firstReadBufferSize);
252253
input.allowClose(false); // ignores closes during parse, in case we need to rewind
253254
try (Reader reader = new SimpleStreamReader(input, UTF_8)) { // input is currently capped to firstReadBufferSize

src/main/java/org/jsoup/internal/ControllableInputStream.java

Lines changed: 37 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -20,19 +20,19 @@
2020
// reimplemented from ConstrainableInputStream for JDK21 - extending BufferedInputStream will pin threads during read
2121
public class ControllableInputStream extends FilterInputStream {
2222
private final SimpleBufferedInput buff; // super.in, but typed as SimpleBufferedInput
23-
private int maxSize;
24-
private long startTime;
25-
private long timeout = 0; // optional max time of request
26-
private int remaining;
27-
private int markPos;
28-
private boolean interrupted;
29-
private boolean allowClose = true; // for cases where we want to re-read the input, can ignore .close() from the parser
23+
private int maxSize; // logical cap exposed to callers (0 == unlimited)
24+
private long startTime; // start time for timeout checks, nanos
25+
private long timeout = 0; // optional max time of request
26+
private int remaining; // how many bytes may still be returned to caller under the current cap
27+
private int markPos; // logical readPos snapshot for InputStream.mark/reset (not a buffer cursor)
28+
private boolean interrupted; // true if Thread.interrupted() was detected, used to latch interrupted state
29+
private boolean allowClose = true; // for cases where we want to re-read the input, can ignore .close() from the parser
3030

3131
// if we are tracking progress, will have the expected content length, progress callback, connection
3232
private @Nullable Progress<?> progress;
3333
private @Nullable Object progressContext;
34-
private int contentLength = -1;
35-
private int readPos = 0; // amount read; can be reset()
34+
private int contentLength = -1; // expected content length for progress; -1 == unknown
35+
private int readPos = 0; // amount read; can be reset()
3636

3737
private ControllableInputStream(SimpleBufferedInput in, int maxSize) {
3838
super(in);
@@ -85,6 +85,7 @@ public int read(byte[] b, int off, int len) throws IOException {
8585

8686
if (capped && len > remaining)
8787
len = remaining; // don't read more than desired, even if available
88+
buff.capRemaining(capped ? remaining : Integer.MAX_VALUE);
8889

8990
while (true) { // loop trying to read until we get some data or hit the overall timeout, if we have one
9091
if (expired())
@@ -95,7 +96,9 @@ public int read(byte[] b, int off, int len) throws IOException {
9596
if (read == -1) { // completed
9697
contentLength = readPos;
9798
} else {
98-
remaining -= read;
99+
if (capped && read > 0) {
100+
remaining -= read; // track bytes returned to the caller
101+
}
99102
readPos += read;
100103
}
101104
emitProgress();
@@ -107,6 +110,11 @@ public int read(byte[] b, int off, int len) throws IOException {
107110
}
108111
}
109112

113+
@Override
114+
public boolean markSupported() {
115+
return true;
116+
}
117+
110118
/**
111119
* Reads this inputstream to a ByteBuffer. The supplied max may be less than the inputstream's max, to support
112120
* reading just the first bytes.
@@ -145,15 +153,24 @@ public static ByteBuffer readToByteBuffer(InputStream in, int max) throws IOExce
145153

146154
@SuppressWarnings("NonSynchronizedMethodOverridesSynchronizedMethod") // not synchronized in later JDKs
147155
@Override public void reset() throws IOException {
148-
super.reset();
149-
remaining = maxSize - markPos;
156+
if (markPos < 0) throw new IOException("Resetting to invalid mark");
157+
buff.rewindToMark();
158+
buff.clearMark();
159+
if (maxSize != 0) {
160+
remaining = maxSize - markPos;
161+
buff.capRemaining(remaining);
162+
} else {
163+
remaining = 0;
164+
buff.capRemaining(Integer.MAX_VALUE);
165+
}
150166
readPos = markPos; // readPos is used for progress emits
167+
markPos = -1;
151168
}
152169

153170
@SuppressWarnings("NonSynchronizedMethodOverridesSynchronizedMethod") // not synchronized in later JDKs
154171
@Override public void mark(int readlimit) {
155-
super.mark(readlimit);
156-
markPos = maxSize - remaining;
172+
markPos = readPos;
173+
buff.setMark();
157174
}
158175

159176
/**
@@ -165,6 +182,10 @@ public boolean baseReadFully() {
165182
return buff.baseReadFully();
166183
}
167184

185+
public void resetFullyRead() {
186+
buff.resetFullyRead();
187+
}
188+
168189
/**
169190
Get the max size of this stream (how far at most will be read from the underlying stream)
170191
* @return the max size
@@ -175,7 +196,9 @@ public int max() {
175196

176197
public void max(int newMax) {
177198
remaining += newMax - maxSize; // update remaining to reflect the difference in the new maxsize
199+
if (remaining < 0) remaining = 0;
178200
maxSize = newMax;
201+
buff.capRemaining(newMax == 0 ? Integer.MAX_VALUE : remaining);
179202
}
180203

181204
public void allowClose(boolean allowClose) {

src/main/java/org/jsoup/internal/SimpleBufferedInput.java

Lines changed: 49 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,12 @@
1717
class SimpleBufferedInput extends FilterInputStream {
1818
static final int BufferSize = DefaultBufferSize;
1919
static final SoftPool<byte[]> BufferPool = new SoftPool<>(() -> new byte[BufferSize]);
20+
private int capRemaining = Integer.MAX_VALUE; // how many bytes we are allowed to pull from the underlying stream
2021

2122
private byte @Nullable [] byteBuf; // the byte buffer; recycled via SoftPool. Created in fill if required
2223
private int bufPos;
2324
private int bufLength;
24-
private int bufMark = -1;
25+
private int bufMark = -1; // mark set by ControllableInputStream; -1 when unset
2526
private boolean inReadFully = false; // true when the underlying inputstream has been read fully
2627

2728
SimpleBufferedInput(@Nullable InputStream in) {
@@ -50,12 +51,6 @@ public int read(byte[] dest, int offset, int desiredLen) throws IOException {
5051

5152
int bufAvail = bufLength - bufPos;
5253
if (bufAvail <= 0) { // can't serve from the buffer
53-
if (!inReadFully && bufMark < 0) {
54-
// skip creating / copying into a local buffer; just pass through
55-
int read = in.read(dest, offset, desiredLen);
56-
closeIfDone(read);
57-
return read;
58-
}
5954
fill();
6055
bufAvail = bufLength - bufPos;
6156
}
@@ -76,28 +71,22 @@ private void fill() throws IOException {
7671
byteBuf = BufferPool.borrow();
7772
}
7873

79-
if (bufMark < 0) { // no mark, can lose buffer (assumes we've read to bufLen)
80-
bufPos = 0;
81-
} else if (bufPos >= BufferSize) { // no room left in buffer
82-
if (bufMark > 0) { // can throw away early part of the buffer
83-
int size = bufPos - bufMark;
84-
System.arraycopy(byteBuf, bufMark, byteBuf, 0, size);
85-
bufPos = size;
86-
bufMark = 0;
87-
} else { // invalidate mark
88-
bufMark = -1;
89-
bufPos = 0;
90-
}
91-
}
74+
compact();
9275
bufLength = bufPos;
93-
int read = in.read(byteBuf, bufPos, byteBuf.length - bufPos);
76+
int toRead = Math.min(byteBuf.length - bufPos, capRemaining);
77+
if (toRead <= 0) return;
78+
int read = in.read(byteBuf, bufPos, toRead);
9479
if (read > 0) {
9580
bufLength = read + bufPos;
96-
while (byteBuf.length - bufLength > 0) { // read in more if we have space, without blocking
81+
capRemaining -= read;
82+
while (byteBuf.length - bufLength > 0 && capRemaining > 0) { // read in more if we have space, without blocking
9783
if (in.available() < 1) break;
98-
read = in.read(byteBuf, bufLength, byteBuf.length - bufLength);
84+
toRead = Math.min(byteBuf.length - bufLength, capRemaining);
85+
if (toRead <= 0) break;
86+
read = in.read(byteBuf, bufLength, toRead);
9987
if (read <= 0) break;
10088
bufLength += read;
89+
capRemaining -= read;
10190
}
10291
}
10392
closeIfDone(read);
@@ -123,30 +112,55 @@ boolean baseReadFully() {
123112
return inReadFully;
124113
}
125114

126-
@Override
127-
public int available() throws IOException {
128-
if (byteBuf != null && bufLength - bufPos > 0)
129-
return bufLength - bufPos; // doesn't include those in.available(), but mostly used as a block test
130-
return inReadFully ? 0 : in.available();
115+
void resetFullyRead() {
116+
if (in != null) // for null-wrapped streams, leave as fully read to avoid fill() on a null input
117+
inReadFully = false;
131118
}
132119

133-
@SuppressWarnings("NonSynchronizedMethodOverridesSynchronizedMethod") // explicitly not synced
134120
@Override
135-
public void mark(int readlimit) {
136-
if (readlimit > BufferSize) {
137-
throw new IllegalArgumentException("Read-ahead limit is greater than buffer size");
121+
public int available() throws IOException {
122+
int buffered = (byteBuf != null) ? (bufLength - bufPos) : 0;
123+
if (buffered > 0) {
124+
return buffered; // doesn't include those in.available(), but mostly used as a block test
138125
}
126+
int avail = inReadFully ? 0 : in.available();
127+
return avail;
128+
}
129+
130+
void capRemaining(int newRemaining) {
131+
capRemaining = Math.max(0, newRemaining);
132+
}
133+
134+
void setMark() {
139135
bufMark = bufPos;
140136
}
141137

142-
@SuppressWarnings("NonSynchronizedMethodOverridesSynchronizedMethod") // explicitly not synced
143-
@Override
144-
public void reset() throws IOException {
138+
void rewindToMark() throws IOException {
145139
if (bufMark < 0)
146140
throw new IOException("Resetting to invalid mark");
147141
bufPos = bufMark;
148142
}
149143

144+
void clearMark() {
145+
bufMark = -1;
146+
}
147+
148+
private void compact() {
149+
if (byteBuf == null || bufPos == 0) return;
150+
int keepFrom = bufMark >= 0 ? bufMark : bufPos;
151+
if (keepFrom <= 0) return;
152+
153+
int remaining = bufLength - keepFrom;
154+
if (remaining > 0) {
155+
System.arraycopy(byteBuf, keepFrom, byteBuf, 0, remaining);
156+
}
157+
bufLength = remaining;
158+
bufPos -= keepFrom;
159+
if (bufMark >= 0) {
160+
bufMark -= keepFrom;
161+
}
162+
}
163+
150164
@Override
151165
public void close() throws IOException {
152166
if (in != null) super.close();

src/test/java/org/jsoup/helper/DataUtilTest.java

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -359,4 +359,63 @@ void handlesUnlimitedRead() throws IOException {
359359
Document doc = Jsoup.parse(ParseTest.getPath("/fuzztests/2353.html.gz"));
360360
assertTrue(doc.html().contains("Read-Fully!"));
361361
}
362+
363+
@Test
364+
void charsetSniffingCanReuseTruncatedPreParse() throws IOException {
365+
// #2448: when available() reports buffered bytes after the first read, the sniffed pre-parse may be reused while capped, leading to truncation
366+
367+
StringBuilder sb = new StringBuilder();
368+
sb.append("<!doctype html><html><head><title>t</title></head><body><pre>");
369+
while (sb.length() < 6200) {
370+
sb.append("0123456789 abcdefghijklmnopqrstuvwxyz\n");
371+
}
372+
sb.append("</pre><main>list</main><hr></body></html>");
373+
String html = sb.toString();
374+
375+
376+
byte[] bytes = html.getBytes(StandardCharsets.UTF_8);
377+
ControllableInputStream in = ControllableInputStream.wrap(new BufferedOnceAvailableStream(bytes), 0);
378+
379+
DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharset(in, null, "http://example.com/", Parser.htmlParser());
380+
Document doc = DataUtil.parseInputStream(charsetDoc, "http://example.com/", Parser.htmlParser());
381+
382+
assertNotNull(doc.selectFirst("hr"), "hr should survive the sniff + full parse");
383+
}
384+
385+
// delivers all bytes in the first read, then signals available()>0 once to trigger a second read and baseReadFully=true
386+
static final class BufferedOnceAvailableStream extends InputStream {
387+
private final byte[] data;
388+
private int pos = 0;
389+
private boolean extraSignal = true;
390+
391+
BufferedOnceAvailableStream(byte[] data) {
392+
this.data = data;
393+
}
394+
395+
@Override
396+
public int read(byte[] b, int off, int len) {
397+
if (pos >= data.length) return -1;
398+
int take = Math.min(len, data.length - pos);
399+
System.arraycopy(data, pos, b, off, take);
400+
pos += take;
401+
return take;
402+
}
403+
404+
@Override
405+
public int read() {
406+
return pos < data.length ? (data[pos++] & 0xff) : -1;
407+
}
408+
409+
@Override
410+
public int available() {
411+
if (pos < data.length)
412+
return data.length - pos;
413+
if (extraSignal) {
414+
extraSignal = false;
415+
return 1; // nudge SimpleBufferedInput.fill() to try another read
416+
}
417+
return 0;
418+
}
419+
}
420+
362421
}

0 commit comments

Comments
 (0)