Skip to content

Commit d7e886e

Browse files
committed
fix: apply an incremental threshold to all toByteArray overloads
* Extends incremental (chunked) reading to all `toByteArray` variants when the requested size is unknown or exceeds 128 KiB.
* The 128 KiB threshold matches the default buffer size used in CPython.
* Updates Javadoc to emphasize that memory usage grows **proportionally** with the number of bytes actually **read**, making these methods suitable for large streams when sufficient memory is available.
1 parent cbfa307 commit d7e886e

File tree

1 file changed

+77
-37
lines changed

1 file changed

+77
-37
lines changed

src/main/java/org/apache/commons/io/IOUtils.java

Lines changed: 77 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,6 @@
7373
import org.apache.commons.io.output.NullOutputStream;
7474
import org.apache.commons.io.output.NullWriter;
7575
import org.apache.commons.io.output.StringBuilderWriter;
76-
import org.apache.commons.io.output.ThresholdingOutputStream;
7776
import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
7877

7978
/**
@@ -222,6 +221,21 @@ public class IOUtils {
222221
*/
223222
private static final char[] SCRATCH_CHAR_BUFFER_WO = charArray();
224223

224+
/**
225+
* The maximum size of an array in many Java VMs.
226+
*/
227+
private static final int MAX_ARRAY_LENGTH = Integer.MAX_VALUE - 8;
228+
229+
/**
230+
* Default maximum chunk size used when copying large streams into a byte array.
231+
* <p>
232+
* This value is somewhat arbitrary, currently aligned with the value used by
233+
* <a href="https://github.com/python/cpython/blob/3.14/Lib/_pyio.py">Python</a>
234+
* for copying streams.
235+
* </p>
236+
*/
237+
private static final int DEFAULT_CHUNK_SIZE = 128 * 1024;
238+
225239
/**
226240
* Returns the given InputStream if it is already a {@link BufferedInputStream}, otherwise creates a
227241
* BufferedInputStream from the given InputStream.
@@ -2640,26 +2654,34 @@ public static BufferedReader toBufferedReader(final Reader reader, final int siz
26402654
/**
26412655
* Reads all the bytes from an input stream into a byte array.
26422656
*
2643-
* @param inputStream the {@link InputStream} to read; must not be {@code null}.
2644-
* @return a new byte array.
2645-
* @throws IllegalArgumentException if the size of the stream is greater than {@code Integer.MAX_VALUE}.
2646-
* @throws IOException if an I/O error occurs while reading.
2647-
* @throws NullPointerException if {@code inputStream} is {@code null}.
2657+
* <p>The memory used by this method is <strong>proportional</strong> to the number
2658+
* of bytes read, which is only limited by the maximum array size ({@code Integer.MAX_VALUE - 8}). This makes it unsuitable for
2659+
* processing large input streams, unless sufficient heap space is available.</p>
2660+
*
2661+
* @param inputStream The {@link InputStream} to read; must not be {@code null}.
2662+
* @return A new byte array.
2663+
* @throws IllegalArgumentException If the size of the stream is greater than the maximum array size.
2664+
* @throws IOException If an I/O error occurs while reading.
2665+
* @throws NullPointerException If {@code inputStream} is {@code null}.
26482666
*/
26492667
public static byte[] toByteArray(final InputStream inputStream) throws IOException {
2650-
// We use a ThresholdingOutputStream to avoid reading AND writing more than Integer.MAX_VALUE.
2651-
try (UnsynchronizedByteArrayOutputStream ubaOutput = UnsynchronizedByteArrayOutputStream.builder().get();
2652-
ThresholdingOutputStream thresholdOutput = new ThresholdingOutputStream(Integer.MAX_VALUE, os -> {
2653-
throw new IllegalArgumentException(String.format("Cannot read more than %,d into a byte array", Integer.MAX_VALUE));
2654-
}, os -> ubaOutput)) {
2655-
copy(inputStream, thresholdOutput);
2656-
return ubaOutput.toByteArray();
2668+
final UnsynchronizedByteArrayOutputStream output =
2669+
copyToOutputStream(inputStream, MAX_ARRAY_LENGTH + 1, DEFAULT_CHUNK_SIZE);
2670+
if (output.size() > MAX_ARRAY_LENGTH) {
2671+
throw new IllegalArgumentException(
2672+
String.format("Cannot read more than %,d into a byte array", MAX_ARRAY_LENGTH));
26572673
}
2674+
return output.toByteArray();
26582675
}
26592676

26602677
/**
26612678
* Reads exactly {@code size} bytes from the given {@link InputStream} into a new {@code byte[]}.
26622679
*
2680+
* <p>The memory used by this method is <strong>proportional</strong> to the number
2681+
* of bytes read and limited by the specified {@code size}. This makes it suitable for
2682+
* processing large input streams, provided that <strong>sufficient</strong> heap space is
2683+
* available.</p>
2684+
*
26632685
* @param input the {@link InputStream} to read; must not be {@code null}.
26642686
* @param size the exact number of bytes to read; must be {@code >= 0}.
26652687
* @return a new byte array of length {@code size}.
@@ -2670,12 +2692,17 @@ public static byte[] toByteArray(final InputStream inputStream) throws IOExcepti
26702692
* @since 2.1
26712693
*/
26722694
public static byte[] toByteArray(final InputStream input, final int size) throws IOException {
2673-
return toByteArray(Objects.requireNonNull(input, "input")::read, size);
2695+
return toByteArray(input, size, DEFAULT_CHUNK_SIZE);
26742696
}
26752697

26762698
/**
26772699
* Reads exactly {@code size} bytes from the given {@link InputStream} into a new {@code byte[]}.
26782700
*
2701+
* <p>The memory used by this method is <strong>proportional</strong> to the number
2702+
* of bytes read and limited by the specified {@code size}. This makes it suitable for
2703+
* processing large input streams, provided that <strong>sufficient</strong> heap space is
2704+
* available.</p>
2705+
*
26792706
* @param input the {@link InputStream} to read; must not be {@code null}.
26802707
* @param size the exact number of bytes to read; must be {@code >= 0} and {@code <= Integer.MAX_VALUE}.
26812708
* @return a new byte array of length {@code size}.
@@ -2696,46 +2723,63 @@ public static byte[] toByteArray(final InputStream input, final long size) throw
26962723
/**
26972724
* Reads exactly {@code size} bytes from the given {@link InputStream} into a new {@code byte[]}.
26982725
*
2699-
* <p>When reading from an untrusted stream, this variant lowers the risk of
2700-
* {@link OutOfMemoryError} by allocating data in buffers of up to {@code bufferSize}
2701-
* bytes rather than in one large array.</p>
2726+
* <p>The memory used by this method is <strong>proportional</strong> to the number
2727+
* of bytes read and limited by the specified {@code size}. This makes it suitable for
2728+
* processing large input streams, provided that <strong>sufficient</strong> heap space is
2729+
* available.</p>
27022730
*
2703-
* <p>Note, however, that this approach requires additional temporary memory
2704-
* compared to {@link #toByteArray(InputStream, int)}.</p>
2731+
* <p>This method processes the input stream in successive chunks of up to
2732+
* {@code chunkSize} bytes.</p>
27052733
*
27062734
* @param input the {@link InputStream} to read; must not be {@code null}.
27072735
* @param size the exact number of bytes to read; must be {@code >= 0}.
27082736
* The actual bytes read are validated to equal {@code size}.
2709-
* @param bufferSize the buffer size for incremental reading; must be {@code > 0}.
2737+
* @param chunkSize The chunk size for incremental reading; must be {@code > 0}.
27102738
* @return a new byte array of length {@code size}.
2711-
* @throws IllegalArgumentException if {@code size} is negative or {@code bufferSize <= 0}.
2739+
* @throws IllegalArgumentException if {@code size} is negative or {@code chunkSize <= 0}.
27122740
* @throws EOFException if the stream ends before {@code size} bytes are read.
27132741
* @throws IOException if an I/O error occurs while reading.
27142742
* @throws NullPointerException if {@code input} is {@code null}.
27152743
* @since 2.21.0
27162744
*/
2717-
public static byte[] toByteArray(final InputStream input, final int size, final int bufferSize) throws IOException {
2745+
public static byte[] toByteArray(final InputStream input, final int size, final int chunkSize) throws IOException {
27182746
Objects.requireNonNull(input, "input");
2719-
if (bufferSize <= 0) {
2720-
throw new IllegalArgumentException("Buffer size must be greater than zero: " + bufferSize);
2747+
if (chunkSize <= 0) {
2748+
throw new IllegalArgumentException("Chunk size must be greater than zero: " + chunkSize);
27212749
}
2722-
if (size <= bufferSize) {
2750+
if (size <= chunkSize) {
27232751
// throws if size < 0
27242752
return toByteArray(input::read, size);
27252753
}
2754+
final UnsynchronizedByteArrayOutputStream output = copyToOutputStream(input, size, chunkSize);
2755+
if (output.size() != size) {
2756+
throw new EOFException("Unexpected read size, current: " + output.size() + ", expected: " + size);
2757+
}
2758+
return output.toByteArray();
2759+
}
2760+
2761+
/**
2762+
* Copies up to {@code limit} bytes from the given {@link InputStream} into a new {@link UnsynchronizedByteArrayOutputStream}.
2763+
*
2764+
*
2765+
* @param input The {@link InputStream} to read; must not be {@code null}.
2766+
* @param limit The maximum number of bytes to read; must be {@code >= 0}.
2767+
* The stream may end earlier; the number of bytes actually read is not validated here and must be checked by the caller.
2768+
* @param bufferSize The buffer size of the output stream; must be {@code > 0}.
2769+
* @return An {@link UnsynchronizedByteArrayOutputStream} containing the bytes read.
2770+
*/
2771+
private static UnsynchronizedByteArrayOutputStream copyToOutputStream(
2772+
final InputStream input, final long limit, final int bufferSize) throws IOException {
27262773
try (UnsynchronizedByteArrayOutputStream output = UnsynchronizedByteArrayOutputStream.builder()
27272774
.setBufferSize(bufferSize)
27282775
.get();
27292776
InputStream boundedInput = BoundedInputStream.builder()
2730-
.setMaxCount(size)
2777+
.setMaxCount(limit)
27312778
.setPropagateClose(false)
27322779
.setInputStream(input)
27332780
.get()) {
27342781
output.write(boundedInput);
2735-
if (output.size() != size) {
2736-
throw new EOFException("Unexpected read size, current: " + output.size() + ", expected: " + size);
2737-
}
2738-
return output.toByteArray();
2782+
return output;
27392783
}
27402784
}
27412785

@@ -2756,13 +2800,9 @@ static byte[] toByteArray(final IOTriFunction<byte[], Integer, Integer, Integer>
27562800
return EMPTY_BYTE_ARRAY;
27572801
}
27582802
final byte[] data = byteArray(size);
2759-
int offset = 0;
2760-
int read;
2761-
while (offset < size && (read = input.apply(data, offset, size - offset)) != EOF) {
2762-
offset += read;
2763-
}
2764-
if (offset != size) {
2765-
throw new IOException("Unexpected read size, current: " + offset + ", expected: " + size);
2803+
final int read = read(input, data, 0, size);
2804+
if (read != size) {
2805+
throw new IOException("Unexpected read size, current: " + read + ", expected: " + size);
27662806
}
27672807
return data;
27682808
}

0 commit comments

Comments
 (0)