Skip to content

Commit 2330b08

Browse files
Add toByteArray(InputStream input, int size, int bufferSize) (#776)
* feat: Add incremental `toByteArray` method This introduces `toByteArray(InputStream input, int size, int bufferSize)`, which reads the stream in chunks of `bufferSize` instead of allocating the full array up front. By reading incrementally, the method: * Validates that the stream actually contains `size` bytes before completing the allocation. * Prevents excessive memory usage if a corrupted or malicious `size` value is provided. * Offers safer handling for untrusted input compared to the direct-allocation variant. * fix: move back positivity check to helper method * fix: changelog entry * fix: Javadoc details * fix: remove negative size check * fix: exception message * fix: restore parameter name * fix: remove details and add guidance * fix: simplify description * fix: apply an incremental threshold to all `toByteArray` overloads * Extends incremental (chunked) reading to all `toByteArray` variants when the requested size is unknown or exceeds 128 KiB. * The 128 KiB threshold matches the default buffer size used in CPython. * Updates Javadoc to emphasize that memory usage grows **proportionally** with the number of bytes actually **read**, making these methods suitable for large streams when sufficient memory is available. * fix: Javadoc of constants * fix: Formatting * fix: restore previous `toByteArray(InputStream, int)` behavior * fix: use default buffer size as chunk size * fix: possible NPE * fix: remove unrelated change * fix: `toByteArray(InputStream)` Javadoc * fix: Javadoc * Fix comment formatting for SOFT_MAX_ARRAY_LENGTH --------- Co-authored-by: Gary Gregory <[email protected]>
1 parent aebc9bc commit 2330b08

File tree

3 files changed

+158
-36
lines changed

3 files changed

+158
-36
lines changed

src/changes/changes.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ The <action> type attribute can be add,update,fix,remove.
6060
<action dev="ggregory" type="add" due-to="Gary Gregory">Add org.apache.commons.io.output.ProxyOutputStream.writeRepeat(byte[], int, int, long).</action>
6161
<action dev="ggregory" type="add" due-to="Gary Gregory">Add org.apache.commons.io.output.ProxyOutputStream.writeRepeat(byte[], long).</action>
6262
<action dev="ggregory" type="add" due-to="Gary Gregory">Add org.apache.commons.io.output.ProxyOutputStream.writeRepeat(int, long).</action>
63+
<action dev="pkarwasz" type="add" due-to="Piotr P. Karwasz">Add IOUtils.toByteArray(InputStream, int, int) for safer chunked reading with size validation.</action>
6364
<!-- UPDATE -->
6465
<action type="update" dev="ggregory" due-to="Gary Gregory, Dependabot">Bump org.apache.commons:commons-parent from 85 to 87 #774.</action>
6566
<action type="update" dev="ggregory" due-to="Gary Gregory">[test] Bump commons-codec:commons-codec from 1.18.0 to 1.19.0.</action>

src/main/java/org/apache/commons/io/IOUtils.java

Lines changed: 112 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -65,14 +65,14 @@
6565
import org.apache.commons.io.function.IOConsumer;
6666
import org.apache.commons.io.function.IOSupplier;
6767
import org.apache.commons.io.function.IOTriFunction;
68+
import org.apache.commons.io.input.BoundedInputStream;
6869
import org.apache.commons.io.input.CharSequenceReader;
6970
import org.apache.commons.io.input.QueueInputStream;
7071
import org.apache.commons.io.output.AppendableWriter;
7172
import org.apache.commons.io.output.ByteArrayOutputStream;
7273
import org.apache.commons.io.output.NullOutputStream;
7374
import org.apache.commons.io.output.NullWriter;
7475
import org.apache.commons.io.output.StringBuilderWriter;
75-
import org.apache.commons.io.output.ThresholdingOutputStream;
7676
import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
7777

7878
/**
@@ -221,6 +221,14 @@ public class IOUtils {
221221
*/
222222
private static final char[] SCRATCH_CHAR_BUFFER_WO = charArray();
223223

224+
/**
225+
* The maximum size of an array in many Java VMs.
226+
* <p>
227+
* The constant is copied from OpenJDK's {@code jdk.internal.util.ArraysSupport.SOFT_MAX_ARRAY_LENGTH}.
228+
* </p>
229+
*/
230+
private static final int SOFT_MAX_ARRAY_LENGTH = Integer.MAX_VALUE - 8;
231+
224232
/**
225233
* Returns the given InputStream if it is already a {@link BufferedInputStream}, otherwise creates a
226234
* BufferedInputStream from the given InputStream.
@@ -2637,57 +2645,63 @@ public static BufferedReader toBufferedReader(final Reader reader, final int siz
26372645
}
26382646

26392647
/**
2640-
* Gets the contents of an {@link InputStream} as a {@code byte[]}.
2641-
* <p>
2642-
* This method buffers the input internally, so there is no need to use a {@link BufferedInputStream}.
2643-
* </p>
2648+
* Reads all the bytes from an input stream into a byte array.
26442649
*
2645-
* @param inputStream the {@link InputStream} to read.
2646-
* @return the requested byte array.
2647-
* @throws NullPointerException if the InputStream is {@code null}.
2648-
* @throws IOException if an I/O error occurs or reading more than {@link Integer#MAX_VALUE} occurs.
2650+
* <p>The memory used by this method is <strong>proportional</strong> to the number
2651+
* of bytes read, which is limited only by the maximum array size (just below {@link Integer#MAX_VALUE}). Only streams
2652+
* which fit into a single byte array with roughly 2 GiB limit can be processed
2653+
* with this method.</p>
2654+
*
2655+
* @param inputStream The {@link InputStream} to read; must not be {@code null}.
2656+
* @return A new byte array.
2657+
* @throws IllegalArgumentException If the size of the stream is greater than the maximum array size.
2658+
* @throws IOException If an I/O error occurs while reading.
2659+
* @throws NullPointerException If {@code inputStream} is {@code null}.
26492660
*/
26502661
public static byte[] toByteArray(final InputStream inputStream) throws IOException {
2651-
// We use a ThresholdingOutputStream to avoid reading AND writing more than Integer.MAX_VALUE.
2652-
try (UnsynchronizedByteArrayOutputStream ubaOutput = UnsynchronizedByteArrayOutputStream.builder().get();
2653-
ThresholdingOutputStream thresholdOutput = new ThresholdingOutputStream(Integer.MAX_VALUE, os -> {
2654-
throw new IllegalArgumentException(String.format("Cannot read more than %,d into a byte array", Integer.MAX_VALUE));
2655-
}, os -> ubaOutput)) {
2656-
copy(inputStream, thresholdOutput);
2657-
return ubaOutput.toByteArray();
2662+
// Using SOFT_MAX_ARRAY_LENGTH guarantees that size() will not overflow
2663+
final UnsynchronizedByteArrayOutputStream output = copyToOutputStream(inputStream, SOFT_MAX_ARRAY_LENGTH + 1, DEFAULT_BUFFER_SIZE);
2664+
if (output.size() > SOFT_MAX_ARRAY_LENGTH) {
2665+
throw new IllegalArgumentException(String.format("Cannot read more than %,d into a byte array", SOFT_MAX_ARRAY_LENGTH));
26582666
}
2667+
return output.toByteArray();
26592668
}
26602669

26612670
/**
2662-
* Gets the contents of an {@link InputStream} as a {@code byte[]}. Use this method instead of
2663-
* {@link #toByteArray(InputStream)} when {@link InputStream} size is known.
2671+
* Reads exactly {@code size} bytes from the given {@link InputStream} into a new {@code byte[]}.
26642672
*
2665-
* @param input the {@link InputStream} to read.
2666-
* @param size the size of {@link InputStream} to read, where 0 &lt; {@code size} &lt;= length of input stream.
2667-
* @return byte [] of length {@code size}.
2668-
* @throws IOException if an I/O error occurs or {@link InputStream} length is smaller than parameter {@code size}.
2669-
* @throws IllegalArgumentException if {@code size} is less than zero.
2673+
* <p>This variant always allocates the whole requested array size,
2674+
* for a dynamically growing variant, use {@link #toByteArray(InputStream, int, int)},
2675+
* which enforces stricter memory usage constraints.</p>
2676+
*
2677+
* @param input the {@link InputStream} to read; must not be {@code null}.
2678+
* @param size the exact number of bytes to read; must be {@code >= 0}.
2679+
* @return a new byte array of length {@code size}.
2680+
* @throws IllegalArgumentException if {@code size} is negative.
2681+
* @throws EOFException if the stream ends before {@code size} bytes are read.
2682+
* @throws IOException if an I/O error occurs while reading.
2683+
* @throws NullPointerException if {@code input} is {@code null}.
26702684
* @since 2.1
26712685
*/
2672-
@SuppressWarnings("resource")
26732686
public static byte[] toByteArray(final InputStream input, final int size) throws IOException {
26742687
return toByteArray(Objects.requireNonNull(input, "input")::read, size);
26752688
}
26762689

26772690
/**
2678-
* Gets contents of an {@link InputStream} as a {@code byte[]}.
2679-
* Use this method instead of {@link #toByteArray(InputStream)}
2680-
* when {@link InputStream} size is known.
2681-
* <strong>NOTE:</strong> the method checks that the length can safely be cast to an int without truncation
2682-
* before using {@link IOUtils#toByteArray(InputStream, int)} to read into the byte array.
2683-
* (Arrays can have no more than Integer.MAX_VALUE entries anyway.)
2691+
* Reads exactly {@code size} bytes from the given {@link InputStream} into a new {@code byte[]}.
26842692
*
2685-
* @param input the {@link InputStream} to read.
2686-
* @param size the size of {@link InputStream} to read, where 0 &lt; {@code size} &lt;= min(Integer.MAX_VALUE, length of input stream).
2687-
* @return byte [] the requested byte array, of length {@code size}.
2688-
* @throws IOException if an I/O error occurs or {@link InputStream} length is less than {@code size}.
2689-
* @throws IllegalArgumentException if size is less than zero or size is greater than Integer.MAX_VALUE.
2690-
* @see IOUtils#toByteArray(InputStream, int)
2693+
* <p>This variant always allocates the whole requested array size,
2694+
* for a dynamically growing variant, use {@link #toByteArray(InputStream, int, int)},
2695+
* which enforces stricter memory usage constraints.</p>
2696+
*
2697+
* @param input the {@link InputStream} to read; must not be {@code null}.
2698+
* @param size the exact number of bytes to read; must be {@code >= 0} and {@code <= Integer.MAX_VALUE}.
2699+
* @return a new byte array of length {@code size}.
2700+
* @throws IllegalArgumentException if {@code size} is negative or does not fit into an int.
2701+
* @throws EOFException if the stream ends before {@code size} bytes are read.
2702+
* @throws IOException if an I/O error occurs while reading.
2703+
* @throws NullPointerException if {@code input} is {@code null}.
2704+
* @see #toByteArray(InputStream, int, int)
26912705
* @since 2.1
26922706
*/
26932707
public static byte[] toByteArray(final InputStream input, final long size) throws IOException {
@@ -2697,6 +2711,68 @@ public static byte[] toByteArray(final InputStream input, final long size) throw
26972711
return toByteArray(input, (int) size);
26982712
}
26992713

2714+
/**
2715+
* Reads exactly {@code size} bytes from the given {@link InputStream} into a new {@code byte[]}.
2716+
*
2717+
* <p>The memory used by this method is <strong>proportional</strong> to the number
2718+
* of bytes read and limited by the specified {@code size}. This makes it suitable for
2719+
* processing large input streams, provided that <strong>sufficient</strong> heap space is
2720+
* available.</p>
2721+
*
2722+
* <p>This method processes the input stream in successive chunks of up to
2723+
* {@code chunkSize} bytes.</p>
2724+
*
2725+
* @param input the {@link InputStream} to read; must not be {@code null}.
2726+
* @param size the exact number of bytes to read; must be {@code >= 0}.
2727+
* The actual bytes read are validated to equal {@code size}.
2728+
* @param chunkSize The chunk size for incremental reading; must be {@code > 0}.
2729+
* @return a new byte array of length {@code size}.
2730+
* @throws IllegalArgumentException if {@code size} is negative or {@code chunkSize <= 0}.
2731+
* @throws EOFException if the stream ends before {@code size} bytes are read.
2732+
* @throws IOException if an I/O error occurs while reading.
2733+
* @throws NullPointerException if {@code input} is {@code null}.
2734+
* @since 2.21.0
2735+
*/
2736+
public static byte[] toByteArray(final InputStream input, final int size, final int chunkSize) throws IOException {
2737+
Objects.requireNonNull(input, "input");
2738+
if (chunkSize <= 0) {
2739+
throw new IllegalArgumentException("Chunk size must be greater than zero: " + chunkSize);
2740+
}
2741+
if (size <= chunkSize) {
2742+
// throws if size < 0
2743+
return toByteArray(input::read, size);
2744+
}
2745+
final UnsynchronizedByteArrayOutputStream output = copyToOutputStream(input, size, chunkSize);
2746+
if (output.size() != size) {
2747+
throw new EOFException("Unexpected read size, current: " + output.size() + ", expected: " + size);
2748+
}
2749+
return output.toByteArray();
2750+
}
2751+
2752+
/**
2753+
* Copies up to {@code limit} bytes from the given {@link InputStream} into a new {@link UnsynchronizedByteArrayOutputStream}.
2754+
*
2755+
* @param input The {@link InputStream} to read; must not be {@code null}.
2756+
* @param limit The maximum number of bytes to read; must be {@code >= 0}.
2757+
* The number of bytes actually read is not validated here; callers check the size of the returned stream.
2758+
* @param bufferSize The buffer size of the output stream; must be {@code > 0}.
2759+
* @return an {@link UnsynchronizedByteArrayOutputStream} containing the read bytes.
2760+
*/
2761+
private static UnsynchronizedByteArrayOutputStream copyToOutputStream(
2762+
final InputStream input, final long limit, final int bufferSize) throws IOException {
2763+
try (UnsynchronizedByteArrayOutputStream output = UnsynchronizedByteArrayOutputStream.builder()
2764+
.setBufferSize(bufferSize)
2765+
.get();
2766+
InputStream boundedInput = BoundedInputStream.builder()
2767+
.setMaxCount(limit)
2768+
.setPropagateClose(false)
2769+
.setInputStream(input)
2770+
.get()) {
2771+
output.write(boundedInput);
2772+
return output;
2773+
}
2774+
}
2775+
27002776
/**
27012777
* Gets the contents of an input as a {@code byte[]}.
27022778
*

src/test/java/org/apache/commons/io/IOUtilsTest.java

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,9 @@
9090
import org.junit.jupiter.api.Disabled;
9191
import org.junit.jupiter.api.Test;
9292
import org.junit.jupiter.api.io.TempDir;
93+
import org.junit.jupiter.params.ParameterizedTest;
94+
import org.junit.jupiter.params.provider.Arguments;
95+
import org.junit.jupiter.params.provider.MethodSource;
9396

9497
/**
9598
* This is used to test {@link IOUtils} for correctness. The following checks are performed:
@@ -1659,6 +1662,48 @@ void testToByteArray_InputStream_SizeZero() throws Exception {
16591662
}
16601663
}
16611664

1665+
@ParameterizedTest
1666+
@MethodSource
1667+
void testToByteArray_InputStream_Size_BufferSize_Succeeds(byte[] data, int size, int bufferSize) throws IOException {
1668+
final ByteArrayInputStream input = new ByteArrayInputStream(data);
1669+
final byte[] expected = Arrays.copyOf(data, size);
1670+
final byte[] actual = IOUtils.toByteArray(input, size, bufferSize);
1671+
assertArrayEquals(expected, actual);
1672+
}
1673+
1674+
private static Stream<Arguments> testToByteArray_InputStream_Size_BufferSize_Succeeds() {
1675+
final byte[] data = new byte[1024];
1676+
for (int i = 0; i < 1024; i++) {
1677+
data[i] = (byte) i;
1678+
}
1679+
return Stream.of(
1680+
// Eager reading
1681+
Arguments.of(data.clone(), 512, 1024),
1682+
// Incremental reading
1683+
Arguments.of(data.clone(), 1024, 512),
1684+
// No reading
1685+
Arguments.of(data.clone(), 0, 128));
1686+
}
1687+
1688+
@ParameterizedTest
1689+
@MethodSource
1690+
void testToByteArray_InputStream_Size_BufferSize_Throws(
1691+
int size, int bufferSize, Class<? extends Exception> exceptionClass) throws IOException {
1692+
try (InputStream input = new NullInputStream(0)) {
1693+
assertThrows(exceptionClass, () -> IOUtils.toByteArray(input, size, bufferSize));
1694+
}
1695+
}
1696+
1697+
static Stream<Arguments> testToByteArray_InputStream_Size_BufferSize_Throws() {
1698+
return Stream.of(
1699+
// Negative size
1700+
Arguments.of(-1, 128, IllegalArgumentException.class),
1701+
// Invalid buffer size
1702+
Arguments.of(0, 0, IllegalArgumentException.class),
1703+
// Huge size: should not cause OutOfMemoryError
1704+
Arguments.of(Integer.MAX_VALUE, 128, EOFException.class));
1705+
}
1706+
16621707
@Test
16631708
void testToByteArray_Reader() throws IOException {
16641709
final String charsetName = UTF_8;

0 commit comments

Comments
 (0)