Commit 78c9df9

Add tests that arrow IPC data is validated (#7096)
* Add tests for validating IPC data read/written
* Add tests for invalid arrays
* Test with file decoder too
* Consolidate test
* Add test for test_validation_of_invalid_primitive_array
* Rework ArrayData validation to return error rather than panic
* Revert "Rework ArrayData validation to return error rather than panic" (reverts commit 0b88bbc)
* Revert "Add test for test_validation_of_invalid_primitive_array" (reverts commit 8d885a1)
1 parent 2bce568 commit 78c9df9
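The new tests all follow the same pattern: build an array that violates an Arrow invariant via an unsafe constructor, write it with the IPC writer (the tests note the write itself is ok), then assert that reading it back fails validation. The following is a minimal sketch of that flow, not part of the commit; it assumes the write_ipc/read_ipc test helpers introduced in the diff below and reuses the dictionary-array case from the tests.

// Sketch only: assumes the write_ipc / read_ipc test helpers added in this diff.
use std::sync::Arc;

use arrow_array::{ArrayRef, DictionaryArray, Int32Array, RecordBatch, StringArray};
use arrow_schema::ArrowError;

fn sketch_invalid_ipc_roundtrip() {
    // Key 200 indexes past the 3 dictionary values, so this array is invalid
    // and can only be built through the unsafe, unchecked constructor.
    let keys = Int32Array::from(vec![1, 200]);
    let values = StringArray::from_iter_values(["a", "b", "c"]);
    let invalid = unsafe { DictionaryArray::new_unchecked(keys, Arc::new(values)) };

    let rb = RecordBatch::try_from_iter([("a", Arc::new(invalid) as ArrayRef)]).unwrap();
    let buf = write_ipc(&rb);              // writing the invalid data succeeds
    let err = read_ipc(&buf).unwrap_err(); // reading it back reports the problem
    assert!(matches!(err, ArrowError::InvalidArgumentError(_)));
}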

File tree: 1 file changed, +167 −22


arrow-ipc/src/reader.rs

Lines changed: 167 additions & 22 deletions
@@ -1484,10 +1484,11 @@ mod tests {
 
     use super::*;
 
-    use crate::root_as_message;
+    use crate::convert::fb_to_schema;
+    use crate::{root_as_footer, root_as_message};
     use arrow_array::builder::{PrimitiveRunBuilder, UnionBuilder};
     use arrow_array::types::*;
-    use arrow_buffer::NullBuffer;
+    use arrow_buffer::{NullBuffer, OffsetBuffer};
     use arrow_data::ArrayDataBuilder;
 
     fn create_test_projection_schema() -> Schema {
@@ -1724,27 +1725,73 @@ mod tests {
         });
     }
 
-    fn roundtrip_ipc(rb: &RecordBatch) -> RecordBatch {
+    /// Write the record batch to an in-memory buffer in IPC File format
+    fn write_ipc(rb: &RecordBatch) -> Vec<u8> {
         let mut buf = Vec::new();
         let mut writer = crate::writer::FileWriter::try_new(&mut buf, rb.schema_ref()).unwrap();
         writer.write(rb).unwrap();
         writer.finish().unwrap();
-        drop(writer);
+        buf
+    }
 
-        let mut reader = FileReader::try_new(std::io::Cursor::new(buf), None).unwrap();
-        reader.next().unwrap().unwrap()
+    /// Return the first record batch read from the IPC File buffer
+    fn read_ipc(buf: &[u8]) -> Result<RecordBatch, ArrowError> {
+        let mut reader = FileReader::try_new(std::io::Cursor::new(buf), None)?;
+        reader.next().unwrap()
     }
 
-    fn roundtrip_ipc_stream(rb: &RecordBatch) -> RecordBatch {
+    fn roundtrip_ipc(rb: &RecordBatch) -> RecordBatch {
+        let buf = write_ipc(rb);
+        read_ipc(&buf).unwrap()
+    }
+
+    /// Return the first record batch read from the IPC File buffer
+    /// using the FileDecoder API
+    fn read_ipc_with_decoder(buf: Vec<u8>) -> Result<RecordBatch, ArrowError> {
+        let buffer = Buffer::from_vec(buf);
+        let trailer_start = buffer.len() - 10;
+        let footer_len = read_footer_length(buffer[trailer_start..].try_into().unwrap())?;
+        let footer = root_as_footer(&buffer[trailer_start - footer_len..trailer_start])
+            .map_err(|e| ArrowError::InvalidArgumentError(format!("Invalid footer: {e}")))?;
+
+        let schema = fb_to_schema(footer.schema().unwrap());
+
+        let mut decoder = FileDecoder::new(Arc::new(schema), footer.version());
+        // Read dictionaries
+        for block in footer.dictionaries().iter().flatten() {
+            let block_len = block.bodyLength() as usize + block.metaDataLength() as usize;
+            let data = buffer.slice_with_length(block.offset() as _, block_len);
+            decoder.read_dictionary(block, &data)?
+        }
+
+        // Read record batch
+        let batches = footer.recordBatches().unwrap();
+        assert_eq!(batches.len(), 1); // Only wrote a single batch
+
+        let block = batches.get(0);
+        let block_len = block.bodyLength() as usize + block.metaDataLength() as usize;
+        let data = buffer.slice_with_length(block.offset() as _, block_len);
+        Ok(decoder.read_record_batch(block, &data)?.unwrap())
+    }
+
+    /// Write the record batch to an in-memory buffer in IPC Stream format
+    fn write_stream(rb: &RecordBatch) -> Vec<u8> {
         let mut buf = Vec::new();
         let mut writer = crate::writer::StreamWriter::try_new(&mut buf, rb.schema_ref()).unwrap();
         writer.write(rb).unwrap();
         writer.finish().unwrap();
-        drop(writer);
+        buf
+    }
+
+    /// Return the first record batch read from the IPC Stream buffer
+    fn read_stream(buf: &[u8]) -> Result<RecordBatch, ArrowError> {
+        let mut reader = StreamReader::try_new(std::io::Cursor::new(buf), None)?;
+        reader.next().unwrap()
+    }
 
-        let mut reader =
-            crate::reader::StreamReader::try_new(std::io::Cursor::new(buf), None).unwrap();
-        reader.next().unwrap().unwrap()
+    fn roundtrip_ipc_stream(rb: &RecordBatch) -> RecordBatch {
+        let buf = write_stream(rb);
+        read_stream(&buf).unwrap()
     }
 
     #[test]
@@ -2403,17 +2450,10 @@ mod tests {
                     .build_unchecked(),
             )
         };
-
-        let batch = RecordBatch::try_new(schema.clone(), vec![invalid_struct_arr]).unwrap();
-
-        let mut buf = Vec::new();
-        let mut writer = crate::writer::FileWriter::try_new(&mut buf, schema.as_ref()).unwrap();
-        writer.write(&batch).unwrap();
-        writer.finish().unwrap();
-
-        let mut reader = FileReader::try_new(std::io::Cursor::new(buf), None).unwrap();
-        let err = reader.next().unwrap().unwrap_err();
-        assert!(matches!(err, ArrowError::InvalidArgumentError(_)));
+        expect_ipc_validation_error(
+            Arc::new(invalid_struct_arr),
+            "Invalid argument error: Incorrect array length for StructArray field \"b\", expected 4 got 3",
+        );
     }
 
     #[test]
@@ -2472,4 +2512,109 @@ mod tests {
             assert_eq!(decoded_batch.expect("Failed to read RecordBatch"), batch);
         });
     }
+
+    #[test]
+    fn test_validation_of_invalid_list_array() {
+        // ListArray with invalid offsets
+        let array = unsafe {
+            let values = Int32Array::from(vec![1, 2, 3]);
+            let bad_offsets = ScalarBuffer::<i32>::from(vec![0, 2, 4, 2]); // offsets can't go backwards
+            let offsets = OffsetBuffer::new_unchecked(bad_offsets); // INVALID array created
+            let field = Field::new_list_field(DataType::Int32, true);
+            let nulls = None;
+            ListArray::new(Arc::new(field), offsets, Arc::new(values), nulls)
+        };
+
+        expect_ipc_validation_error(
+            Arc::new(array),
+            "Invalid argument error: Offset invariant failure: offset at position 2 out of bounds: 4 > 2"
+        );
+    }
+
+    #[test]
+    fn test_validation_of_invalid_string_array() {
+        let valid: &[u8] = b" ";
+        let mut invalid = vec![];
+        invalid.extend_from_slice(b"ThisStringIsCertainlyLongerThan12Bytes");
+        invalid.extend_from_slice(INVALID_UTF8_FIRST_CHAR);
+        let binary_array = BinaryArray::from_iter(vec![None, Some(valid), None, Some(&invalid)]);
+        // The data is not valid UTF-8, so we cannot safely construct a correct
+        // StringArray; purposely create an invalid StringArray instead
+        let array = unsafe {
+            StringArray::new_unchecked(
+                binary_array.offsets().clone(),
+                binary_array.values().clone(),
+                binary_array.nulls().cloned(),
+            )
+        };
+        expect_ipc_validation_error(
+            Arc::new(array),
+            "Invalid argument error: Invalid UTF8 sequence at string index 3 (3..45): invalid utf-8 sequence of 1 bytes from index 38"
+        );
+    }
+
+    #[test]
+    fn test_validation_of_invalid_string_view_array() {
+        let valid: &[u8] = b" ";
+        let mut invalid = vec![];
+        invalid.extend_from_slice(b"ThisStringIsCertainlyLongerThan12Bytes");
+        invalid.extend_from_slice(INVALID_UTF8_FIRST_CHAR);
+        let binary_view_array =
+            BinaryViewArray::from_iter(vec![None, Some(valid), None, Some(&invalid)]);
+        // The data is not valid UTF-8, so we cannot safely construct a correct
+        // StringViewArray; purposely create an invalid StringViewArray instead
+        let array = unsafe {
+            StringViewArray::new_unchecked(
+                binary_view_array.views().clone(),
+                binary_view_array.data_buffers().to_vec(),
+                binary_view_array.nulls().cloned(),
+            )
+        };
+        expect_ipc_validation_error(
+            Arc::new(array),
+            "Invalid argument error: Encountered non-UTF-8 data at index 3: invalid utf-8 sequence of 1 bytes from index 38"
+        );
+    }
+
+    /// Test an invalid dictionary array (a key is larger than the number
+    /// of values it indexes into)
+    #[test]
+    fn test_validation_of_invalid_dictionary_array() {
+        let array = unsafe {
+            let values = StringArray::from_iter_values(["a", "b", "c"]);
+            let keys = Int32Array::from(vec![1, 200]); // keys are not valid for values
+            DictionaryArray::new_unchecked(keys, Arc::new(values))
+        };
+
+        expect_ipc_validation_error(
+            Arc::new(array),
+            "Invalid argument error: Value at position 1 out of bounds: 200 (should be in [0, 2])",
+        );
+    }
+
+    /// Invalid UTF-8 sequence in the first character
+    /// <https://stackoverflow.com/questions/1301402/example-invalid-utf8-string>
+    const INVALID_UTF8_FIRST_CHAR: &[u8] = &[0xa0, 0xa1, 0x20, 0x20];
+
+    /// Expect an error when reading the record batch using the IPC File or IPC Stream format
+    fn expect_ipc_validation_error(array: ArrayRef, expected_err: &str) {
+        let rb = RecordBatch::try_from_iter([("a", array)]).unwrap();
+
+        // IPC Stream format
+        let buf = write_stream(&rb); // write is ok
+        let err = read_stream(&buf).unwrap_err();
+        assert_eq!(err.to_string(), expected_err);
+
+        // IPC File format
+        let buf = write_ipc(&rb); // write is ok
+        let err = read_ipc(&buf).unwrap_err();
+        assert_eq!(err.to_string(), expected_err);
+
+        // TODO verify there is no error when validation is disabled
+        // see https://github.com/apache/arrow-rs/issues/3287
+
+        // IPC File format with FileDecoder
+        let err = read_ipc_with_decoder(buf).unwrap_err();
+        assert_eq!(err.to_string(), expected_err);
+    }
 }
