@@ -1484,10 +1484,11 @@ mod tests {
14841484
14851485 use super :: * ;
14861486
1487- use crate :: root_as_message;
1487+ use crate :: convert:: fb_to_schema;
1488+ use crate :: { root_as_footer, root_as_message} ;
14881489 use arrow_array:: builder:: { PrimitiveRunBuilder , UnionBuilder } ;
14891490 use arrow_array:: types:: * ;
1490- use arrow_buffer:: NullBuffer ;
1491+ use arrow_buffer:: { NullBuffer , OffsetBuffer } ;
14911492 use arrow_data:: ArrayDataBuilder ;
14921493
14931494 fn create_test_projection_schema ( ) -> Schema {
@@ -1724,27 +1725,73 @@ mod tests {
17241725 } ) ;
17251726 }
17261727
1727- fn roundtrip_ipc ( rb : & RecordBatch ) -> RecordBatch {
1728+ /// Write `rb` to an in-memory buffer in the IPC File format and return the bytes (panics on any writer error)
1729+ fn write_ipc ( rb : & RecordBatch ) -> Vec < u8 > {
17281730 let mut buf = Vec :: new ( ) ;
17291731 let mut writer = crate :: writer:: FileWriter :: try_new ( & mut buf, rb. schema_ref ( ) ) . unwrap ( ) ;
17301732 writer. write ( rb) . unwrap ( ) ;
17311733 writer. finish ( ) . unwrap ( ) ;
1732- drop ( writer) ;
1734+ buf
1735+ }
17331736
1734- let mut reader = FileReader :: try_new ( std:: io:: Cursor :: new ( buf) , None ) . unwrap ( ) ;
1735- reader. next ( ) . unwrap ( ) . unwrap ( )
1737+ /// Return the first record batch read from the IPC File buffer; read errors come back as `Err`, a file with no batches panics
1738+ fn read_ipc ( buf : & [ u8 ] ) -> Result < RecordBatch , ArrowError > {
1739+ let mut reader = FileReader :: try_new ( std:: io:: Cursor :: new ( buf) , None ) ?;
1740+ reader. next ( ) . unwrap ( ) // unwraps only the `Option`; the inner `Result` is handed back to the caller
17361741 }
17371742
1738- fn roundtrip_ipc_stream ( rb : & RecordBatch ) -> RecordBatch {
1743+ fn roundtrip_ipc ( rb : & RecordBatch ) -> RecordBatch { // write `rb` in the IPC File format, then read the first batch back
1744+ let buf = write_ipc ( rb) ;
1745+ read_ipc ( & buf) . unwrap ( ) // panics if reading the batch back fails
1746+ }
1747+
1748+ /// Return the first record batch read from the IPC File buffer
1749+ /// using the FileDecoder API (the footer is located and parsed by hand; asserts the file holds exactly one batch)
1750+ fn read_ipc_with_decoder ( buf : Vec < u8 > ) -> Result < RecordBatch , ArrowError > {
1751+ let buffer = Buffer :: from_vec ( buf) ;
1752+ let trailer_start = buffer. len ( ) - 10 ; // the final 10 bytes are handed to read_footer_length; assumes the buffer is at least 10 bytes long
1753+ let footer_len = read_footer_length ( buffer[ trailer_start..] . try_into ( ) . unwrap ( ) ) ?;
1754+ let footer = root_as_footer ( & buffer[ trailer_start - footer_len..trailer_start] ) // the footer bytes end exactly where the trailer begins
1755+ . map_err ( |e| ArrowError :: InvalidArgumentError ( format ! ( "Invalid footer: {e}" ) ) ) ?;
1756+
1757+ let schema = fb_to_schema ( footer. schema ( ) . unwrap ( ) ) ; // panics if the footer carries no schema
1758+
1759+ let mut decoder = FileDecoder :: new ( Arc :: new ( schema) , footer. version ( ) ) ;
1760+ // Feed every dictionary block listed in the footer to the decoder before decoding the batch
1761+ for block in footer. dictionaries ( ) . iter ( ) . flatten ( ) {
1762+ let block_len = block. bodyLength ( ) as usize + block. metaDataLength ( ) as usize ; // block size = metadata length + body length
1763+ let data = buffer. slice_with_length ( block. offset ( ) as _ , block_len) ;
1764+ decoder. read_dictionary ( block, & data) ?
1765+ }
1766+
1767+ // Read the record batch block listed in the footer
1768+ let batches = footer. recordBatches ( ) . unwrap ( ) ;
1769+ assert_eq ! ( batches. len( ) , 1 ) ; // Only wrote a single batch
1770+
1771+ let block = batches. get ( 0 ) ;
1772+ let block_len = block. bodyLength ( ) as usize + block. metaDataLength ( ) as usize ;
1773+ let data = buffer. slice_with_length ( block. offset ( ) as _ , block_len) ;
1774+ Ok ( decoder. read_record_batch ( block, & data) ?. unwrap ( ) ) // `?` propagates decode errors; the unwrap panics if no batch is produced
1775+ }
1776+
1777+ /// Write `rb` to an in-memory buffer in the IPC Stream format and return the bytes (panics on any writer error)
1778+ fn write_stream ( rb : & RecordBatch ) -> Vec < u8 > {
17391779 let mut buf = Vec :: new ( ) ;
17401780 let mut writer = crate :: writer:: StreamWriter :: try_new ( & mut buf, rb. schema_ref ( ) ) . unwrap ( ) ;
17411781 writer. write ( rb) . unwrap ( ) ;
17421782 writer. finish ( ) . unwrap ( ) ;
1743- drop ( writer) ;
1783+ buf
1784+ }
1785+
1786+ /// Return the first record batch read from the IPC Stream buffer; read errors come back as `Err`, a stream with no batches panics
1787+ fn read_stream ( buf : & [ u8 ] ) -> Result < RecordBatch , ArrowError > {
1788+ let mut reader = StreamReader :: try_new ( std:: io:: Cursor :: new ( buf) , None ) ?;
1789+ reader. next ( ) . unwrap ( ) // unwraps only the `Option`; the inner `Result` is handed back to the caller
1790+ }
17441791
1745- let mut reader =
1746- crate :: reader :: StreamReader :: try_new ( std :: io :: Cursor :: new ( buf ) , None ) . unwrap ( ) ;
1747- reader . next ( ) . unwrap ( ) . unwrap ( )
1792+ fn roundtrip_ipc_stream ( rb : & RecordBatch ) -> RecordBatch { // write `rb` in the IPC Stream format, then read the first batch back
1793+ let buf = write_stream ( rb ) ;
1794+ read_stream ( & buf ) . unwrap ( ) // panics if reading the batch back fails
17481795 }
17491796
17501797 #[ test]
@@ -2403,17 +2450,10 @@ mod tests {
24032450 . build_unchecked ( ) ,
24042451 )
24052452 } ;
2406-
2407- let batch = RecordBatch :: try_new ( schema. clone ( ) , vec ! [ invalid_struct_arr] ) . unwrap ( ) ;
2408-
2409- let mut buf = Vec :: new ( ) ;
2410- let mut writer = crate :: writer:: FileWriter :: try_new ( & mut buf, schema. as_ref ( ) ) . unwrap ( ) ;
2411- writer. write ( & batch) . unwrap ( ) ;
2412- writer. finish ( ) . unwrap ( ) ;
2413-
2414- let mut reader = FileReader :: try_new ( std:: io:: Cursor :: new ( buf) , None ) . unwrap ( ) ;
2415- let err = reader. next ( ) . unwrap ( ) . unwrap_err ( ) ;
2416- assert ! ( matches!( err, ArrowError :: InvalidArgumentError ( _) ) ) ;
2453+ expect_ipc_validation_error (
2454+ Arc :: new ( invalid_struct_arr) ,
2455+ "Invalid argument error: Incorrect array length for StructArray field \" b\" , expected 4 got 3" ,
2456+ ) ;
24172457 }
24182458
24192459 #[ test]
@@ -2472,4 +2512,109 @@ mod tests {
24722512 assert_eq ! ( decoded_batch. expect( "Failed to read RecordBatch" ) , batch) ;
24732513 } ) ;
24742514 }
2515+
2516+ #[ test]
2517+ fn test_validation_of_invalid_list_array ( ) {
2518+ // Build a ListArray whose offsets decrease (0, 2, 4, 2), then expect every IPC reader to reject it
2519+ let array = unsafe {
2520+ let values = Int32Array :: from ( vec ! [ 1 , 2 , 3 ] ) ;
2521+ let bad_offsets = ScalarBuffer :: < i32 > :: from ( vec ! [ 0 , 2 , 4 , 2 ] ) ; // invalid: offsets can't go backwards (4 -> 2)
2522+ let offsets = OffsetBuffer :: new_unchecked ( bad_offsets) ; // unchecked constructor: the INVALID offsets are accepted as-is
2523+ let field = Field :: new_list_field ( DataType :: Int32 , true ) ;
2524+ let nulls = None ;
2525+ ListArray :: new ( Arc :: new ( field) , offsets, Arc :: new ( values) , nulls)
2526+ } ;
2527+
2528+ expect_ipc_validation_error (
2529+ Arc :: new ( array) ,
2530+ "Invalid argument error: Offset invariant failure: offset at position 2 out of bounds: 4 > 2"
2531+ ) ;
2532+ }
2533+
2534+ #[ test]
2535+ fn test_validation_of_invalid_string_array ( ) {
2536+ let valid: & [ u8 ] = b" " ;
2537+ let mut invalid = vec ! [ ] ;
2538+ invalid. extend_from_slice ( b"ThisStringIsCertainlyLongerThan12Bytes" ) ;
2539+ invalid. extend_from_slice ( INVALID_UTF8_FIRST_CHAR ) ;
2540+ let binary_array = BinaryArray :: from_iter ( vec ! [ None , Some ( valid) , None , Some ( & invalid) ] ) ;
2541+ // The data is not valid UTF-8, so a correct StringArray cannot be constructed from it safely;
2542+ // purposely create an invalid StringArray from the BinaryArray's parts with the unchecked constructor
2543+ let array = unsafe {
2544+ StringArray :: new_unchecked (
2545+ binary_array. offsets ( ) . clone ( ) ,
2546+ binary_array. values ( ) . clone ( ) ,
2547+ binary_array. nulls ( ) . cloned ( ) ,
2548+ )
2549+ } ;
2550+ expect_ipc_validation_error (
2551+ Arc :: new ( array) ,
2552+ "Invalid argument error: Invalid UTF8 sequence at string index 3 (3..45): invalid utf-8 sequence of 1 bytes from index 38"
2553+ ) ;
2554+ }
2555+
2556+ #[ test]
2557+ fn test_validation_of_invalid_string_view_array ( ) {
2558+ let valid: & [ u8 ] = b" " ;
2559+ let mut invalid = vec ! [ ] ;
2560+ invalid. extend_from_slice ( b"ThisStringIsCertainlyLongerThan12Bytes" ) ;
2561+ invalid. extend_from_slice ( INVALID_UTF8_FIRST_CHAR ) ;
2562+ let binary_view_array =
2563+ BinaryViewArray :: from_iter ( vec ! [ None , Some ( valid) , None , Some ( & invalid) ] ) ;
2564+ // The data is not valid UTF-8, so a correct StringViewArray cannot be constructed from it safely;
2565+ // purposely create an invalid StringViewArray from the BinaryViewArray's parts with the unchecked constructor
2566+ let array = unsafe {
2567+ StringViewArray :: new_unchecked (
2568+ binary_view_array. views ( ) . clone ( ) ,
2569+ binary_view_array. data_buffers ( ) . to_vec ( ) ,
2570+ binary_view_array. nulls ( ) . cloned ( ) ,
2571+ )
2572+ } ;
2573+ expect_ipc_validation_error (
2574+ Arc :: new ( array) ,
2575+ "Invalid argument error: Encountered non-UTF-8 data at index 3: invalid utf-8 sequence of 1 bytes from index 38"
2576+ ) ;
2577+ }
2578+
2579+ /// Build an invalid dictionary array (a key is out of range for the values)
2580+ /// and expect the IPC readers to report the out-of-bounds key
2581+ #[ test]
2582+ fn test_validation_of_invalid_dictionary_array ( ) {
2583+ let array = unsafe {
2584+ let values = StringArray :: from_iter_values ( [ "a" , "b" , "c" ] ) ;
2585+ let keys = Int32Array :: from ( vec ! [ 1 , 200 ] ) ; // key 200 is not valid for the 3 dictionary values
2586+ DictionaryArray :: new_unchecked ( keys, Arc :: new ( values) )
2587+ } ;
2588+
2589+ expect_ipc_validation_error (
2590+ Arc :: new ( array) ,
2591+ "Invalid argument error: Value at position 1 out of bounds: 200 (should be in [0, 2])" ,
2592+ ) ;
2593+ }
2594+
2595+ /// Bytes that are not valid UTF-8: the sequence starts with 0xa0, a continuation byte that cannot begin a character
2596+ /// <https://stackoverflow.com/questions/1301402/example-invalid-utf8-string>
2597+ const INVALID_UTF8_FIRST_CHAR : & [ u8 ] = & [ 0xa0 , 0xa1 , 0x20 , 0x20 ] ;
2598+
2599+ /// Write `array` as a single-column batch and expect each read path (IPC Stream, IPC File, FileDecoder) to fail with exactly `expected_err`
2600+ fn expect_ipc_validation_error ( array : ArrayRef , expected_err : & str ) {
2601+ let rb = RecordBatch :: try_from_iter ( [ ( "a" , array) ] ) . unwrap ( ) ;
2602+
2603+ // IPC Stream format
2604+ let buf = write_stream ( & rb) ; // write is ok
2605+ let err = read_stream ( & buf) . unwrap_err ( ) ;
2606+ assert_eq ! ( err. to_string( ) , expected_err) ;
2607+
2608+ // IPC File format
2609+ let buf = write_ipc ( & rb) ; // write is ok
2610+ let err = read_ipc ( & buf) . unwrap_err ( ) ;
2611+ assert_eq ! ( err. to_string( ) , expected_err) ;
2612+
2613+ // TODO verify there is no error when validation is disabled
2614+ // see https://github.com/apache/arrow-rs/issues/3287
2615+
2616+ // IPC File format again, this time decoded with FileDecoder (consumes the file buffer written above)
2617+ let err = read_ipc_with_decoder ( buf) . unwrap_err ( ) ;
2618+ assert_eq ! ( err. to_string( ) , expected_err) ;
2619+ }
24752620}
0 commit comments