@@ -22,7 +22,7 @@ use std::ops::Range;
2222use std:: str:: FromStr ;
2323use std:: sync:: Arc ;
2424
25- use arrow_arith:: boolean:: { and, is_not_null, is_null, not, or} ;
25+ use arrow_arith:: boolean:: { and, and_kleene , is_not_null, is_null, not, or, or_kleene } ;
2626use arrow_array:: { Array , ArrayRef , BooleanArray , RecordBatch } ;
2727use arrow_ord:: cmp:: { eq, gt, gt_eq, lt, lt_eq, neq} ;
2828use arrow_schema:: {
@@ -827,7 +827,7 @@ impl<'a> BoundPredicateVisitor for PredicateConverter<'a> {
827827 Ok ( Box :: new ( move |batch| {
828828 let left = lhs ( batch. clone ( ) ) ?;
829829 let right = rhs ( batch) ?;
830- and ( & left, & right)
830+ and_kleene ( & left, & right)
831831 } ) )
832832 }
833833
@@ -839,7 +839,7 @@ impl<'a> BoundPredicateVisitor for PredicateConverter<'a> {
839839 Ok ( Box :: new ( move |batch| {
840840 let left = lhs ( batch. clone ( ) ) ?;
841841 let right = rhs ( batch) ?;
842- or ( & left, & right)
842+ or_kleene ( & left, & right)
843843 } ) )
844844 }
845845
@@ -1165,18 +1165,29 @@ impl<R: FileRead> AsyncFileReader for ArrowFileReader<R> {
11651165#[ cfg( test) ]
11661166mod tests {
11671167 use std:: collections:: { HashMap , HashSet } ;
1168+ use std:: fs:: File ;
11681169 use std:: sync:: Arc ;
11691170
1171+ use arrow_array:: cast:: AsArray ;
1172+ use arrow_array:: { ArrayRef , RecordBatch , StringArray } ;
11701173 use arrow_schema:: { DataType , Field , Schema as ArrowSchema , TimeUnit } ;
1171- use parquet:: arrow:: ProjectionMask ;
1174+ use futures:: TryStreamExt ;
1175+ use parquet:: arrow:: { ArrowWriter , ProjectionMask } ;
1176+ use parquet:: basic:: Compression ;
1177+ use parquet:: file:: properties:: WriterProperties ;
11721178 use parquet:: schema:: parser:: parse_message_type;
11731179 use parquet:: schema:: types:: SchemaDescriptor ;
1180+ use tempfile:: TempDir ;
11741181
11751182 use crate :: arrow:: reader:: { CollectFieldIdVisitor , PARQUET_FIELD_ID_META_KEY } ;
1176- use crate :: arrow:: ArrowReader ;
1183+ use crate :: arrow:: { ArrowReader , ArrowReaderBuilder } ;
11771184 use crate :: expr:: visitors:: bound_predicate_visitor:: visit;
1178- use crate :: expr:: { Bind , Reference } ;
1179- use crate :: spec:: { NestedField , PrimitiveType , Schema , SchemaRef , Type } ;
1185+ use crate :: expr:: { Bind , Predicate , Reference } ;
1186+ use crate :: io:: FileIO ;
1187+ use crate :: scan:: { FileScanTask , FileScanTaskStream } ;
1188+ use crate :: spec:: {
1189+ DataContentType , DataFileFormat , Datum , NestedField , PrimitiveType , Schema , SchemaRef , Type ,
1190+ } ;
11801191 use crate :: ErrorKind ;
11811192
11821193 fn table_schema_simple ( ) -> SchemaRef {
@@ -1336,4 +1347,138 @@ message schema {
13361347 . expect ( "Some ProjectionMask" ) ;
13371348 assert_eq ! ( mask, ProjectionMask :: leaves( & parquet_schema, vec![ 0 ] ) ) ;
13381349 }
1350+
1351+ #[ tokio:: test]
1352+ async fn test_kleene_logic_or_behaviour ( ) {
1353+ // a IS NULL OR a = 'foo'
1354+ let predicate = Reference :: new ( "a" )
1355+ . is_null ( )
1356+ . or ( Reference :: new ( "a" ) . equal_to ( Datum :: string ( "foo" ) ) ) ;
1357+
1358+ // Table data: [NULL, "foo", "bar"]
1359+ let data_for_col_a = vec ! [ None , Some ( "foo" . to_string( ) ) , Some ( "bar" . to_string( ) ) ] ;
1360+
1361+ // Expected: [NULL, "foo"].
1362+ let expected = vec ! [ None , Some ( "foo" . to_string( ) ) ] ;
1363+
1364+ let ( file_io, schema, table_location, _temp_dir) = setup_kleene_logic ( data_for_col_a) ;
1365+ let reader = ArrowReaderBuilder :: new ( file_io) . build ( ) ;
1366+
1367+ let result_data = test_perform_read ( predicate, schema, table_location, reader) . await ;
1368+
1369+ assert_eq ! ( result_data, expected) ;
1370+ }
1371+
1372+ #[ tokio:: test]
1373+ async fn test_kleene_logic_and_behaviour ( ) {
1374+ // a IS NOT NULL AND a != 'foo'
1375+ let predicate = Reference :: new ( "a" )
1376+ . is_not_null ( )
1377+ . and ( Reference :: new ( "a" ) . not_equal_to ( Datum :: string ( "foo" ) ) ) ;
1378+
1379+ // Table data: [NULL, "foo", "bar"]
1380+ let data_for_col_a = vec ! [ None , Some ( "foo" . to_string( ) ) , Some ( "bar" . to_string( ) ) ] ;
1381+
1382+ // Expected: ["bar"].
1383+ let expected = vec ! [ Some ( "bar" . to_string( ) ) ] ;
1384+
1385+ let ( file_io, schema, table_location, _temp_dir) = setup_kleene_logic ( data_for_col_a) ;
1386+ let reader = ArrowReaderBuilder :: new ( file_io) . build ( ) ;
1387+
1388+ let result_data = test_perform_read ( predicate, schema, table_location, reader) . await ;
1389+
1390+ assert_eq ! ( result_data, expected) ;
1391+ }
1392+
1393+ async fn test_perform_read (
1394+ predicate : Predicate ,
1395+ schema : SchemaRef ,
1396+ table_location : String ,
1397+ reader : ArrowReader ,
1398+ ) -> Vec < Option < String > > {
1399+ let tasks = Box :: pin ( futures:: stream:: iter (
1400+ vec ! [ Ok ( FileScanTask {
1401+ start: 0 ,
1402+ length: 0 ,
1403+ record_count: None ,
1404+ data_file_path: format!( "{}/1.parquet" , table_location) ,
1405+ data_file_content: DataContentType :: Data ,
1406+ data_file_format: DataFileFormat :: Parquet ,
1407+ schema: schema. clone( ) ,
1408+ project_field_ids: vec![ 1 ] ,
1409+ predicate: Some ( predicate. bind( schema, true ) . unwrap( ) ) ,
1410+ deletes: vec![ ] ,
1411+ } ) ]
1412+ . into_iter ( ) ,
1413+ ) ) as FileScanTaskStream ;
1414+
1415+ let result = reader
1416+ . read ( tasks)
1417+ . await
1418+ . unwrap ( )
1419+ . try_collect :: < Vec < RecordBatch > > ( )
1420+ . await
1421+ . unwrap ( ) ;
1422+
1423+ let result_data = result[ 0 ] . columns ( ) [ 0 ]
1424+ . as_string_opt :: < i32 > ( )
1425+ . unwrap ( )
1426+ . iter ( )
1427+ . map ( |v| v. map ( ToOwned :: to_owned) )
1428+ . collect :: < Vec < _ > > ( ) ;
1429+
1430+ result_data
1431+ }
1432+
1433+ fn setup_kleene_logic (
1434+ data_for_col_a : Vec < Option < String > > ,
1435+ ) -> ( FileIO , SchemaRef , String , TempDir ) {
1436+ let schema = Arc :: new (
1437+ Schema :: builder ( )
1438+ . with_schema_id ( 1 )
1439+ . with_fields ( vec ! [ NestedField :: optional(
1440+ 1 ,
1441+ "a" ,
1442+ Type :: Primitive ( PrimitiveType :: String ) ,
1443+ )
1444+ . into( ) ] )
1445+ . build ( )
1446+ . unwrap ( ) ,
1447+ ) ;
1448+
1449+ let arrow_schema = Arc :: new ( ArrowSchema :: new ( vec ! [ Field :: new(
1450+ "a" ,
1451+ DataType :: Utf8 ,
1452+ true ,
1453+ )
1454+ . with_metadata( HashMap :: from( [ (
1455+ PARQUET_FIELD_ID_META_KEY . to_string( ) ,
1456+ "1" . to_string( ) ,
1457+ ) ] ) ) ] ) ) ;
1458+
1459+ let tmp_dir = TempDir :: new ( ) . unwrap ( ) ;
1460+ let table_location = tmp_dir. path ( ) . to_str ( ) . unwrap ( ) . to_string ( ) ;
1461+
1462+ let file_io = FileIO :: from_path ( & table_location) . unwrap ( ) . build ( ) . unwrap ( ) ;
1463+
1464+ let col = Arc :: new ( StringArray :: from ( data_for_col_a) ) as ArrayRef ;
1465+
1466+ let to_write = RecordBatch :: try_new ( arrow_schema. clone ( ) , vec ! [ col] ) . unwrap ( ) ;
1467+
1468+ // Write the Parquet files
1469+ let props = WriterProperties :: builder ( )
1470+ . set_compression ( Compression :: SNAPPY )
1471+ . build ( ) ;
1472+
1473+ let file = File :: create ( format ! ( "{}/1.parquet" , & table_location) ) . unwrap ( ) ;
1474+ let mut writer =
1475+ ArrowWriter :: try_new ( file, to_write. schema ( ) , Some ( props. clone ( ) ) ) . unwrap ( ) ;
1476+
1477+ writer. write ( & to_write) . expect ( "Writing batch" ) ;
1478+
1479+ // writer must be closed to write footer
1480+ writer. close ( ) . unwrap ( ) ;
1481+
1482+ ( file_io, schema, table_location, tmp_dir)
1483+ }
13391484}
0 commit comments