diff --git a/Cargo.lock b/Cargo.lock
index 508bc4bb84c..782b2b643de 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6715,6 +6715,7 @@ dependencies = [
 name = "vortex-expr"
 version = "0.1.0"
 dependencies = [
+ "arbitrary",
  "dyn-hash",
  "itertools 0.14.0",
  "prost",
@@ -6859,6 +6860,7 @@ dependencies = [
  "vortex-buffer",
  "vortex-dtype",
  "vortex-error",
+ "vortex-expr",
  "vortex-file",
  "vortex-mask",
  "vortex-scalar",
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
index 5d46212b224..ae73c96aa60 100644
--- a/fuzz/Cargo.toml
+++ b/fuzz/Cargo.toml
@@ -30,6 +30,7 @@ vortex-btrblocks = { workspace = true }
 vortex-buffer = { workspace = true }
 vortex-dtype = { workspace = true, features = ["arbitrary"] }
 vortex-error = { workspace = true }
+vortex-expr = { workspace = true, features = ["arbitrary"] }
 vortex-file = { workspace = true }
 vortex-mask = { workspace = true }
 vortex-scalar = { workspace = true, features = ["arbitrary"] }
diff --git a/fuzz/fuzz_targets/file_io.rs b/fuzz/fuzz_targets/file_io.rs
index 1f7580ee47b..405bd94504b 100644
--- a/fuzz/fuzz_targets/file_io.rs
+++ b/fuzz/fuzz_targets/file_io.rs
@@ -6,19 +6,25 @@ use arrow_ord::sort::SortOptions;
 use futures_util::TryStreamExt;
 use libfuzzer_sys::{Corpus, fuzz_target};
 use vortex_array::arrays::ChunkedArray;
-use vortex_array::arrays::arbitrary::ArbitraryArray;
 use vortex_array::arrow::IntoArrowArray;
 use vortex_array::compute::{Operator, compare};
 use vortex_array::{Array, ArrayRef, Canonical, IntoArray, ToCanonical};
 use vortex_buffer::ByteBufferMut;
 use vortex_dtype::{DType, StructFields};
 use vortex_error::{VortexExpect, VortexUnwrap, vortex_panic};
+use vortex_expr::root;
 use vortex_file::{VortexOpenOptions, VortexWriteOptions};
+use vortex_fuzz::FuzzFileAction;
 use vortex_utils::aliases::DefaultHashBuilder;
 use vortex_utils::aliases::hash_set::HashSet;
 
-fuzz_target!(|array_data: ArbitraryArray| -> Corpus {
-    let array_data = array_data.0;
+fuzz_target!(|fuzz: FuzzFileAction| -> Corpus {
+    let FuzzFileAction {
+        array,
+        projection,
+        filter,
+    } = fuzz;
+    let array_data = array;
 
     if has_nullable_struct(array_data.dtype()) || has_duplicate_field_names(array_data.dtype()) {
         return Corpus::Reject;
@@ -41,6 +47,8 @@ fuzz_target!(|array_data: ArbitraryArray| -> Corpus {
         .vortex_unwrap()
         .scan()
         .vortex_unwrap()
+        .with_projection(projection.unwrap_or_else(root))
+        .with_some_filter(filter)
         .into_array_stream()
         .vortex_unwrap()
         .try_collect::<Vec<_>>()
diff --git a/fuzz/src/lib.rs b/fuzz/src/lib.rs
index 7120d2eaadd..a86307a872c 100644
--- a/fuzz/src/lib.rs
+++ b/fuzz/src/lib.rs
@@ -25,6 +25,8 @@ use vortex_array::{Array, ArrayRef, IntoArray};
 use vortex_btrblocks::BtrBlocksCompressor;
 use vortex_dtype::{DType, Nullability};
 use vortex_error::{VortexExpect, VortexUnwrap, vortex_panic};
+use vortex_expr::ExprRef;
+use vortex_expr::arbitrary::{filter_expr, projection_expr};
 use vortex_mask::Mask;
 use vortex_scalar::Scalar;
 use vortex_scalar::arbitrary::random_scalar;
@@ -253,3 +255,27 @@ fn actions_for_dtype(dtype: &DType) -> HashSet {
         _ => ALL_ACTIONS.collect(),
     }
 }
+
+/// Input for the file I/O fuzz target: an array plus optional scan expressions.
+#[derive(Debug)]
+pub struct FuzzFileAction {
+    /// Randomly generated array.
+    pub array: ArrayRef,
+    /// Projection generated from the array's dtype; `None` when the dtype is not a struct.
+    pub projection: Option<ExprRef>,
+    /// Filter generated from the array's dtype, if any.
+    pub filter: Option<ExprRef>,
+}
+
+impl<'a> Arbitrary<'a> for FuzzFileAction {
+    /// Draws an arbitrary array, then derives the projection and filter from its dtype.
+    fn arbitrary(u: &mut Unstructured<'a>) -> Result<Self> {
+        let array = ArbitraryArray::arbitrary(u)?.0;
+        let dtype = array.dtype().clone();
+        Ok(FuzzFileAction {
+            array,
+            projection: projection_expr(u, &dtype)?,
+            filter: filter_expr(u, &dtype)?,
+        })
+    }
+}
diff --git a/vortex-expr/Cargo.toml b/vortex-expr/Cargo.toml
index 04980404a77..de3c62ecb77 100644
--- a/vortex-expr/Cargo.toml
+++ b/vortex-expr/Cargo.toml
@@ -17,6 +17,7 @@ version = { workspace = true }
 workspace = true
 
 [dependencies]
+arbitrary = { workspace = true, optional = true }
 dyn-hash = { workspace = true }
 itertools = { workspace = true }
 prost = { workspace = true, optional = true }
@@ -34,6 +35,11 @@ vortex-utils = { workspace = true }
 vortex-expr = { path = ".", features = ["test-harness"] }
 
 [features]
+arbitrary = [
+  "dep:arbitrary",
+  "vortex-scalar/arbitrary",
+  "vortex-dtype/arbitrary",
+]
 serde = ["dep:serde", "vortex-dtype/serde", "vortex-error/serde"]
 proto = ["vortex-proto/expr", "vortex-error/prost", "dep:prost", "serde"]
 test-harness = []
diff --git a/vortex-expr/src/arbitrary.rs b/vortex-expr/src/arbitrary.rs
new file mode 100644
index 00000000000..4e6487764fb
--- /dev/null
+++ b/vortex-expr/src/arbitrary.rs
@@ -0,0 +1,75 @@
+use std::cmp::max;
+
+use arbitrary::{Result as AResult, Unstructured};
+use vortex_dtype::{DType, FieldName};
+use vortex_scalar::arbitrary::random_scalar;
+
+use crate::{BinaryExpr, ExprRef, Operator, and_collect, get_item_scope, lit, pack};
+
+/// Builds a random projection expression over the fields of `dtype`.
+///
+/// Returns `Ok(None)` when `dtype` is not a struct. Otherwise draws between 0 and
+/// `max(nfields, 10)` field names (repeats are possible) and packs one `get_item`
+/// expression per drawn name.
+pub fn projection_expr(u: &mut Unstructured<'_>, dtype: &DType) -> AResult<Option<ExprRef>> {
+    let Some(struct_dtype) = dtype.as_struct() else {
+        return Ok(None);
+    };
+
+    let column_count = u.int_in_range::<usize>(0..=max(struct_dtype.nfields(), 10))?;
+
+    let cols = (0..column_count)
+        .map(|_| {
+            let get_item = u.choose(struct_dtype.names().iter().as_slice())?;
+            Ok((get_item.clone(), get_item_scope(get_item.clone())))
+        })
+        .collect::<AResult<Vec<_>>>()?;
+
+    Ok(Some(pack(cols, u.arbitrary()?)))
+}
+
+/// Builds a random filter expression over the fields of `dtype`.
+///
+/// Returns `Ok(None)` when `dtype` is not a struct. Otherwise draws between 0 and
+/// `max(nfields, 10)` comparisons of a field against a random scalar of that field's
+/// dtype and combines them with `and_collect`.
+pub fn filter_expr(u: &mut Unstructured<'_>, dtype: &DType) -> AResult<Option<ExprRef>> {
+    let Some(struct_dtype) = dtype.as_struct() else {
+        return Ok(None);
+    };
+
+    let filter_count = u.int_in_range::<usize>(0..=max(struct_dtype.nfields(), 10))?;
+
+    let filters = (0..filter_count)
+        .map(|_| {
+            let (col, dtype) =
+                u.choose_iter(struct_dtype.names().iter().zip(struct_dtype.fields()))?;
+            random_comparison(u, col, &dtype)
+        })
+        .collect::<AResult<Vec<_>>>()?;
+
+    Ok(and_collect(filters))
+}
+
+/// Compares column `col` against a random scalar of `dtype` using a random comparison operator.
+fn random_comparison(u: &mut Unstructured<'_>, col: &FieldName, dtype: &DType) -> AResult<ExprRef> {
+    let scalar = random_scalar(u, dtype)?;
+    Ok(BinaryExpr::new_expr(
+        get_item_scope(col.clone()),
+        arbitrary_comparison_operator(u)?,
+        lit(scalar),
+    ))
+}
+
+/// Picks one of the six comparison operators.
+fn arbitrary_comparison_operator(u: &mut Unstructured<'_>) -> AResult<Operator> {
+    Ok(match u.int_in_range(0..=5)? {
+        0 => Operator::Eq,
+        1 => Operator::NotEq,
+        2 => Operator::Gt,
+        3 => Operator::Gte,
+        4 => Operator::Lt,
+        5 => Operator::Lte,
+        _ => unreachable!("range 0..=5"),
+    })
+}
diff --git a/vortex-expr/src/lib.rs b/vortex-expr/src/lib.rs
index 6546e32c981..0b732970aed 100644
--- a/vortex-expr/src/lib.rs
+++ b/vortex-expr/src/lib.rs
@@ -8,6 +8,8 @@ use dyn_hash::DynHash;
 mod binary;
 
 mod analysis;
+#[cfg(feature = "arbitrary")]
+pub mod arbitrary;
 mod between;
 mod cast;
 mod field;