Skip to content

Commit b29440a

Browse files
authored
Feature: Add is_subset_of and is_superset_of, and validate extend_from_array (vortex-data#4445)
This will be helpful in my refactoring of the `builders` module in `vortex-array` (in draft right now: vortex-data#4440). Note that `extend_from_array` is called in many non-test places yet is not validated at all. Given that this is a public method, this is a bug waiting to happen... I have an interesting thought, though: should a `DType::Primitive(U64, nullable)` be a superset of `DType::Primitive(U16, non-nullable)`? In theory, we should be able to extend an array builder of nullable `U64` with an array of non-nullable `U8`. I'll leave that to a separate PR since this idea might have some other implications. --------- Signed-off-by: Connor Tsui <[email protected]>
1 parent d84d0c5 commit b29440a

File tree

11 files changed

+337
-254
lines changed

11 files changed

+337
-254
lines changed

vortex-array/src/arrays/arbitrary.rs

Lines changed: 90 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ use builders::ListBuilder;
1010
use vortex_buffer::Buffer;
1111
use vortex_dtype::{DType, NativePType, Nullability, PType};
1212
use vortex_error::{VortexExpect, VortexUnwrap};
13-
use vortex_scalar::arbitrary::{random_decimal, random_scalar};
13+
use vortex_scalar::arbitrary::random_scalar;
1414
use vortex_scalar::{Scalar, match_each_decimal_value_type};
1515

1616
use super::{
@@ -47,90 +47,14 @@ fn split_number_into_parts(n: usize, parts: usize) -> Vec<usize> {
4747
.collect()
4848
}
4949

50+
/// Creates a random array with a random number of chunks.
5051
fn random_array(u: &mut Unstructured, dtype: &DType, len: Option<usize>) -> Result<ArrayRef> {
5152
let num_chunks = u.int_in_range(1..=3)?;
5253
let chunk_lens = len.map(|l| split_number_into_parts(l, num_chunks));
5354
let mut chunks = (0..num_chunks)
5455
.map(|i| {
5556
let chunk_len = chunk_lens.as_ref().map(|c| c[i]);
56-
match dtype {
57-
DType::Null => Ok(NullArray::new(
58-
chunk_len
59-
.map(Ok)
60-
.unwrap_or_else(|| u.int_in_range(0..=100))?,
61-
)
62-
.into_array()),
63-
DType::Bool(n) => random_bool(u, *n, chunk_len),
64-
DType::Primitive(ptype, n) => match ptype {
65-
PType::U8 => random_primitive::<u8>(u, *n, chunk_len),
66-
PType::U16 => random_primitive::<u16>(u, *n, chunk_len),
67-
PType::U32 => random_primitive::<u32>(u, *n, chunk_len),
68-
PType::U64 => random_primitive::<u64>(u, *n, chunk_len),
69-
PType::I8 => random_primitive::<i8>(u, *n, chunk_len),
70-
PType::I16 => random_primitive::<i16>(u, *n, chunk_len),
71-
PType::I32 => random_primitive::<i32>(u, *n, chunk_len),
72-
PType::I64 => random_primitive::<i64>(u, *n, chunk_len),
73-
PType::F16 => Ok(random_primitive::<u16>(u, *n, chunk_len)?
74-
.to_primitive()
75-
.vortex_unwrap()
76-
.reinterpret_cast(PType::F16)
77-
.into_array()),
78-
PType::F32 => random_primitive::<f32>(u, *n, chunk_len),
79-
PType::F64 => random_primitive::<f64>(u, *n, chunk_len),
80-
},
81-
DType::Decimal(decimal, n) => {
82-
let elem_len = chunk_len.unwrap_or(u.int_in_range(0..=20)?);
83-
match_each_decimal_value_type!(smallest_storage_type(decimal), |DVT| {
84-
let mut builder =
85-
DecimalBuilder::new::<DVT>(decimal.precision(), decimal.scale(), *n);
86-
for _i in 0..elem_len {
87-
builder
88-
.append_scalar_value(random_decimal(u, decimal)?)
89-
.vortex_unwrap();
90-
}
91-
Ok(builder.finish())
92-
})
93-
}
94-
DType::Utf8(n) => random_string(u, *n, chunk_len),
95-
DType::Binary(n) => random_bytes(u, *n, chunk_len),
96-
DType::Struct(sdt, n) => {
97-
let first_array = sdt
98-
.fields()
99-
.next()
100-
.map(|d| random_array(u, &d, chunk_len))
101-
.transpose()?;
102-
let resolved_len = first_array
103-
.as_ref()
104-
.map(|a| a.len())
105-
.or(chunk_len)
106-
.map(Ok)
107-
.unwrap_or_else(|| u.int_in_range(0..=100))?;
108-
let children = first_array
109-
.into_iter()
110-
.map(Ok)
111-
.chain(
112-
sdt.fields()
113-
.skip(1)
114-
.map(|d| random_array(u, &d, Some(resolved_len))),
115-
)
116-
.collect::<Result<Vec<_>>>()?;
117-
Ok(StructArray::try_new(
118-
sdt.names().clone(),
119-
children,
120-
resolved_len,
121-
random_validity(u, *n, resolved_len)?,
122-
)
123-
.vortex_unwrap()
124-
.into_array())
125-
}
126-
DType::List(ldt, n) => random_list(u, ldt, *n, chunk_len),
127-
DType::FixedSizeList(..) => {
128-
unimplemented!("TODO(connor)[FixedSizeList]: Create canonical fixed-size list")
129-
}
130-
DType::Extension(..) => {
131-
todo!("Extension arrays are not implemented")
132-
}
133-
}
57+
random_array_chunk(u, dtype, chunk_len)
13458
})
13559
.collect::<Result<Vec<_>>>()?;
13660

@@ -144,6 +68,93 @@ fn random_array(u: &mut Unstructured, dtype: &DType, len: Option<usize>) -> Resu
14468
}
14569
}
14670

71+
/// Creates a random array chunk.
72+
fn random_array_chunk(
73+
u: &mut Unstructured<'_>,
74+
dtype: &DType,
75+
chunk_len: Option<usize>,
76+
) -> std::result::Result<Arc<dyn Array + 'static>, arbitrary::Error> {
77+
match dtype {
78+
DType::Null => Ok(NullArray::new(
79+
chunk_len
80+
.map(Ok)
81+
.unwrap_or_else(|| u.int_in_range(0..=100))?,
82+
)
83+
.into_array()),
84+
DType::Bool(n) => random_bool(u, *n, chunk_len),
85+
DType::Primitive(ptype, n) => match ptype {
86+
PType::U8 => random_primitive::<u8>(u, *n, chunk_len),
87+
PType::U16 => random_primitive::<u16>(u, *n, chunk_len),
88+
PType::U32 => random_primitive::<u32>(u, *n, chunk_len),
89+
PType::U64 => random_primitive::<u64>(u, *n, chunk_len),
90+
PType::I8 => random_primitive::<i8>(u, *n, chunk_len),
91+
PType::I16 => random_primitive::<i16>(u, *n, chunk_len),
92+
PType::I32 => random_primitive::<i32>(u, *n, chunk_len),
93+
PType::I64 => random_primitive::<i64>(u, *n, chunk_len),
94+
PType::F16 => Ok(random_primitive::<u16>(u, *n, chunk_len)?
95+
.to_primitive()
96+
.vortex_unwrap()
97+
.reinterpret_cast(PType::F16)
98+
.into_array()),
99+
PType::F32 => random_primitive::<f32>(u, *n, chunk_len),
100+
PType::F64 => random_primitive::<f64>(u, *n, chunk_len),
101+
},
102+
DType::Decimal(decimal, n) => {
103+
let elem_len = chunk_len.unwrap_or(u.int_in_range(0..=20)?);
104+
match_each_decimal_value_type!(smallest_storage_type(decimal), |DVT| {
105+
let mut builder =
106+
DecimalBuilder::new::<DVT>(decimal.precision(), decimal.scale(), *n);
107+
for _i in 0..elem_len {
108+
let random_decimal = random_scalar(u, &DType::Decimal(*decimal, *n))?;
109+
builder.append_scalar(&random_decimal).vortex_expect(
110+
"was somehow unable to append a decimal to a decimal builder",
111+
);
112+
}
113+
Ok(builder.finish())
114+
})
115+
}
116+
DType::Utf8(n) => random_string(u, *n, chunk_len),
117+
DType::Binary(n) => random_bytes(u, *n, chunk_len),
118+
DType::Struct(sdt, n) => {
119+
let first_array = sdt
120+
.fields()
121+
.next()
122+
.map(|d| random_array(u, &d, chunk_len))
123+
.transpose()?;
124+
let resolved_len = first_array
125+
.as_ref()
126+
.map(|a| a.len())
127+
.or(chunk_len)
128+
.map(Ok)
129+
.unwrap_or_else(|| u.int_in_range(0..=100))?;
130+
let children = first_array
131+
.into_iter()
132+
.map(Ok)
133+
.chain(
134+
sdt.fields()
135+
.skip(1)
136+
.map(|d| random_array(u, &d, Some(resolved_len))),
137+
)
138+
.collect::<Result<Vec<_>>>()?;
139+
Ok(StructArray::try_new(
140+
sdt.names().clone(),
141+
children,
142+
resolved_len,
143+
random_validity(u, *n, resolved_len)?,
144+
)
145+
.vortex_unwrap()
146+
.into_array())
147+
}
148+
DType::List(ldt, n) => random_list(u, ldt, *n, chunk_len),
149+
DType::FixedSizeList(..) => {
150+
unimplemented!("TODO(connor)[FixedSizeList]: Create canonical fixed-size list")
151+
}
152+
DType::Extension(..) => {
153+
todo!("Extension arrays are not implemented")
154+
}
155+
}
156+
}
157+
147158
fn random_list(
148159
u: &mut Unstructured,
149160
ldt: &Arc<DType>,

vortex-array/src/builders/bool.rs

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,13 @@ use std::any::Any;
55

66
use arrow_buffer::BooleanBufferBuilder;
77
use vortex_dtype::{DType, Nullability};
8-
use vortex_error::{VortexResult, vortex_bail};
8+
use vortex_error::{VortexExpect, VortexResult, vortex_bail};
99
use vortex_mask::Mask;
1010

1111
use crate::arrays::BoolArray;
12-
use crate::builders::ArrayBuilder;
1312
use crate::builders::lazy_validity_builder::LazyNullBufferBuilder;
14-
use crate::{Array, ArrayRef, Canonical, IntoArray};
13+
use crate::builders::{ArrayBuilder, DEFAULT_BUILDER_CAPACITY};
14+
use crate::{Array, ArrayRef, IntoArray, ToCanonical};
1515

1616
pub struct BoolBuilder {
1717
inner: BooleanBufferBuilder,
@@ -22,7 +22,7 @@ pub struct BoolBuilder {
2222

2323
impl BoolBuilder {
2424
/// Creates a new `BoolBuilder` with the given `nullability`.
///
/// Delegates to [`Self::with_capacity`] using the default capacity: the shared
/// `DEFAULT_BUILDER_CAPACITY` constant after this change, previously the literal `1024`.
pub fn new(nullability: Nullability) -> Self {
25-
Self::with_capacity(nullability, 1024) // Same as Arrow builders
25+
Self::with_capacity(nullability, DEFAULT_BUILDER_CAPACITY)
2626
}
2727

2828
pub fn with_capacity(nullability: Nullability, capacity: usize) -> Self {
@@ -78,13 +78,20 @@ impl ArrayBuilder for BoolBuilder {
7878
}
7979

8080
fn extend_from_array(&mut self, array: &dyn Array) -> VortexResult<()> {
81-
let array = array.to_canonical()?;
82-
let Canonical::Bool(array) = array else {
83-
vortex_bail!("Expected Canonical::Bool, found {:?}", array);
84-
};
81+
if !self.dtype.eq_with_nullability_superset(array.dtype()) {
82+
vortex_bail!(
83+
"tried to extend a builder with `DType` {} with an array with `DType {}",
84+
self.dtype,
85+
array.dtype()
86+
);
87+
}
88+
89+
let bool_array = array
90+
.to_bool()
91+
.vortex_expect("we checked that the array had `DType` boolean");
8592

86-
self.inner.append_buffer(array.boolean_buffer());
87-
self.nulls.append_validity_mask(array.validity_mask());
93+
self.inner.append_buffer(bool_array.boolean_buffer());
94+
self.nulls.append_validity_mask(bool_array.validity_mask());
8895

8996
Ok(())
9097
}

0 commit comments

Comments
 (0)