Skip to content

Commit 02da220

Browse files
Merge branch 'develop' into operators-bench
2 parents 01e7687 + 4f0c43a commit 02da220

File tree

12 files changed

+126
-29
lines changed

12 files changed

+126
-29
lines changed

docs/specs/file-format.md

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -117,26 +117,3 @@ The plan is that at write-time, a minimum supported reader version is declared.
117117
reader version can then be embedded into the file with WebAssembly decompression logic. Old readers are able to decompress new
118118
data (slower than native code, but still with SIMD acceleration) and read the file. New readers are able to make the best use of
119119
these encodings with native decompression logic and additional push-down compute functions (which also provides an incentive to upgrade).
120-
121-
## File Determinism and Reproducibility
122-
123-
### Encoding Order Indeterminism
124-
125-
When writing Vortex files, each array segment references its encoding via an integer index into the footer's `array_specs`
126-
list. During serialization, encodings are registered in the order they are first encountered via calls to
127-
`ArrayContext::encoding_idx()`. With concurrent writes, this encounter order depends on thread scheduling and lock
128-
acquisition timing, making the ordering in the footer non-deterministic between runs.
129-
130-
This affects the `encoding` field in each serialized array segment. The same encoding might receive index 0 in one run and
131-
index 1 in another, changing the integer value stored in each array segment that uses that encoding. FlatBuffers optimize
132-
storage by omitting fields with default values (such as 0), so when an encoding index is 0, the field may be omitted from
133-
the serialized representation. This saves approximately 2 bytes per affected array segment, and with alignment adjustments,
134-
can result in up to 4 bytes difference per array segment between runs.
135-
136-
:::{note}
137-
Despite this non-determinism, the practical impact is minimal:
138-
139-
- File size may vary by up to 4 bytes per affected array segment
140-
- All file contents remain semantically identical and fully readable
141-
- Segment ordering (the actual data layout) remains deterministic and consistent across writes
142-
:::

encodings/datetime-parts/src/array.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ use vortex_error::vortex_bail;
4040
use vortex_error::vortex_ensure;
4141
use vortex_error::vortex_err;
4242

43+
use crate::compute::rules::PARENT_RULES;
44+
4345
vtable!(DateTimeParts);
4446

4547
#[derive(Clone, prost::Message)]
@@ -159,6 +161,14 @@ impl VTable for DateTimePartsVTable {
159161

160162
Ok(())
161163
}
164+
165+
fn reduce_parent(
166+
array: &Self::Array,
167+
parent: &ArrayRef,
168+
child_idx: usize,
169+
) -> VortexResult<Option<ArrayRef>> {
170+
PARENT_RULES.evaluate(array, parent, child_idx)
171+
}
162172
}
163173

164174
#[derive(Clone, Debug)]

encodings/datetime-parts/src/compute/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ mod compare;
66
mod filter;
77
mod is_constant;
88
mod mask;
9+
pub(super) mod rules;
910
mod take;
1011

1112
#[cfg(test)]
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
use vortex_array::ArrayRef;
5+
use vortex_array::IntoArray;
6+
use vortex_array::arrays::ConstantArray;
7+
use vortex_array::arrays::ConstantVTable;
8+
use vortex_array::arrays::FilterArray;
9+
use vortex_array::arrays::FilterVTable;
10+
use vortex_array::matchers::Exact;
11+
use vortex_array::optimizer::ArrayOptimizer;
12+
use vortex_array::optimizer::rules::ArrayParentReduceRule;
13+
use vortex_array::optimizer::rules::ParentRuleSet;
14+
use vortex_error::VortexExpect;
15+
use vortex_error::VortexResult;
16+
17+
use crate::DateTimePartsArray;
18+
use crate::DateTimePartsVTable;
19+
20+
pub(crate) const PARENT_RULES: ParentRuleSet<DateTimePartsVTable> =
21+
ParentRuleSet::new(&[ParentRuleSet::lift(&DTPFilterPushDownRule)]);
22+
23+
/// Push the filter into the days column of a date time parts, we could extend this to other fields
24+
/// but its less clear if that is beneficial.
25+
#[derive(Debug)]
26+
struct DTPFilterPushDownRule;
27+
28+
impl ArrayParentReduceRule<DateTimePartsVTable> for DTPFilterPushDownRule {
29+
type Parent = Exact<FilterVTable>;
30+
31+
fn parent(&self) -> Self::Parent {
32+
Exact::from(&FilterVTable)
33+
}
34+
35+
fn reduce_parent(
36+
&self,
37+
child: &DateTimePartsArray,
38+
parent: &FilterArray,
39+
child_idx: usize,
40+
) -> VortexResult<Option<ArrayRef>> {
41+
if child_idx != 0 {
42+
return Ok(None);
43+
}
44+
45+
if !child.seconds().is::<ConstantVTable>() || !child.subseconds().is::<ConstantVTable>() {
46+
return Ok(None);
47+
}
48+
49+
DateTimePartsArray::try_new(
50+
child.dtype().clone(),
51+
FilterArray::new(child.days().clone(), parent.filter_mask().clone())
52+
.into_array()
53+
.optimize()?,
54+
ConstantArray::new(
55+
child.seconds().as_constant().vortex_expect("constant"),
56+
parent.filter_mask().true_count(),
57+
)
58+
.into_array(),
59+
ConstantArray::new(
60+
child.subseconds().as_constant().vortex_expect("constant"),
61+
parent.filter_mask().true_count(),
62+
)
63+
.into_array(),
64+
)
65+
.map(|x| Some(x.into_array()))
66+
}
67+
}

vortex-array/src/arrays/datetime/mod.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,12 @@ impl TemporalArray {
201201
}
202202
}
203203

204+
impl AsRef<dyn Array> for TemporalArray {
205+
fn as_ref(&self) -> &dyn Array {
206+
self.ext.as_ref()
207+
}
208+
}
209+
204210
impl From<TemporalArray> for ArrayRef {
205211
fn from(value: TemporalArray) -> Self {
206212
value.ext.into_array()

vortex-array/src/context.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,15 @@ impl<T: Clone + Eq> VTableContext<T> {
2626
Self(Arc::new(RwLock::new(encodings)))
2727
}
2828

29+
pub fn from_registry_sorted(registry: &Registry<T>) -> Self
30+
where
31+
T: Display,
32+
{
33+
let mut encodings: Vec<T> = registry.items().collect();
34+
encodings.sort_by_key(|a| a.to_string());
35+
Self::new(encodings)
36+
}
37+
2938
pub fn try_from_registry<'a>(
3039
registry: &Registry<T>,
3140
ids: impl IntoIterator<Item = &'a str>,

vortex-btrblocks/benches/dict_encode.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ fn make_array() -> PrimitiveArray {
2828
PrimitiveArray::new(values, Validity::Array(nulls))
2929
}
3030

31+
#[cfg(not(codspeed))]
3132
#[divan::bench]
3233
fn encode_generic(bencher: Bencher) {
3334
let array = make_array().into_array();
@@ -36,6 +37,7 @@ fn encode_generic(bencher: Bencher) {
3637
.bench_refs(|array| dict_encode(array.as_ref()).unwrap());
3738
}
3839

40+
#[cfg(not(codspeed))]
3941
#[divan::bench]
4042
fn encode_specialized(bencher: Bencher) {
4143
let stats = IntegerStats::generate(&make_array());

vortex-btrblocks/src/lib.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,12 +38,14 @@ use vortex_array::ArrayRef;
3838
use vortex_array::Canonical;
3939
use vortex_array::IntoArray;
4040
use vortex_array::ToCanonical;
41+
use vortex_array::arrays::ConstantArray;
4142
use vortex_array::arrays::ExtensionArray;
4243
use vortex_array::arrays::FixedSizeListArray;
4344
use vortex_array::arrays::ListArray;
4445
use vortex_array::arrays::StructArray;
4546
use vortex_array::arrays::TemporalArray;
4647
use vortex_array::arrays::list_from_list_view;
48+
use vortex_array::compute::Cost;
4749
use vortex_array::vtable::VTable;
4850
use vortex_array::vtable::ValidityHelper;
4951
use vortex_dtype::DType;
@@ -475,6 +477,13 @@ impl BtrBlocksCompressor {
475477
if let Ok(temporal_array) = TemporalArray::try_from(ext_array.to_array())
476478
&& let TemporalMetadata::Timestamp(..) = temporal_array.temporal_metadata()
477479
{
480+
if temporal_array.as_ref().is_constant_opts(Cost::Canonicalize) {
481+
return Ok(ConstantArray::new(
482+
temporal_array.as_ref().scalar_at(0),
483+
ext_array.len(),
484+
)
485+
.into_array());
486+
}
478487
return compress_temporal(temporal_array);
479488
}
480489

vortex-duckdb/src/convert/scalar.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -201,8 +201,11 @@ impl ToDuckDBScalar for ExtScalar<'_> {
201201
_ => vortex_bail!("cannot have TimeUnit {unit}, so represent a day"),
202202
},
203203
TemporalMetadata::Timestamp(unit, tz) => {
204-
if tz.is_some() {
205-
todo!("timezones to duckdb scalar")
204+
if let Some(tz) = tz {
205+
if tz != "UTC" {
206+
todo!()
207+
}
208+
return Ok(Value::new_timestamp_tz(value()?));
206209
}
207210
match unit {
208211
TimeUnit::Nanoseconds => Ok(Value::new_timestamp_ns(value()?)),

vortex-duckdb/src/duckdb/value.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,14 @@ impl Value {
197197
}
198198
}
199199

200+
pub fn new_timestamp_tz(micros: i64) -> Self {
201+
unsafe {
202+
Self::own(cpp::duckdb_create_timestamp_tz(cpp::duckdb_timestamp {
203+
micros,
204+
}))
205+
}
206+
}
207+
200208
pub fn new_timestamp_ns(nanos: i64) -> Self {
201209
unsafe {
202210
Self::own(cpp::duckdb_create_timestamp_ns(cpp::duckdb_timestamp_ns {

0 commit comments

Comments
 (0)