Skip to content

Commit 3612521

Browse files
committed
add more exclusion rules (beyond the old compressor)
Signed-off-by: Connor Tsui <connor.tsui20@gmail.com>
1 parent e4b8de5 commit 3612521

File tree

12 files changed

+193
-53
lines changed

12 files changed

+193
-53
lines changed

vortex-btrblocks/public-api.lock

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,8 @@ impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::integer::F
284284

285285
impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::integer::FoRScheme
286286

287+
pub fn vortex_btrblocks::schemes::integer::FoRScheme::ancestor_exclusions(&self) -> alloc::vec::Vec<vortex_compressor::scheme::AncestorExclusion>
288+
287289
pub fn vortex_btrblocks::schemes::integer::FoRScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult<vortex_array::array::ArrayRef>
288290

289291
pub fn vortex_btrblocks::schemes::integer::FoRScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult<f64>
@@ -468,6 +470,8 @@ impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::integer::Z
468470

469471
impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::integer::ZigZagScheme
470472

473+
pub fn vortex_btrblocks::schemes::integer::ZigZagScheme::ancestor_exclusions(&self) -> alloc::vec::Vec<vortex_compressor::scheme::AncestorExclusion>
474+
471475
pub fn vortex_btrblocks::schemes::integer::ZigZagScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult<vortex_array::array::ArrayRef>
472476

473477
pub fn vortex_btrblocks::schemes::integer::ZigZagScheme::descendant_exclusions(&self) -> alloc::vec::Vec<vortex_compressor::scheme::DescendantExclusion>

vortex-btrblocks/src/schemes/decimal.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ impl Scheme for DecimalScheme {
3636
matches!(canonical, Canonical::Decimal(_))
3737
}
3838

39+
/// Children: primitive=0.
3940
fn num_children(&self) -> usize {
4041
1
4142
}

vortex-btrblocks/src/schemes/float.rs

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -74,14 +74,6 @@ impl rle::RLEConfig for FloatRLEConfig {
7474
fn generate_stats(array: &ArrayRef) -> FloatStats {
7575
FloatStats::generate(&array.to_primitive())
7676
}
77-
78-
fn compress_values(
79-
compressor: &CascadingCompressor,
80-
values: &PrimitiveArray,
81-
ctx: CompressorContext,
82-
) -> VortexResult<ArrayRef> {
83-
compressor.compress_canonical(Canonical::Primitive(values.clone()), ctx)
84-
}
8577
}
8678

8779
impl RLEStats for FloatStats {
@@ -110,6 +102,7 @@ impl Scheme for ALPScheme {
110102
is_float_primitive(canonical)
111103
}
112104

105+
/// Children: encoded_ints=0.
113106
fn num_children(&self) -> usize {
114107
1
115108
}
@@ -220,11 +213,11 @@ impl Scheme for NullDominatedSparseScheme {
220213
is_float_primitive(canonical)
221214
}
222215

216+
/// Children: indices=0.
223217
fn num_children(&self) -> usize {
224218
1
225219
}
226220

227-
// TODO(connor): There seems to be stuff missing here...
228221
/// The indices of a null-dominated sparse array should not be sparse-encoded again.
229222
fn descendant_exclusions(&self) -> Vec<DescendantExclusion> {
230223
vec![DescendantExclusion {

vortex-btrblocks/src/schemes/integer.rs

Lines changed: 55 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -98,14 +98,6 @@ impl rle::RLEConfig for IntRLEConfig {
9898
fn generate_stats(array: &ArrayRef) -> IntegerStats {
9999
IntegerStats::generate(&array.to_primitive())
100100
}
101-
102-
fn compress_values(
103-
compressor: &CascadingCompressor,
104-
values: &PrimitiveArray,
105-
ctx: CompressorContext,
106-
) -> VortexResult<ArrayRef> {
107-
compressor.compress_canonical(Canonical::Primitive(values.clone()), ctx)
108-
}
109101
}
110102

111103
impl RLEStats for IntegerStats {
@@ -134,6 +126,24 @@ impl Scheme for FoRScheme {
134126
is_integer_primitive(canonical)
135127
}
136128

129+
/// Dict codes (child 1) always start at 0, so FoR (which subtracts the min) is a no-op.
130+
fn ancestor_exclusions(&self) -> Vec<AncestorExclusion> {
131+
vec![
132+
AncestorExclusion {
133+
ancestor: IntDictScheme.id(),
134+
children: ChildSelection::One(1),
135+
},
136+
AncestorExclusion {
137+
ancestor: FloatDictScheme.id(),
138+
children: ChildSelection::One(1),
139+
},
140+
AncestorExclusion {
141+
ancestor: StringDictScheme.id(),
142+
children: ChildSelection::One(1),
143+
},
144+
]
145+
}
146+
137147
fn expected_compression_ratio(
138148
&self,
139149
_compressor: &CascadingCompressor,
@@ -225,6 +235,7 @@ impl Scheme for ZigZagScheme {
225235
is_integer_primitive(canonical)
226236
}
227237

238+
/// Children: encoded=0.
228239
fn num_children(&self) -> usize {
229240
1
230241
}
@@ -249,6 +260,24 @@ impl Scheme for ZigZagScheme {
249260
]
250261
}
251262

263+
/// Dict codes (child 1) are unsigned integers (0..cardinality). ZigZag only helps negatives.
264+
fn ancestor_exclusions(&self) -> Vec<AncestorExclusion> {
265+
vec![
266+
AncestorExclusion {
267+
ancestor: IntDictScheme.id(),
268+
children: ChildSelection::One(1),
269+
},
270+
AncestorExclusion {
271+
ancestor: FloatDictScheme.id(),
272+
children: ChildSelection::One(1),
273+
},
274+
AncestorExclusion {
275+
ancestor: StringDictScheme.id(),
276+
children: ChildSelection::One(1),
277+
},
278+
]
279+
}
280+
252281
fn expected_compression_ratio(
253282
&self,
254283
compressor: &CascadingCompressor,
@@ -365,6 +394,7 @@ impl Scheme for SparseScheme {
365394
}
366395
}
367396

397+
/// Children: values=0, indices=1.
368398
fn num_children(&self) -> usize {
369399
2
370400
}
@@ -494,6 +524,7 @@ impl Scheme for RunEndScheme {
494524
is_integer_primitive(canonical)
495525
}
496526

527+
/// Children: values=0, ends=1.
497528
fn num_children(&self) -> usize {
498529
2
499530
}
@@ -507,12 +538,23 @@ impl Scheme for RunEndScheme {
507538
}]
508539
}
509540

510-
// TODO(connor): There seems to be stuff missing here...
541+
/// Dict values (child 0) are all unique by definition, so run-end encoding them is
542+
/// pointless. Codes (child 1) can have runs and may benefit from RunEnd.
511543
fn ancestor_exclusions(&self) -> Vec<AncestorExclusion> {
512-
vec![AncestorExclusion {
513-
ancestor: FloatDictScheme.id(),
514-
children: ChildSelection::One(0),
515-
}]
544+
vec![
545+
AncestorExclusion {
546+
ancestor: IntDictScheme.id(),
547+
children: ChildSelection::One(0),
548+
},
549+
AncestorExclusion {
550+
ancestor: FloatDictScheme.id(),
551+
children: ChildSelection::One(0),
552+
},
553+
AncestorExclusion {
554+
ancestor: StringDictScheme.id(),
555+
children: ChildSelection::One(0),
556+
},
557+
]
516558
}
517559

518560
fn expected_compression_ratio(

vortex-btrblocks/src/schemes/rle.rs

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@ use vortex_array::Canonical;
99
use vortex_array::IntoArray;
1010
use vortex_array::ToCanonical;
1111
use vortex_array::arrays::PrimitiveArray;
12+
use vortex_compressor::builtins::FloatDictScheme;
13+
use vortex_compressor::builtins::StringDictScheme;
14+
use vortex_compressor::scheme::AncestorExclusion;
1215
use vortex_compressor::scheme::ChildSelection;
1316
use vortex_compressor::scheme::DescendantExclusion;
1417
#[cfg(feature = "unstable_encodings")]
@@ -37,8 +40,6 @@ pub trait RLEStats {
3740
fn source(&self) -> &PrimitiveArray;
3841
}
3942

40-
// TODO(connor): This trait is super confusing, we should probably just remove it and hardcode the
41-
// only 2 implementations (integer and float).
4243
/// Configuration trait for RLE schemes.
4344
///
4445
/// Implement this trait to define the behavior of an RLE scheme for a specific
@@ -55,13 +56,6 @@ pub trait RLEConfig: Debug + Send + Sync + 'static {
5556

5657
/// Generates statistics for the given array.
5758
fn generate_stats(array: &ArrayRef) -> Self::Stats;
58-
59-
/// Compress the values array after RLE encoding.
60-
fn compress_values(
61-
compressor: &CascadingCompressor,
62-
values: &PrimitiveArray,
63-
ctx: CompressorContext,
64-
) -> VortexResult<ArrayRef>;
6559
}
6660

6761
/// RLE scheme that is generic over a configuration type.
@@ -92,6 +86,7 @@ impl<C: RLEConfig> Scheme for RLEScheme<C> {
9286
C::matches(canonical)
9387
}
9488

89+
/// Children: values=0, indices=1, offsets=2.
9590
fn num_children(&self) -> usize {
9691
3
9792
}
@@ -106,6 +101,24 @@ impl<C: RLEConfig> Scheme for RLEScheme<C> {
106101
}]
107102
}
108103

104+
/// Dict values (child 0) are all unique by definition, so RLE is pointless on them.
105+
fn ancestor_exclusions(&self) -> Vec<AncestorExclusion> {
106+
vec![
107+
AncestorExclusion {
108+
ancestor: IntDictScheme.id(),
109+
children: ChildSelection::One(0),
110+
},
111+
AncestorExclusion {
112+
ancestor: FloatDictScheme.id(),
113+
children: ChildSelection::One(0),
114+
},
115+
AncestorExclusion {
116+
ancestor: StringDictScheme.id(),
117+
children: ChildSelection::One(0),
118+
},
119+
]
120+
}
121+
109122
fn expected_compression_ratio(
110123
&self,
111124
compressor: &CascadingCompressor,

vortex-btrblocks/src/schemes/string.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ impl Scheme for FSSTScheme {
6262
is_utf8_string(canonical)
6363
}
6464

65+
/// Children: lengths=0, code_offsets=1.
6566
fn num_children(&self) -> usize {
6667
2
6768
}
@@ -124,11 +125,11 @@ impl Scheme for NullDominatedSparseScheme {
124125
is_utf8_string(canonical)
125126
}
126127

128+
/// Children: indices=0.
127129
fn num_children(&self) -> usize {
128130
1
129131
}
130132

131-
// TODO(connor): There seems to be stuff missing here...
132133
/// The indices of a null-dominated sparse array should not be sparse-encoded again.
133134
fn descendant_exclusions(&self) -> Vec<DescendantExclusion> {
134135
vec![

vortex-btrblocks/src/schemes/temporal.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ impl Scheme for TemporalScheme {
5353
true
5454
}
5555

56+
/// Children: days=0, seconds=1, subseconds=2.
5657
fn num_children(&self) -> usize {
5758
3
5859
}

vortex-compressor/public-api.lock

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -402,14 +402,6 @@ pub fn vortex_compressor::ctx::CompressorContext::fmt(&self, f: &mut core::fmt::
402402

403403
pub const vortex_compressor::ctx::MAX_CASCADE: usize
404404

405-
pub mod vortex_compressor::root_list_children
406-
407-
pub const vortex_compressor::root_list_children::ELEMENTS: usize
408-
409-
pub const vortex_compressor::root_list_children::OFFSETS: usize
410-
411-
pub const vortex_compressor::root_list_children::SIZES: usize
412-
413405
pub mod vortex_compressor::scheme
414406

415407
pub enum vortex_compressor::scheme::ChildSelection
@@ -1040,8 +1032,6 @@ impl vortex_compressor::CascadingCompressor
10401032

10411033
pub fn vortex_compressor::CascadingCompressor::compress(&self, array: &vortex_array::array::ArrayRef) -> vortex_error::VortexResult<vortex_array::array::ArrayRef>
10421034

1043-
pub fn vortex_compressor::CascadingCompressor::compress_canonical(&self, array: vortex_array::canonical::Canonical, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult<vortex_array::array::ArrayRef>
1044-
10451035
pub fn vortex_compressor::CascadingCompressor::compress_child(&self, child: &vortex_array::array::ArrayRef, parent_ctx: &vortex_compressor::ctx::CompressorContext, parent_id: vortex_compressor::scheme::SchemeId, child_index: usize) -> vortex_error::VortexResult<vortex_array::array::ArrayRef>
10461036

10471037
pub fn vortex_compressor::CascadingCompressor::execution_ctx(&self) -> parking_lot::mutex::MutexGuard<'_, vortex_array::executor::ExecutionCtx>

vortex-compressor/src/builtins/dict/mod.rs

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,9 @@ impl Scheme for IntDictScheme {
4747
}
4848
}
4949

50+
/// Children: values=0, codes=1.
5051
fn num_children(&self) -> usize {
51-
1
52+
2
5253
}
5354

5455
fn expected_compression_ratio(
@@ -142,11 +143,19 @@ impl Scheme for FloatDictScheme {
142143
}
143144
}
144145

146+
/// Children: values=0, codes=1.
145147
fn num_children(&self) -> usize {
146148
2
147149
}
148150

149-
// TODO(connor): There seems to be stuff missing here...
151+
/// Float dict codes (child 1) are compact unsigned integers that should not be
152+
/// dict-encoded again. Float dict values (child 0) flow through ALP into integer-land,
153+
/// where integer dict encoding is redundant since the values are already deduplicated at
154+
/// the float level.
155+
///
156+
/// Additional exclusions for codes (IntSequenceScheme, FoRScheme, ZigZagScheme,
157+
/// SparseScheme) and for values (RunEndScheme, RLE) are expressed as pull rules on those
158+
/// schemes in `vortex-btrblocks`.
150159
fn descendant_exclusions(&self) -> Vec<DescendantExclusion> {
151160
vec![
152161
DescendantExclusion {
@@ -235,11 +244,16 @@ impl Scheme for StringDictScheme {
235244
}
236245
}
237246

247+
/// Children: values=0, codes=1.
238248
fn num_children(&self) -> usize {
239249
2
240250
}
241251

242-
// TODO(connor): There seems to be stuff missing here...
252+
/// String dict codes (child 1) are compact unsigned integers that should not be dict-encoded
253+
/// again.
254+
///
255+
/// Additional exclusions for codes (IntSequenceScheme, FoRScheme, ZigZagScheme, SparseScheme,
256+
/// etc.) and values (RunEndScheme, RLE) are pull rules on those schemes in `vortex-btrblocks`.
243257
fn descendant_exclusions(&self) -> Vec<DescendantExclusion> {
244258
vec![DescendantExclusion {
245259
excluded: IntDictScheme.id(),

0 commit comments

Comments
 (0)