Skip to content

Commit 8ee6bd6

Browse files
authored
fix: don't recursively dictionary-encode dictionary codes (#4352)
I added an optional `exclude_int_dict_encoding` setting to the `BtrBlocksCompressor` to be minimally invasive but I wonder if we should just call `IntCompressor::compress_no_dict` always? I'm not sure it ever makes sense to dictionary encode integers. Signed-off-by: Alfonso Subiotto Marques <[email protected]>
1 parent aa7ec20 commit 8ee6bd6

File tree

10 files changed

+64
-15
lines changed

10 files changed

+64
-15
lines changed

fuzz/fuzz_targets/array_ops.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ fuzz_target!(|fuzz_action: FuzzArrayAction| -> Corpus {
2727
for (i, (action, expected)) in actions.into_iter().enumerate() {
2828
match action {
2929
Action::Compress => {
30-
current_array = BtrBlocksCompressor
30+
current_array = BtrBlocksCompressor::default()
3131
.compress(current_array.to_canonical().vortex_unwrap().as_ref())
3232
.vortex_unwrap();
3333
assert_array_eq(&expected.array(), &current_array, i).unwrap();
@@ -56,7 +56,9 @@ fuzz_target!(|fuzz_action: FuzzArrayAction| -> Corpus {
5656
])
5757
.contains(&current_array.encoding_id())
5858
{
59-
sorted = BtrBlocksCompressor.compress(&sorted).vortex_unwrap();
59+
sorted = BtrBlocksCompressor::default()
60+
.compress(&sorted)
61+
.vortex_unwrap();
6062
}
6163
assert_search_sorted(sorted, s, side, expected.search(), i).unwrap()
6264
}

fuzz/src/array/mod.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,9 @@ impl<'a> Arbitrary<'a> for FuzzArrayAction {
128128
)
129129
.into_array();
130130

131-
let compressed = BtrBlocksCompressor.compress(&indices_array).vortex_unwrap();
131+
let compressed = BtrBlocksCompressor::default()
132+
.compress(&indices_array)
133+
.vortex_unwrap();
132134
(
133135
Action::Take(compressed),
134136
ExpectedValue::Array(current_array.to_array()),

vortex-btrblocks/src/lib.rs

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -270,8 +270,10 @@ pub trait Compressor {
270270
}
271271
}
272272

273-
#[derive(Debug, Clone)]
274-
pub struct BtrBlocksCompressor;
273+
#[derive(Default, Debug, Clone)]
274+
pub struct BtrBlocksCompressor {
275+
pub exclude_int_dict_encoding: bool,
276+
}
275277

276278
impl BtrBlocksCompressor {
277279
pub fn compress(&self, array: &dyn Array) -> VortexResult<ArrayRef> {
@@ -291,7 +293,11 @@ impl BtrBlocksCompressor {
291293
Canonical::Bool(bool_array) => Ok(bool_array.into_array()),
292294
Canonical::Primitive(primitive) => {
293295
if primitive.ptype().is_int() {
294-
IntCompressor::compress(&primitive, false, MAX_CASCADE, &[])
296+
if self.exclude_int_dict_encoding {
297+
IntCompressor::compress_no_dict(&primitive, false, MAX_CASCADE, &[])
298+
} else {
299+
IntCompressor::compress(&primitive, false, MAX_CASCADE, &[])
300+
}
295301
} else {
296302
FloatCompressor::compress(&primitive, false, MAX_CASCADE, &[])
297303
}

vortex-file/src/strategy.rs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ impl WriteStrategyBuilder {
7676
let compressing = if let Some(ref compressor) = self.compressor {
7777
CompressingStrategy::new_opaque(buffered, compressor.clone(), executor.clone(), 16)
7878
} else {
79-
CompressingStrategy::new_btrblocks(buffered, executor.clone(), 16)
79+
CompressingStrategy::new_btrblocks(buffered, executor.clone(), 16, true)
8080
};
8181

8282
// 4. prior to compression, coalesce up to a minimum size
@@ -97,7 +97,12 @@ impl WriteStrategyBuilder {
9797
1,
9898
)
9999
} else {
100-
CompressingStrategy::new_btrblocks(FlatLayoutStrategy::default(), executor.clone(), 1)
100+
CompressingStrategy::new_btrblocks(
101+
FlatLayoutStrategy::default(),
102+
executor.clone(),
103+
1,
104+
false,
105+
)
101106
};
102107

103108
// 3. apply dict encoding or fallback

vortex-file/src/tests.rs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ use vortex_array::stream::ArrayStreamExt;
1717
use vortex_array::validity::Validity;
1818
use vortex_array::{Array, ArrayRef, IntoArray, ToCanonical};
1919
use vortex_buffer::{Buffer, ByteBufferMut, buffer};
20+
use vortex_dict::{DictEncoding, DictVTable};
2021
use vortex_dtype::PType::I32;
2122
use vortex_dtype::{DType, DecimalDType, Nullability, PType, StructFields};
2223
use vortex_error::VortexResult;
@@ -1183,3 +1184,26 @@ async fn test_into_tokio_array_stream() -> VortexResult<()> {
11831184

11841185
Ok(())
11851186
}
1187+
1188+
#[test]
1189+
fn test_array_stream_no_double_dict_encode() -> VortexResult<()> {
1190+
let num_vals = 2048;
1191+
let mut values = Vec::<i64>::with_capacity(num_vals);
1192+
values.extend(iter::repeat_n(0, num_vals / 2));
1193+
values.extend(iter::repeat_n(1, num_vals / 2));
1194+
1195+
let array = PrimitiveArray::from_iter(values).into_array();
1196+
let buf = VortexWriteOptions::default().write_blocking(Vec::new(), array.to_array_stream())?;
1197+
let file = VortexOpenOptions::in_memory().open(buf)?;
1198+
let read_array = file.scan()?.into_array_iter()?.read_all()?;
1199+
1200+
let dict = read_array
1201+
.as_opt::<DictVTable>()
1202+
.expect("expected root to be dictionary");
1203+
assert_ne!(
1204+
dict.codes().encoding().id(),
1205+
DictEncoding.id(),
1206+
"dictionary codes should not be dictionary encoded"
1207+
);
1208+
Ok(())
1209+
}

vortex-layout/src/layouts/compressed.rs

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ where
4343

4444
impl CompressorPlugin for BtrBlocksCompressor {
4545
fn compress_chunk(&self, chunk: &dyn Array) -> VortexResult<ArrayRef> {
46-
BtrBlocksCompressor::compress(self, chunk)
46+
self.compress(chunk)
4747
}
4848
}
4949

@@ -67,10 +67,20 @@ impl<S: LayoutStrategy> CompressingStrategy<S> {
6767
/// Create a new writer that uses the BtrBlocks-style cascading compressor to compress chunks.
6868
///
6969
/// This provides a good balance between decoding speed and small file size.
70-
pub fn new_btrblocks(child: S, executor: Arc<dyn TaskExecutor>, parallelism: usize) -> Self {
70+
///
71+
/// Set `exclude_int_dict_encoding` to true to prevent dictionary encoding of integer arrays,
72+
/// which is useful when compressing dictionary codes to avoid recursive dictionary encoding.
73+
pub fn new_btrblocks(
74+
child: S,
75+
executor: Arc<dyn TaskExecutor>,
76+
parallelism: usize,
77+
exclude_int_dict_encoding: bool,
78+
) -> Self {
7179
Self {
7280
child,
73-
compressor: Arc::new(BtrBlocksCompressor),
81+
compressor: Arc::new(BtrBlocksCompressor {
82+
exclude_int_dict_encoding,
83+
}),
7484
executor,
7585
parallelism,
7686
}

vortex-layout/src/layouts/dict/writer/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ where
112112
let should_fallback = match first_chunk {
113113
None => true, // empty stream
114114
Some(chunk) => {
115-
let compressed = BtrBlocksCompressor.compress(&chunk)?;
115+
let compressed = BtrBlocksCompressor::default().compress(&chunk)?;
116116
!compressed.is_encoding(DictEncoding.id())
117117
}
118118
};

vortex-python/src/compress.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,6 @@ pub(crate) fn init(py: Python, parent: &Bound<PyModule>) -> PyResult<()> {
5050
/// 'vortex.alp(f64?, len=1000)'
5151
#[pyfunction]
5252
pub fn compress(array: PyArrayRef) -> PyResult<PyArrayRef> {
53-
let compressed = BtrBlocksCompressor.compress(array.inner())?;
53+
let compressed = BtrBlocksCompressor::default().compress(array.inner())?;
5454
Ok(PyArrayRef::from(compressed))
5555
}

vortex/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ mod test {
9191
let array = PrimitiveArray::new(buffer![42u64; 100_000], Validity::NonNullable);
9292

9393
// You can compress an array in-memory with the BtrBlocks compressor
94-
let compressed = BtrBlocksCompressor.compress(array.as_ref())?;
94+
let compressed = BtrBlocksCompressor::default().compress(array.as_ref())?;
9595
println!(
9696
"BtrBlocks size: {} / {}",
9797
compressed.nbytes(),

wasm-test/src/main.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ pub fn main() {
1313
// Extremely simple test of compression/decompression and a few compute functions.
1414
let array = PrimitiveArray::new(buffer![1i32; 1024], Validity::AllValid).to_array();
1515

16-
let compressed = BtrBlocksCompressor.compress(&array).unwrap();
16+
let compressed = BtrBlocksCompressor::default().compress(&array).unwrap();
1717
println!("Compressed size: {}", compressed.len());
1818
println!("Tree view: {}", compressed.display_tree());
1919
}

0 commit comments

Comments
 (0)