Skip to content

Commit 86c065d

Browse files
authored
move vortex-dict into vortex-array (#5289)
I think this is the lowest-lift change to enable Arrow -> Vortex conversions for dict arrays. This PR only moves the vortex-dict crate into vortex-array: the first commit only renames files, and the second fixes imports. With this, vortex-array grows by maybe 5% in size. Looking at crates.io, I don't think vortex-dict is used separately from vortex-array much, so I removed the vortex-dict crate instead of re-exporting it from vortex-array. I think the change is minimal enough to be reverted when we extract Arrow out of vortex-array; happy to hear people's thoughts. With this change, all Arrow-supported encodings live in vortex-array. --------- Signed-off-by: Onur Satici <[email protected]>
1 parent f7dcd01 commit 86c065d

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

58 files changed

+311
-361
lines changed

Cargo.lock

Lines changed: 0 additions & 27 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,6 @@ vortex-compute = { version = "0.1.0", path = "./vortex-compute", default-feature
226226
vortex-datafusion = { version = "0.1.0", path = "./vortex-datafusion", default-features = false }
227227
vortex-datetime-parts = { version = "0.1.0", path = "./encodings/datetime-parts", default-features = false }
228228
vortex-decimal-byte-parts = { version = "0.1.0", path = "encodings/decimal-byte-parts", default-features = false }
229-
vortex-dict = { version = "0.1.0", path = "./encodings/dict", default-features = false }
230229
vortex-dtype = { version = "0.1.0", path = "./vortex-dtype", default-features = false }
231230
vortex-error = { version = "0.1.0", path = "./vortex-error", default-features = false }
232231
vortex-fastlanes = { version = "0.1.0", path = "./encodings/fastlanes", default-features = false }

encodings/dict/Cargo.toml

Lines changed: 0 additions & 64 deletions
This file was deleted.

encodings/fsst/Cargo.toml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ workspace = true
2020
async-trait = { workspace = true }
2121
fsst-rs = { workspace = true }
2222
prost = { workspace = true }
23+
rand = { workspace = true, optional = true }
2324
vortex-array = { workspace = true }
2425
vortex-buffer = { workspace = true }
2526
vortex-dtype = { workspace = true }
@@ -28,6 +29,9 @@ vortex-mask = { workspace = true }
2829
vortex-scalar = { workspace = true }
2930
vortex-vector = { workspace = true }
3031

32+
[features]
33+
test-harness = ["dep:rand", "vortex-array/test-harness"]
34+
3135
[dev-dependencies]
3236
divan = { workspace = true }
3337
itertools = { workspace = true }
@@ -38,3 +42,8 @@ vortex-array = { workspace = true, features = ["test-harness"] }
3842
[[bench]]
3943
name = "fsst_compress"
4044
harness = false
45+
46+
[[bench]]
47+
name = "chunked_dict_fsst_builder"
48+
harness = false
49+
required-features = ["test-harness"]

encodings/dict/benches/chunked_dict_array_builder.rs renamed to encodings/fsst/benches/chunked_dict_fsst_builder.rs

Lines changed: 1 addition & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,12 @@
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

44
use divan::Bencher;
5-
use rand::distr::{Distribution, StandardUniform};
65
use vortex_array::arrays::ChunkedArray;
76
use vortex_array::builders::builder_with_capacity;
87
use vortex_array::compute::warm_up_vtables;
98
use vortex_array::{Array, ArrayRef, IntoArray};
10-
use vortex_dict::test::{gen_dict_fsst_test_data, gen_dict_primitive_chunks};
119
use vortex_dtype::NativePType;
10+
use vortex_fsst::test_utils::gen_dict_fsst_test_data;
1211

1312
fn main() {
1413
warm_up_vtables();
@@ -24,36 +23,6 @@ const BENCH_ARGS: &[(usize, usize, usize)] = &[
2423
(1000, 1000, 100),
2524
];
2625

27-
#[divan::bench(types = [u32, u64, f32, f64], args = BENCH_ARGS)]
28-
fn chunked_dict_primitive_canonical_into<T: NativePType>(
29-
bencher: Bencher,
30-
(len, unique_values, chunk_count): (usize, usize, usize),
31-
) where
32-
StandardUniform: Distribution<T>,
33-
{
34-
let chunk = gen_dict_primitive_chunks::<T, u16>(len, unique_values, chunk_count);
35-
36-
bencher.with_inputs(|| chunk.clone()).bench_values(|chunk| {
37-
let mut builder = builder_with_capacity(chunk.dtype(), len * chunk_count);
38-
chunk.append_to_builder(builder.as_mut());
39-
builder.finish()
40-
})
41-
}
42-
43-
#[divan::bench(types = [u32, u64, f32, f64], args = BENCH_ARGS)]
44-
fn chunked_dict_primitive_into_canonical<T: NativePType>(
45-
bencher: Bencher,
46-
(len, unique_values, chunk_count): (usize, usize, usize),
47-
) where
48-
StandardUniform: Distribution<T>,
49-
{
50-
let chunk = gen_dict_primitive_chunks::<T, u16>(len, unique_values, chunk_count);
51-
52-
bencher
53-
.with_inputs(|| chunk.clone())
54-
.bench_values(|chunk| chunk.to_canonical())
55-
}
56-
5726
fn make_dict_fsst_chunks<T: NativePType>(
5827
len: usize,
5928
unique_values: usize,

encodings/fsst/src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ mod compress;
1717
mod compute;
1818
mod ops;
1919
mod serde;
20+
#[cfg(feature = "test-harness")]
21+
pub mod test_utils;
2022
#[cfg(test)]
2123
mod tests;
2224

encodings/fsst/src/test_utils.rs

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
#![allow(clippy::unwrap_used)]
5+
6+
use rand::prelude::StdRng;
7+
use rand::{Rng, SeedableRng};
8+
use vortex_array::arrays::{DictArray, PrimitiveArray, VarBinArray};
9+
use vortex_array::{ArrayRef, IntoArray};
10+
use vortex_dtype::{DType, NativePType, Nullability};
11+
use vortex_error::VortexUnwrap;
12+
13+
use crate::{fsst_compress, fsst_train_compressor};
14+
15+
pub fn gen_fsst_test_data(len: usize, avg_str_len: usize, unique_chars: u8) -> ArrayRef {
16+
let mut rng = StdRng::seed_from_u64(0);
17+
let mut strings = Vec::with_capacity(len);
18+
19+
for _ in 0..len {
20+
// Generate a random string with length around `avg_str_len`. The number of possible
21+
// characters within the random string is defined by `unique_chars`.
22+
let len = avg_str_len * rng.random_range(50..=150) / 100;
23+
strings.push(Some(
24+
(0..len)
25+
.map(|_| rng.random_range(b'a'..(b'a' + unique_chars)))
26+
.collect::<Vec<u8>>(),
27+
));
28+
}
29+
30+
let varbin = VarBinArray::from_iter(
31+
strings
32+
.into_iter()
33+
.map(|opt_s| opt_s.map(Vec::into_boxed_slice)),
34+
DType::Binary(Nullability::NonNullable),
35+
);
36+
let compressor = fsst_train_compressor(varbin.as_ref()).vortex_unwrap();
37+
38+
fsst_compress(varbin.as_ref(), &compressor)
39+
.vortex_unwrap()
40+
.into_array()
41+
}
42+
43+
pub fn gen_dict_fsst_test_data<T: NativePType>(
44+
len: usize,
45+
unique_values: usize,
46+
str_len: usize,
47+
unique_char_count: u8,
48+
) -> DictArray {
49+
let values = gen_fsst_test_data(len, str_len, unique_char_count);
50+
let mut rng = StdRng::seed_from_u64(0);
51+
let codes = (0..len)
52+
.map(|_| T::from(rng.random_range(0..unique_values)).unwrap())
53+
.collect::<PrimitiveArray>();
54+
DictArray::try_new(codes.into_array(), values).vortex_unwrap()
55+
}

vortex-array/Cargo.toml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,3 +140,22 @@ harness = false
140140
name = "expr_large_struct_pack"
141141
path = "benches/expr/large_struct_pack.rs"
142142
harness = false
143+
144+
[[bench]]
145+
name = "chunked_dict_builder"
146+
harness = false
147+
required-features = ["test-harness"]
148+
149+
[[bench]]
150+
name = "dict_compress"
151+
harness = false
152+
required-features = ["test-harness"]
153+
154+
[[bench]]
155+
name = "dict_compare"
156+
harness = false
157+
required-features = ["test-harness"]
158+
159+
[[bench]]
160+
name = "dict_mask"
161+
harness = false
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
use divan::Bencher;
5+
use rand::distr::{Distribution, StandardUniform};
6+
use vortex_array::Array;
7+
use vortex_array::arrays::dict_test::gen_dict_primitive_chunks;
8+
use vortex_array::builders::builder_with_capacity;
9+
use vortex_array::compute::warm_up_vtables;
10+
use vortex_dtype::NativePType;
11+
12+
fn main() {
13+
warm_up_vtables();
14+
divan::main();
15+
}
16+
17+
const BENCH_ARGS: &[(usize, usize, usize)] = &[
18+
(1000, 10, 10),
19+
(1000, 100, 10),
20+
(1000, 1000, 10),
21+
(1000, 10, 100),
22+
(1000, 100, 100),
23+
(1000, 1000, 100),
24+
];
25+
26+
#[divan::bench(types = [u32, u64, f32, f64], args = BENCH_ARGS)]
27+
fn chunked_dict_primitive_canonical_into<T: NativePType>(
28+
bencher: Bencher,
29+
(len, unique_values, chunk_count): (usize, usize, usize),
30+
) where
31+
StandardUniform: Distribution<T>,
32+
{
33+
let chunk = gen_dict_primitive_chunks::<T, u16>(len, unique_values, chunk_count);
34+
35+
bencher.with_inputs(|| chunk.clone()).bench_values(|chunk| {
36+
let mut builder = builder_with_capacity(chunk.dtype(), len * chunk_count);
37+
chunk.append_to_builder(builder.as_mut());
38+
builder.finish()
39+
})
40+
}
41+
42+
#[divan::bench(types = [u32, u64, f32, f64], args = BENCH_ARGS)]
43+
fn chunked_dict_primitive_into_canonical<T: NativePType>(
44+
bencher: Bencher,
45+
(len, unique_values, chunk_count): (usize, usize, usize),
46+
) where
47+
StandardUniform: Distribution<T>,
48+
{
49+
let chunk = gen_dict_primitive_chunks::<T, u16>(len, unique_values, chunk_count);
50+
51+
bencher
52+
.with_inputs(|| chunk.clone())
53+
.bench_values(|chunk| chunk.to_canonical())
54+
}
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@
66
use std::str::from_utf8;
77

88
use vortex_array::accessor::ArrayAccessor;
9+
use vortex_array::arrays::dict_test::{gen_primitive_for_dict, gen_varbin_words};
910
use vortex_array::arrays::{ConstantArray, VarBinArray, VarBinViewArray};
11+
use vortex_array::builders::dict::dict_encode;
1012
use vortex_array::compute::{Operator, compare, warm_up_vtables};
11-
use vortex_dict::builders::dict_encode;
12-
use vortex_dict::test::{gen_primitive_for_dict, gen_varbin_words};
1313

1414
fn main() {
1515
warm_up_vtables();

0 commit comments

Comments
 (0)