vortex-data
diff --git a/‎Cargo.lock‎
Lines changed: 53 additions & 0 deletions b/‎Cargo.lock‎
Lines changed: 53 additions & 0 deletions
diff --git a/‎Cargo.toml‎
Lines changed: 2 additions & 0 deletions b/‎Cargo.toml‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎bench-vortex/benches/compress.rs‎
Lines changed: 1 addition & 1 deletion b/‎bench-vortex/benches/compress.rs‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎encodings/fastlanes/src/bitpacking/compress.rs‎
Lines changed: 4 additions & 1 deletion b/‎encodings/fastlanes/src/bitpacking/compress.rs‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎vortex-array/src/array/datetime/mod.rs‎
Lines changed: 5 additions & 0 deletions b/‎vortex-array/src/array/datetime/mod.rs‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎vortex-btrblocks/Cargo.toml‎
Lines changed: 61 additions & 0 deletions b/‎vortex-btrblocks/Cargo.toml‎
Lines changed: 61 additions & 0 deletions
diff --git a/‎vortex-btrblocks/benches/compress.rs‎
Lines changed: 54 additions & 0 deletions b/‎vortex-btrblocks/benches/compress.rs‎
Lines changed: 54 additions & 0 deletions
diff --git a/‎vortex-btrblocks/benches/dict_encode.rs‎
Lines changed: 43 additions & 0 deletions b/‎vortex-btrblocks/benches/dict_encode.rs‎
Lines changed: 43 additions & 0 deletions
diff --git a/‎vortex-btrblocks/benches/stats_calc.rs‎
Lines changed: 82 additions & 0 deletions b/‎vortex-btrblocks/benches/stats_calc.rs‎
Lines changed: 82 additions & 0 deletions
@@ -6,6 +6,7 @@ members = [
     "pyvortex",
     "vortex",
     "vortex-array",
+    "vortex-btrblocks",
     "vortex-buffer",
     "vortex-datafusion",
     "vortex-datetime-dtype",
@@ -165,6 +166,7 @@ witchcraft-metrics = "1.0.1"
 vortex = { version = "0.24.0", path = "./vortex" }
 vortex-alp = { version = "0.24.0", path = "./encodings/alp" }
 vortex-array = { version = "0.24.0", path = "./vortex-array" }
+vortex-btrblocks = { version = "0.24.0", path = "./vortex-btrblocks" }
 vortex-buffer = { version = "0.24.0", path = "./vortex-buffer" }
 vortex-bytebool = { version = "0.24.0", path = "./encodings/bytebool" }
 vortex-datafusion = { version = "0.24.0", path = "./vortex-datafusion" }
 
@@ -175,7 +175,7 @@ fn benchmark_compress<F, U>(
         group.throughput(Throughput::Bytes(uncompressed_size as u64));
         measurement_time.map(|t| group.measurement_time(t));
         group.bench_function(bench_name, |b| {
-            b.iter_with_large_drop(|| {
+            b.iter(|| {
                 compressed_size =
                     vortex_compressed_written_size(runtime, uncompressed.as_ref()).unwrap();
             });
 
@@ -43,7 +43,10 @@ pub fn bitpack_encode(array: PrimitiveArray, bit_width: u8) -> VortexResult<BitP
 
     if bit_width >= array.ptype().bit_width() as u8 {
         // Nothing we can do
-        vortex_bail!("Cannot pack -- specified bit width is greater than or equal to raw bit width")
+        vortex_bail!(
+            "Cannot pack -- specified bit width {bit_width} >= {}",
+            array.ptype().bit_width()
+        )
     }
 
     // SAFETY: we check that array only contains non-negative values.
 
@@ -180,6 +180,11 @@ impl TemporalArray {
     pub fn ext_dtype(&self) -> Arc<ExtDType> {
         self.ext.ext_dtype().clone()
     }
+
+    /// Retrieve the [`DType`] of the array by delegating to the `ext` field.
+    ///
+    /// This will be a `DType::Extension` variant.
+    pub fn dtype(&self) -> &DType {
+        self.ext.dtype()
+    }
 }
 
 impl From<TemporalArray> for Array {
 
@@ -0,0 +1,61 @@
+[package]
+name = "vortex-btrblocks"
+description = "BtrBlocks style compressor"
+version.workspace = true
+homepage.workspace = true
+repository.workspace = true
+authors.workspace = true
+license.workspace = true
+keywords.workspace = true
+include.workspace = true
+edition.workspace = true
+rust-version.workspace = true
+readme.workspace = true
+categories.workspace = true
+
+[dependencies]
+arrow-buffer = { workspace = true }
+itertools = { workspace = true }
+log = { workspace = true }
+num-traits = { workspace = true }
+rand = { workspace = true }
+rustc-hash = { workspace = true }
+vortex-array = { workspace = true }
+vortex-alp = { workspace = true }
+vortex-buffer = { workspace = true }
+vortex-datetime-dtype = { workspace = true }
+vortex-datetime-parts = { workspace = true }
+vortex-dict = { workspace = true }
+vortex-dtype = { workspace = true }
+vortex-error = { workspace = true }
+vortex-fastlanes = { workspace = true }
+vortex-fsst = { workspace = true }
+vortex-mask = { workspace = true }
+vortex-scalar = { workspace = true }
+vortex-sparse = { workspace = true }
+vortex-runend = { workspace = true }
+vortex-zigzag = { workspace = true }
+
+[dev-dependencies]
+divan = { workspace = true }
+env_logger = "0.11"
+vortex-sampling-compressor = { workspace = true }
+
+[lints]
+workspace = true
+
+[[bench]]
+name = "compress"
+harness = false
+test = false
+
+[[bench]]
+name = "dict_encode"
+harness = false
+test = false
+
+[[bench]]
+name = "stats_calc"
+harness = false
+test = false
+
@@ -0,0 +1,54 @@
+#![allow(clippy::unwrap_used)]
+
+use divan::counter::{BytesCount, ItemsCount};
+use divan::Bencher;
+use rand::prelude::StdRng;
+use rand::{RngCore, SeedableRng};
+use vortex_array::aliases::hash_set::HashSet;
+use vortex_array::{Array, IntoArray, IntoArrayVariant};
+use vortex_btrblocks::integer::IntCompressor;
+use vortex_btrblocks::Compressor;
+use vortex_buffer::buffer_mut;
+use vortex_sampling_compressor::SamplingCompressor;
+
+/// Builds a 1,000,000-element `i32` array that is `-1` everywhere except at 223
+/// distinct, randomly chosen positions.
+///
+/// Meant to mirror the WindowName column from ClickBench. The RNG is created from a
+/// fixed seed on every call.
+fn make_clickbench_window_name() -> Array {
+    let mut values = buffer_mut![-1i32; 1_000_000];
+    let mut rng = StdRng::seed_from_u64(1u64);
+
+    // Positions already overwritten; loop until 223 distinct ones have been filled.
+    let mut filled = HashSet::new();
+    while filled.len() < 223 {
+        let position = (rng.next_u32() as usize) % 1_000_000;
+        // `insert` reports whether the position is new, so a repeated position is
+        // skipped without drawing a value for it.
+        if filled.insert(position) {
+            // One of 100 possible values: a multiple of 5 in `0..=495`.
+            values[position] = 5 * (rng.next_u64() % 100) as i32;
+        }
+    }
+
+    values.freeze().into_array()
+}
+
+/// Benchmarks `IntCompressor::compress` on the array built by
+/// `make_clickbench_window_name`.
+///
+/// The conversion to a primitive array happens inside `with_inputs`, i.e. as part of
+/// input generation rather than inside the benchmarked closure.
+#[divan::bench]
+fn btrblocks(bencher: Bencher) {
+    let primitive_input = || make_clickbench_window_name().into_primitive().unwrap();
+
+    bencher
+        .with_inputs(primitive_input)
+        .input_counter(|input| ItemsCount::new(input.len()))
+        .input_counter(|input| BytesCount::of_many::<i32>(input.len()))
+        // NOTE(review): the meaning of `false`, `3` and `&[]` is defined by
+        // `IntCompressor::compress`, which is not visible here -- confirm before tuning.
+        .bench_local_values(|input| IntCompressor::compress(&input, false, 3, &[]).unwrap());
+}
+
+/// Benchmarks `SamplingCompressor::compress` on the array built by
+/// `make_clickbench_window_name`.
+///
+/// The compressor is constructed once, before the bencher is configured, and reused
+/// for every input.
+#[divan::bench]
+fn sampling_compressor(bencher: Bencher) {
+    let sampler = SamplingCompressor::default();
+
+    bencher
+        .with_inputs(make_clickbench_window_name)
+        .input_counter(|input| ItemsCount::new(input.len()))
+        .input_counter(|input| BytesCount::of_many::<i32>(input.len()))
+        .bench_local_values(|input| sampler.compress(&input, None).unwrap());
+}
+
+/// Entry point of the bench target: hands control to `divan::main`.
+fn main() {
+    divan::main();
+}
@@ -0,0 +1,43 @@
+#![allow(clippy::unwrap_used)]
+
+use divan::Bencher;
+use vortex_array::array::{BoolArray, PrimitiveArray};
+use vortex_array::validity::Validity;
+use vortex_array::IntoArray;
+use vortex_btrblocks::integer::dictionary::dictionary_encode;
+use vortex_btrblocks::integer::IntegerStats;
+use vortex_btrblocks::CompressorStats;
+use vortex_buffer::BufferMut;
+use vortex_dict::builders::dict_encode;
+
+/// Builds the 64,000-element `i32` array used as input by the benchmarks in this file.
+///
+/// Values cycle through `0..50`; the validity array repeats six `true` entries
+/// followed by one `false`.
+fn make_array() -> PrimitiveArray {
+    const LEN: usize = 64_000;
+    const VALIDITY_PATTERN: [bool; 7] = [true, true, true, true, true, true, false];
+
+    let values: BufferMut<i32> = (0..50).cycle().take(LEN).collect();
+    // NOTE(review): presumably `false` marks a null slot -- confirm against `Validity::Array`.
+    let validity = BoolArray::from_iter(VALIDITY_PATTERN.into_iter().cycle().take(LEN));
+
+    PrimitiveArray::new(values, Validity::Array(validity.into_array()))
+}
+
+/// Benchmarks `vortex_dict::builders::dict_encode` on the array from `make_array`.
+#[divan::bench]
+fn encode_generic(bencher: Bencher) {
+    let input = || make_array().into_array();
+
+    bencher
+        .with_inputs(input)
+        .bench_local_values(|array| dict_encode(&array).unwrap());
+}
+
+/// Benchmarks the integer-specific `dictionary_encode`, which takes `IntegerStats`.
+///
+/// The stats are generated inside `with_inputs`, i.e. as part of input generation
+/// rather than inside the benchmarked closure.
+#[divan::bench]
+fn encode_specialized(bencher: Bencher) {
+    let precomputed_stats = || IntegerStats::generate(&make_array());
+
+    bencher
+        .with_inputs(precomputed_stats)
+        .bench_local_values(|stats| dictionary_encode(&stats).unwrap());
+}
+
+/// Entry point of the bench target: hands control to `divan::main`.
+fn main() {
+    divan::main();
+}
@@ -0,0 +1,82 @@
+#![allow(clippy::cast_possible_truncation, clippy::use_debug)]
+
+use vortex_buffer::{Buffer, BufferMut};
+
+/// Generates 64,000 `u32` values laid out as runs of repeated values.
+///
+/// Each run repeats a single value drawn from `0..distinct`. Run lengths are drawn from
+/// `0..max_run`, with a drawn length of `0` bumped up to `1`; the final run is cut off
+/// once 64,000 values have been written.
+///
+/// The generator is created from a fixed seed, so every call with the same arguments
+/// produces the same data. Benchmarks that compare different settings on the same
+/// distribution therefore measure identical inputs, and results are comparable
+/// between runs.
+///
+/// # Panics
+///
+/// Panics if `max_run` or `distinct` is zero (remainder by zero).
+fn generate_dataset(max_run: u32, distinct: u32) -> Buffer<u32> {
+    use rand::prelude::StdRng;
+    use rand::{RngCore, SeedableRng};
+
+    let mut rng = StdRng::seed_from_u64(0);
+    let mut output = BufferMut::with_capacity(64_000);
+    // Remaining length of the current run; zero means a new run must be started.
+    let mut run = 0;
+    let mut value = 0;
+    for _ in 0..64_000 {
+        if run == 0 {
+            value = rng.next_u32() % distinct;
+            run = std::cmp::max(rng.next_u32() % max_run, 1);
+        }
+        output.push(value);
+        run -= 1;
+    }
+
+    output.freeze()
+}
+
+/// Shape of the synthetic data fed to the stats benchmarks.
+#[derive(Clone, Copy, Debug)]
+enum Distribution {
+    /// The values `0..1024` repeated cyclically.
+    LowCardinality,
+    /// Random runs generated with `max_run = 4`.
+    ShortRuns,
+    /// Random runs generated with `max_run = 64`.
+    LongRuns,
+}
+
+/// Benchmarks for `IntegerStats::generate_opts` over 64,000 `u32` values
+/// (64,000 items, 256,000 bytes per iteration).
+#[divan::bench_group(items_count = 64_000u32, bytes_count = 256_000u32)]
+mod stats {
+    use divan::Bencher;
+    use vortex_array::array::PrimitiveArray;
+    use vortex_array::validity::Validity;
+    use vortex_btrblocks::integer::IntegerStats;
+    use vortex_btrblocks::{CompressorStats, GenerateStatsOptions};
+    use vortex_buffer::Buffer;
+
+    use crate::{generate_dataset, Distribution};
+
+    /// Builds the non-nullable input array for the requested distribution.
+    fn make_input(distribution: Distribution) -> PrimitiveArray {
+        let values: Buffer<u32> = match distribution {
+            Distribution::LowCardinality => (0..1024).cycle().take(64_000).collect(),
+            Distribution::ShortRuns => generate_dataset(4, 1024),
+            Distribution::LongRuns => generate_dataset(64, 1024),
+        };
+        PrimitiveArray::new(values, Validity::NonNullable)
+    }
+
+    /// Stats generation with `GenerateStatsOptions::default()`.
+    ///
+    /// NOTE(review): presumably the default enables distinct-value counting -- confirm
+    /// against `GenerateStatsOptions`.
+    #[divan::bench(args = [Distribution::LowCardinality, Distribution::ShortRuns, Distribution::LongRuns])]
+    fn stats_dict_on(bencher: Bencher, distribution: Distribution) {
+        // Built once per benchmark; each iteration receives a clone.
+        let input = make_input(distribution);
+
+        bencher
+            .with_inputs(|| input.clone())
+            .bench_refs(|array| {
+                IntegerStats::generate_opts(array, GenerateStatsOptions::default());
+            });
+    }
+
+    /// Stats generation with `count_distinct_values` turned off.
+    #[divan::bench(args = [Distribution::LowCardinality, Distribution::ShortRuns, Distribution::LongRuns])]
+    fn stats_dict_off(bencher: Bencher, distribution: Distribution) {
+        // Built once per benchmark; each iteration receives a clone.
+        let input = make_input(distribution);
+
+        bencher
+            .with_inputs(|| input.clone())
+            .bench_refs(|array| {
+                let options = GenerateStatsOptions {
+                    count_distinct_values: false,
+                };
+                IntegerStats::generate_opts(array, options);
+            });
+    }
+}
+/// Entry point of the bench target: hands control to `divan::main`.
+fn main() {
+    divan::main();
+}
Original file line number	Diff line number	Diff line change
`@@ -180,6 +180,11 @@ impl TemporalArray {`
`180`	`180`	`pub fn ext_dtype(&self) -> Arc<ExtDType> {`
`181`	`181`	`self.ext.ext_dtype().clone()`
`182`	`182`	`}`
	`183`	`+`
	`184`	+ /// Retrieve the DType of the array. This will be a `DType::Extension` variant.
	`185`	`+ pub fn dtype(&self) -> &DType {`
	`186`	`+ self.ext.dtype()`
	`187`	`+ }`
`183`	`188`	`}`
`184`	`189`
`185`	`190`	`impl From<TemporalArray> for Array {`