Skip to content

Commit c55cc47

Browse files
authored
Somewhat dynamic sampling (#2847)
Instead of hard-coding 640 sampled values (1% of a 64K chunk), we now try to sample about 1% of the data, with a minimum of 640 values (10 samples of 64 values each).
1 parent 4c70980 commit c55cc47

File tree

5 files changed

+22
-11
lines changed

5 files changed

+22
-11
lines changed

vortex-btrblocks/src/float/stats.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ impl CompressorStats for FloatStats {
7070
&self.src
7171
}
7272

73-
fn sample_opts(&self, sample_size: u16, sample_count: u16, opts: GenerateStatsOptions) -> Self {
73+
fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self {
7474
let sampled = sample(self.src.clone(), sample_size, sample_count)
7575
.to_primitive()
7676
.vortex_expect("primitive");

vortex-btrblocks/src/integer/stats.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ impl CompressorStats for IntegerStats {
142142
&self.src
143143
}
144144

145-
fn sample_opts(&self, sample_size: u16, sample_count: u16, opts: GenerateStatsOptions) -> Self {
145+
fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self {
146146
let sampled = sample(self.src.clone(), sample_size, sample_count)
147147
.to_primitive()
148148
.vortex_expect("primitive");

vortex-btrblocks/src/lib.rs

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ use vortex_array::variants::{ExtensionArrayTrait, PrimitiveArrayTrait, StructArr
99
use vortex_array::{Array, ArrayRef, Canonical};
1010
use vortex_dtype::datetime::TemporalMetadata;
1111
use vortex_dtype::{DType, Nullability};
12-
use vortex_error::{VortexExpect, VortexResult};
12+
use vortex_error::{VortexExpect, VortexResult, VortexUnwrap};
1313

1414
pub use crate::float::FloatCompressor;
1515
pub use crate::integer::IntCompressor;
@@ -39,6 +39,8 @@ impl Default for GenerateStatsOptions {
3939
}
4040
}
4141

42+
const SAMPLE_SIZE: u32 = 64;
43+
4244
/// Stats for the compressor.
4345
pub trait CompressorStats: Debug + Clone {
4446
type ArrayType: Array;
@@ -52,11 +54,11 @@ pub trait CompressorStats: Debug + Clone {
5254

5355
fn source(&self) -> &Self::ArrayType;
5456

55-
fn sample(&self, sample_size: u16, sample_count: u16) -> Self {
57+
fn sample(&self, sample_size: u32, sample_count: u32) -> Self {
5658
self.sample_opts(sample_size, sample_count, GenerateStatsOptions::default())
5759
}
5860

59-
fn sample_opts(&self, sample_size: u16, sample_count: u16, opts: GenerateStatsOptions) -> Self;
61+
fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self;
6062
}
6163

6264
/// Top-level compression scheme trait.
@@ -127,7 +129,16 @@ pub fn estimate_compression_ratio_with_sampling<T: Scheme + ?Sized>(
127129
let sample = if is_sample {
128130
stats.clone()
129131
} else {
130-
stats.sample(64, 10)
132+
// We want to sample about 1% of data
133+
let source_len = stats.source().len();
134+
135+
// We want to sample about 1% of data, while keeping a minimal sample of 640 values.
136+
let sample_count = usize::max(
137+
(source_len / 100) / usize::try_from(SAMPLE_SIZE).vortex_unwrap(),
138+
10,
139+
);
140+
141+
stats.sample(SAMPLE_SIZE, sample_count.try_into().vortex_unwrap())
131142
};
132143

133144
let after = compressor

vortex-btrblocks/src/sample.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ use vortex_array::compute::slice;
55
use vortex_array::{Array, ArrayRef};
66
use vortex_error::VortexExpect;
77

8-
pub(crate) fn sample<T: Array + Clone>(input: T, sample_size: u16, sample_count: u16) -> ArrayRef {
8+
pub(crate) fn sample<T: Array + Clone>(input: T, sample_size: u32, sample_count: u32) -> ArrayRef {
99
if input.len() <= (sample_size as usize) * (sample_count as usize) {
1010
return input.to_array();
1111
}
@@ -31,8 +31,8 @@ pub(crate) fn sample<T: Array + Clone>(input: T, sample_size: u16, sample_count:
3131

3232
pub fn stratified_slices(
3333
length: usize,
34-
sample_size: u16,
35-
sample_count: u16,
34+
sample_size: u32,
35+
sample_count: u32,
3636
rng: &mut StdRng,
3737
) -> Vec<(usize, usize)> {
3838
let total_num_samples: usize = (sample_count as usize) * (sample_size as usize);
@@ -62,7 +62,7 @@ pub fn stratified_slices(
6262

6363
/// Split a range of array indices into as-equal-as-possible slices. If the provided `num_partitions` doesn't
6464
/// evenly divide into `length`, then the first `(length % num_partitions)` slices will have an extra element.
65-
pub fn partition_indices(length: usize, num_partitions: u16) -> Vec<(usize, usize)> {
65+
pub fn partition_indices(length: usize, num_partitions: u32) -> Vec<(usize, usize)> {
6666
let num_long_parts = length % num_partitions as usize;
6767
let short_step = length / num_partitions as usize;
6868
let long_step = short_step + 1;

vortex-btrblocks/src/string.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ impl CompressorStats for StringStats {
6868
&self.src
6969
}
7070

71-
fn sample_opts(&self, sample_size: u16, sample_count: u16, opts: GenerateStatsOptions) -> Self {
71+
fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self {
7272
let sampled = sample(self.src.clone(), sample_size, sample_count)
7373
.to_varbinview()
7474
.vortex_expect("varbinview");

0 commit comments

Comments
 (0)