Skip to content

Commit 16b312f

Browse files
authored
Revert "Btrblock compress string to constant" (#2837)
Reverts #2830. See #2836 for details.
1 parent fd848ee commit 16b312f

File tree

1 file changed

+13
-92
lines changed

1 file changed

+13
-92
lines changed

vortex-btrblocks/src/string.rs

Lines changed: 13 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,9 @@
11
use vortex_array::aliases::hash_set::HashSet;
2-
use vortex_array::arrays::{ConstantArray, VarBinViewArray};
3-
use vortex_array::stats::Precision;
4-
use vortex_array::stats::Stat::IsConstant;
5-
use vortex_array::{Array, ArrayRef, ArrayStatistics, ToCanonical};
2+
use vortex_array::arrays::VarBinViewArray;
3+
use vortex_array::{Array, ArrayRef, ToCanonical};
64
use vortex_dict::DictArray;
75
use vortex_dict::builders::dict_encode;
8-
use vortex_error::{VortexExpect, VortexResult, vortex_bail};
6+
use vortex_error::{VortexExpect, VortexResult};
97
use vortex_fsst::{fsst_compress, fsst_train_compressor};
108

119
use crate::downscale::downscale_integer_array;
@@ -19,35 +17,28 @@ use crate::{
1917
#[derive(Clone, Debug)]
2018
pub struct StringStats {
2119
src: VarBinViewArray,
22-
exact_distinct_count: Option<u32>,
2320
estimated_distinct_count: u32,
2421
value_count: u32,
25-
null_count: u32,
22+
// null_count: u32,
2623
}
2724

2825
/// Estimate the number of distinct strings in the var bin view array.
29-
/// If all are inlined then is count is exact and true is returned.
3026
#[allow(clippy::cast_possible_truncation)]
31-
fn estimate_distinct_count(strings: &VarBinViewArray) -> (u32, bool) {
27+
fn estimate_distinct_count(strings: &VarBinViewArray) -> u32 {
3228
let views = strings.views();
3329
// Iterate the views. Two strings which are equal must have the same first 8-bytes.
3430
// NOTE: there are cases where this performs pessimally, e.g. when we have strings that all
3531
// share a 4-byte prefix and have the same length.
3632
let mut distinct = HashSet::with_capacity(views.len() / 2);
37-
let mut all_inlined = true;
3833
views.iter().for_each(|&view| {
3934
let len_and_prefix = view.as_u128() as u64;
4035
distinct.insert(len_and_prefix);
41-
all_inlined &= view.is_inlined();
4236
});
4337

44-
(
45-
distinct
46-
.len()
47-
.try_into()
48-
.vortex_expect("distinct count must fit in u32"),
49-
all_inlined,
50-
)
38+
distinct
39+
.len()
40+
.try_into()
41+
.vortex_expect("distinct count must fit in u32")
5142
}
5243

5344
impl CompressorStats for StringStats {
@@ -59,22 +50,17 @@ impl CompressorStats for StringStats {
5950
.compute_null_count()
6051
.vortex_expect("null count");
6152
let value_count = input.len() - null_count;
62-
let (estimated_distinct, is_exact) = if opts.count_distinct_values {
53+
let estimated_distinct = if opts.count_distinct_values {
6354
estimate_distinct_count(input)
6455
} else {
65-
(u32::MAX, false)
56+
u32::MAX
6657
};
6758

68-
if is_exact && estimated_distinct == 1 && null_count == 0 {
69-
input.statistics().set(IsConstant, Precision::exact(true));
70-
}
71-
7259
Self {
7360
src: input.clone(),
7461
value_count: value_count.try_into().vortex_expect("value_count"),
75-
null_count: null_count.try_into().vortex_expect("null_count"),
62+
// null_count: null_count.try_into().vortex_expect("null_count"),
7663
estimated_distinct_count: estimated_distinct,
77-
exact_distinct_count: is_exact.then_some(estimated_distinct),
7864
}
7965
}
8066

@@ -99,12 +85,7 @@ impl Compressor for StringCompressor {
9985
type StatsType = StringStats;
10086

10187
fn schemes() -> &'static [&'static Self::SchemeType] {
102-
&[
103-
&UncompressedScheme,
104-
&ConstantScheme,
105-
&DictScheme,
106-
&FSSTScheme,
107-
]
88+
&[&UncompressedScheme, &DictScheme, &FSSTScheme]
10889
}
10990

11091
fn default_scheme() -> &'static Self::SchemeType {
@@ -123,9 +104,6 @@ impl<T> StringScheme for T where T: Scheme<StatsType = StringStats, CodeType = S
123104
#[derive(Debug, Copy, Clone)]
124105
pub struct UncompressedScheme;
125106

126-
#[derive(Debug, Copy, Clone)]
127-
pub struct ConstantScheme;
128-
129107
#[derive(Debug, Copy, Clone)]
130108
pub struct DictScheme;
131109

@@ -138,7 +116,6 @@ pub struct StringCode(u8);
138116
const UNCOMPRESSED_SCHEME: StringCode = StringCode(0);
139117
const DICT_SCHEME: StringCode = StringCode(1);
140118
const FSST_SCHEME: StringCode = StringCode(2);
141-
const CONSTANT_SCHEME: StringCode = StringCode(3);
142119

143120
impl Scheme for UncompressedScheme {
144121
type StatsType = StringStats;
@@ -169,62 +146,6 @@ impl Scheme for UncompressedScheme {
169146
}
170147
}
171148

172-
impl Scheme for ConstantScheme {
173-
type StatsType = StringStats;
174-
type CodeType = StringCode;
175-
176-
fn code(&self) -> Self::CodeType {
177-
CONSTANT_SCHEME
178-
}
179-
180-
fn is_constant(&self) -> bool {
181-
true
182-
}
183-
184-
fn expected_compression_ratio(
185-
&self,
186-
stats: &StringStats,
187-
is_sample: bool,
188-
_allowed_cascading: usize,
189-
_excludes: &[Self::CodeType],
190-
) -> VortexResult<f64> {
191-
// Never yield ConstantScheme for a sample, it could be a false-positive.
192-
if is_sample {
193-
return Ok(0.0);
194-
}
195-
196-
// Only arrays with one distinct values can be constant compressed.
197-
if stats.exact_distinct_count.unwrap_or(stats.value_count) != 1 {
198-
return Ok(0.0);
199-
}
200-
201-
// Cannot have mix of nulls and non-nulls
202-
if stats.null_count > 0 && stats.value_count > 0 {
203-
return Ok(0.0);
204-
}
205-
206-
Ok(stats.value_count as f64)
207-
}
208-
209-
fn compress(
210-
&self,
211-
stats: &Self::StatsType,
212-
_is_sample: bool,
213-
_allowed_cascading: usize,
214-
_excludes: &[Self::CodeType],
215-
) -> VortexResult<ArrayRef> {
216-
if stats.source().statistics().get_as::<bool>(IsConstant) != Some(Precision::exact(true)) {
217-
vortex_bail!("array was thought to be constant, but it is not");
218-
}
219-
let scalar = stats
220-
.source()
221-
.as_constant()
222-
.vortex_expect("constant array expected");
223-
224-
Ok(ConstantArray::new(scalar, stats.src.len()).into_array())
225-
}
226-
}
227-
228149
impl Scheme for DictScheme {
229150
type StatsType = StringStats;
230151
type CodeType = StringCode;

0 commit comments

Comments
 (0)