Skip to content
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions vortex-btrblocks/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ rust-version = { workspace = true }
version = { workspace = true }

[dependencies]
arrow-schema = { workspace = true }
getrandom_v03 = { workspace = true }
itertools = { workspace = true }
log = { workspace = true }
Expand Down
53 changes: 53 additions & 0 deletions vortex-btrblocks/src/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use vortex_array::arrays::{
use vortex_array::builders::dict::dict_encode;
use vortex_array::vtable::ValidityHelper;
use vortex_array::{ArrayRef, IntoArray, ToCanonical};
use vortex_dtype::DType;
use vortex_error::{VortexExpect, VortexResult};
use vortex_fsst::{FSSTArray, fsst_compress, fsst_train_compressor};
use vortex_scalar::Scalar;
Expand Down Expand Up @@ -93,6 +94,7 @@ impl Compressor for StringCompressor {
fn schemes() -> &'static [&'static Self::SchemeType] {
&[
&UncompressedScheme,
&VarBinScheme,
&DictScheme,
&FSSTScheme,
&ConstantScheme,
Expand All @@ -116,6 +118,9 @@ impl<T> StringScheme for T where T: Scheme<StatsType = StringStats, CodeType = S
/// Unit struct naming the uncompressed string scheme; it is one of the entries
/// returned by `StringCompressor::schemes`.
#[derive(Debug, Copy, Clone)]
pub struct UncompressedScheme;

/// Unit struct naming the VarBin string scheme; it is one of the entries
/// returned by `StringCompressor::schemes`.
///
/// Its `Scheme::compress` implementation round-trips the source array through
/// an Arrow `Utf8`/`Binary` array.
#[derive(Debug, Copy, Clone)]
pub struct VarBinScheme;

/// Unit struct naming the dictionary string scheme; it is one of the entries
/// returned by `StringCompressor::schemes`.
#[derive(Debug, Copy, Clone)]
pub struct DictScheme;

Expand All @@ -137,6 +142,7 @@ const FSST_SCHEME: StringCode = StringCode(2);
const CONSTANT_SCHEME: StringCode = StringCode(3);

const SPARSE_SCHEME: StringCode = StringCode(4);
/// Scheme code reported by `VarBinScheme::code`.
const VARBIN_SCHEME: StringCode = StringCode(5);

impl Scheme for UncompressedScheme {
type StatsType = StringStats;
Expand Down Expand Up @@ -167,6 +173,53 @@ impl Scheme for UncompressedScheme {
}
}

impl Scheme for VarBinScheme {
type StatsType = StringStats;
type CodeType = StringCode;

/// Returns the code identifying this scheme, `VARBIN_SCHEME`.
fn code(&self) -> StringCode {
VARBIN_SCHEME
}

/// Estimates the compression ratio this scheme would achieve.
///
/// Delegates to `estimate_compression_ratio_with_sampling`, forwarding `self`
/// and every argument unchanged, and returns its result as-is.
fn expected_compression_ratio(
&self,
stats: &Self::StatsType,
is_sample: bool,
allowed_cascading: usize,
excludes: &[StringCode],
) -> VortexResult<f64> {
estimate_compression_ratio_with_sampling(
self,
stats,
is_sample,
allowed_cascading,
excludes,
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can actually estimate this pretty easily by iterating the views, counting total bytes + offsets. That saves us actually having to materialize

}

/// Re-encodes the source array by round-tripping it through Arrow.
///
/// The source dtype picks the Arrow target type (`Utf8` or `Binary`). The
/// source is exported to an Arrow array of that type, and the Arrow array is
/// then imported back with the source's nullability. The sampling, cascading
/// and exclusion arguments are not used.
///
/// # Errors
///
/// Propagates any error returned by the Arrow export.
///
/// # Panics
///
/// Panics if the source dtype is neither `Utf8` nor `Binary`.
fn compress(
    &self,
    stats: &Self::StatsType,
    _is_sample: bool,
    _allowed_cascading: usize,
    _excludes: &[StringCode],
) -> VortexResult<ArrayRef> {
    use vortex_array::arrow::{FromArrowArray, IntoArrowArray};

    // Look the source up once and derive everything else from it.
    let source = stats.source();
    let source_dtype = source.dtype();
    let nullable = source_dtype.is_nullable();

    let arrow_dtype = match source_dtype {
        DType::Utf8(..) => arrow_schema::DataType::Utf8,
        DType::Binary(..) => arrow_schema::DataType::Binary,
        _ => unreachable!("VarBinView must be Utf8 or Binary"),
    };

    // VarBinView -> Arrow VarBin -> Vortex VarBin
    let exported = source.to_array().into_arrow(&arrow_dtype)?;
    Ok(ArrayRef::from_arrow(exported.as_ref(), nullable))
}
}

impl Scheme for DictScheme {
type StatsType = StringStats;
type CodeType = StringCode;
Expand Down
Loading