|
1 | 1 | use criterion::{black_box, criterion_group, criterion_main, BatchSize, Criterion, Throughput}; |
2 | 2 | use itertools::Itertools as _; |
3 | 3 | use mimalloc::MiMalloc; |
4 | | -use rand::{Rng, SeedableRng as _}; |
| 4 | +use rand::distributions::Alphanumeric; |
| 5 | +use rand::seq::SliceRandom as _; |
| 6 | +use rand::{thread_rng, Rng, SeedableRng as _}; |
5 | 7 | use vortex::aliases::hash_set::HashSet; |
6 | | -use vortex::array::PrimitiveArray; |
| 8 | +use vortex::array::{PrimitiveArray, VarBinViewArray}; |
7 | 9 | use vortex::compute::unary::try_cast; |
| 10 | +use vortex::dict::{dict_encode_varbinview, DictArray}; |
8 | 11 | use vortex::dtype::PType; |
| 12 | +use vortex::fsst::{fsst_compress, fsst_train_compressor}; |
9 | 13 | use vortex::sampling_compressor::compressors::alp::ALPCompressor; |
10 | 14 | use vortex::sampling_compressor::compressors::alp_rd::ALPRDCompressor; |
11 | 15 | use vortex::sampling_compressor::compressors::bitpacked::{ |
@@ -92,5 +96,51 @@ fn primitive(c: &mut Criterion) { |
92 | 96 | } |
93 | 97 | } |
94 | 98 |
|
95 | | -criterion_group!(benches, primitive); |
| 99 | +fn strings(c: &mut Criterion) { |
| 100 | + let mut group = c.benchmark_group("string-decompression"); |
| 101 | + let num_values = u16::MAX as u64; |
| 102 | + group.throughput(Throughput::Bytes(num_values * 8)); |
| 103 | + |
| 104 | + let varbinview_arr = VarBinViewArray::from_iter_str(gen_varbin_words(1_000_000, 0.00005)); |
| 105 | + let (codes, values) = dict_encode_varbinview(&varbinview_arr); |
| 106 | + group.throughput(Throughput::Bytes( |
| 107 | + varbinview_arr.clone().into_array().nbytes() as u64, |
| 108 | + )); |
| 109 | + group.bench_function("dict_decode_varbinview", |b| { |
| 110 | + b.iter_batched( |
| 111 | + || DictArray::try_new(codes.clone().into_array(), values.clone().into_array()).unwrap(), |
| 112 | + |dict_arr| black_box(dict_arr.into_canonical().unwrap()), |
| 113 | + BatchSize::SmallInput, |
| 114 | + ); |
| 115 | + }); |
| 116 | + |
| 117 | + let fsst_compressor = fsst_train_compressor(&varbinview_arr.clone().into_array()).unwrap(); |
| 118 | + let fsst_array = fsst_compress(&varbinview_arr.clone().into_array(), &fsst_compressor).unwrap(); |
| 119 | + group.bench_function("fsst_decompress_varbinview", |b| { |
| 120 | + b.iter_batched( |
| 121 | + || fsst_array.clone(), |
| 122 | + |fsst_arr| black_box(fsst_arr.into_canonical().unwrap()), |
| 123 | + BatchSize::SmallInput, |
| 124 | + ); |
| 125 | + }); |
| 126 | +} |
| 127 | + |
| 128 | +fn gen_varbin_words(len: usize, uniqueness: f64) -> Vec<String> { |
| 129 | + let mut rng = thread_rng(); |
| 130 | + let uniq_cnt = (len as f64 * uniqueness) as usize; |
| 131 | + let dict: Vec<String> = (0..uniq_cnt) |
| 132 | + .map(|_| { |
| 133 | + (&mut rng) |
| 134 | + .sample_iter(&Alphanumeric) |
| 135 | + .take(8) |
| 136 | + .map(char::from) |
| 137 | + .collect() |
| 138 | + }) |
| 139 | + .collect(); |
| 140 | + (0..len) |
| 141 | + .map(|_| dict.choose(&mut rng).unwrap().clone()) |
| 142 | + .collect() |
| 143 | +} |
| 144 | + |
| 145 | +criterion_group!(benches, primitive, strings); |
96 | 146 | criterion_main!(benches); |
0 commit comments