Skip to content

Commit 69a7529

Browse files
committed
feat: impl CpcWrapper
Signed-off-by: tison <wander4096@gmail.com>
1 parent 6546d66 commit 69a7529

File tree

7 files changed

+373
-62
lines changed

7 files changed

+373
-62
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ All significant changes to this project will be documented in this file.
1313

1414
* `CountMinSketch` with unsigned values now supports `halve` and `decay` operations.
1515
* `CpcSketch` and `CpcUnion` are now available for cardinality estimation.
16+
* `CpcWrapper` is now available for reading a `CpcSketch`'s estimate from a serialized sketch without full deserialization.
1617
* `FrequentItemsSketch` now supports serde for any value implementing `FrequentItemValue` (built-in support for `i64`, `u64`, and `String`).
1718
* Expose `codec::SketchBytes`, `codec::SketchSlice`, and `FrequentItemValue` as public API.
1819

datasketches/src/cpc/estimator.rs

Lines changed: 41 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,43 @@ static HIP_HIGH_SIDE_DATA: [u16; 33] = [
8888
5880, 5914, 5953, // 14 1000297
8989
];
9090

91-
pub(super) fn icon_confidence_lb(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 {
91+
pub(super) fn estimate(merge_flag: bool, hip_est_accum: f64, lg_k: u8, num_coupons: u32) -> f64 {
92+
if !merge_flag {
93+
hip_est_accum
94+
} else {
95+
icon_estimate(lg_k, num_coupons)
96+
}
97+
}
98+
99+
pub(super) fn lower_bound(
100+
merge_flag: bool,
101+
hip_est_accum: f64,
102+
lg_k: u8,
103+
num_coupons: u32,
104+
kappa: NumStdDev,
105+
) -> f64 {
106+
if !merge_flag {
107+
hip_confidence_lb(lg_k, num_coupons, hip_est_accum, kappa)
108+
} else {
109+
icon_confidence_lb(lg_k, num_coupons, kappa)
110+
}
111+
}
112+
113+
pub(super) fn upper_bound(
114+
merge_flag: bool,
115+
hip_est_accum: f64,
116+
lg_k: u8,
117+
num_coupons: u32,
118+
kappa: NumStdDev,
119+
) -> f64 {
120+
if !merge_flag {
121+
hip_confidence_ub(lg_k, num_coupons, hip_est_accum, kappa)
122+
} else {
123+
icon_confidence_ub(lg_k, num_coupons, kappa)
124+
}
125+
}
126+
127+
fn icon_confidence_lb(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 {
92128
if num_coupons == 0 {
93129
return 0.0;
94130
}
@@ -112,7 +148,7 @@ pub(super) fn icon_confidence_lb(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -
112148
}
113149
}
114150

115-
pub(super) fn icon_confidence_ub(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 {
151+
fn icon_confidence_ub(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -> f64 {
116152
if num_coupons == 0 {
117153
return 0.0;
118154
}
@@ -132,12 +168,7 @@ pub(super) fn icon_confidence_ub(lg_k: u8, num_coupons: u32, kappa: NumStdDev) -
132168
result.ceil() // slight widening of interval to be conservative
133169
}
134170

135-
pub(super) fn hip_confidence_lb(
136-
lg_k: u8,
137-
num_coupons: u32,
138-
hip_est_accum: f64,
139-
kappa: NumStdDev,
140-
) -> f64 {
171+
fn hip_confidence_lb(lg_k: u8, num_coupons: u32, hip_est_accum: f64, kappa: NumStdDev) -> f64 {
141172
if num_coupons == 0 {
142173
return 0.0;
143174
}
@@ -160,12 +191,7 @@ pub(super) fn hip_confidence_lb(
160191
}
161192
}
162193

163-
pub(super) fn hip_confidence_ub(
164-
lg_k: u8,
165-
num_coupons: u32,
166-
hip_est_accum: f64,
167-
kappa: NumStdDev,
168-
) -> f64 {
194+
fn hip_confidence_ub(lg_k: u8, num_coupons: u32, hip_est_accum: f64, kappa: NumStdDev) -> f64 {
169195
if num_coupons == 0 {
170196
return 0.0;
171197
}
@@ -362,7 +388,7 @@ fn icon_exponential_approximation(k: f64, c: f64) -> f64 {
362388
0.7940236163830469 * k * 2f64.powf(c / k)
363389
}
364390

365-
pub(super) fn icon_estimate(lg_k: u8, num_coupons: u32) -> f64 {
391+
fn icon_estimate(lg_k: u8, num_coupons: u32) -> f64 {
366392
let lg_k = lg_k as usize;
367393
assert!(
368394
(ICON_MIN_LOG_K..=ICON_MAX_LOG_K).contains(&lg_k),

datasketches/src/cpc/mod.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,11 +40,14 @@ mod compression_data;
4040
mod estimator;
4141
mod kxp_byte_lookup;
4242
mod pair_table;
43+
mod serialization;
4344
mod sketch;
4445
mod union;
46+
mod wrapper;
4547

4648
pub use self::sketch::CpcSketch;
4749
pub use self::union::CpcUnion;
50+
pub use self::wrapper::CpcWrapper;
4851

4952
/// Default log2 of K.
5053
const DEFAULT_LG_K: u8 = 11;
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
pub(super) const SERIAL_VERSION: u8 = 1;
19+
pub(super) const FLAG_COMPRESSED: u8 = 1;
20+
pub(super) const FLAG_HAS_HIP: u8 = 2;
21+
pub(super) const FLAG_HAS_TABLE: u8 = 3;
22+
pub(super) const FLAG_HAS_WINDOW: u8 = 4;
23+
24+
pub(super) fn make_preamble_ints(
25+
num_coupons: u32,
26+
has_hip: bool,
27+
has_table: bool,
28+
has_window: bool,
29+
) -> u8 {
30+
let mut preamble_ints = 2;
31+
if num_coupons > 0 {
32+
preamble_ints += 1; // number of coupons
33+
if has_hip {
34+
preamble_ints += 4; // HIP
35+
}
36+
if has_table {
37+
preamble_ints += 1; // table data length
38+
// number of values (if there is no window it is the same as number of coupons)
39+
if has_window {
40+
preamble_ints += 1;
41+
}
42+
}
43+
if has_window {
44+
preamble_ints += 1; // window length
45+
}
46+
}
47+
preamble_ints
48+
}

datasketches/src/cpc/sketch.rs

Lines changed: 29 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,17 @@ use crate::cpc::compression::CompressedState;
3333
use crate::cpc::count_bits_set_in_matrix;
3434
use crate::cpc::determine_correct_offset;
3535
use crate::cpc::determine_flavor;
36-
use crate::cpc::estimator::hip_confidence_lb;
37-
use crate::cpc::estimator::hip_confidence_ub;
38-
use crate::cpc::estimator::icon_confidence_lb;
39-
use crate::cpc::estimator::icon_confidence_ub;
40-
use crate::cpc::estimator::icon_estimate;
36+
use crate::cpc::estimator::estimate;
37+
use crate::cpc::estimator::lower_bound;
38+
use crate::cpc::estimator::upper_bound;
4139
use crate::cpc::kxp_byte_lookup::KXP_BYTE_TABLE;
4240
use crate::cpc::pair_table::PairTable;
41+
use crate::cpc::serialization::FLAG_COMPRESSED;
42+
use crate::cpc::serialization::FLAG_HAS_HIP;
43+
use crate::cpc::serialization::FLAG_HAS_TABLE;
44+
use crate::cpc::serialization::FLAG_HAS_WINDOW;
45+
use crate::cpc::serialization::SERIAL_VERSION;
46+
use crate::cpc::serialization::make_preamble_ints;
4347
use crate::error::Error;
4448
use crate::error::ErrorKind;
4549
use crate::hash::DEFAULT_UPDATE_SEED;
@@ -130,29 +134,34 @@ impl CpcSketch {
130134

131135
/// Returns the best estimate of the cardinality of the sketch.
132136
pub fn estimate(&self) -> f64 {
133-
if !self.merge_flag {
134-
self.hip_est_accum
135-
} else {
136-
icon_estimate(self.lg_k, self.num_coupons)
137-
}
137+
estimate(
138+
self.merge_flag,
139+
self.hip_est_accum,
140+
self.lg_k,
141+
self.num_coupons,
142+
)
138143
}
139144

140145
/// Returns the best estimate of the lower bound of the confidence interval given `kappa`.
141146
pub fn lower_bound(&self, kappa: NumStdDev) -> f64 {
142-
if !self.merge_flag {
143-
hip_confidence_lb(self.lg_k, self.num_coupons, self.hip_est_accum, kappa)
144-
} else {
145-
icon_confidence_lb(self.lg_k, self.num_coupons, kappa)
146-
}
147+
lower_bound(
148+
self.merge_flag,
149+
self.hip_est_accum,
150+
self.lg_k,
151+
self.num_coupons,
152+
kappa,
153+
)
147154
}
148155

149156
/// Returns the best estimate of the upper bound of the confidence interval given `kappa`.
150157
pub fn upper_bound(&self, kappa: NumStdDev) -> f64 {
151-
if !self.merge_flag {
152-
hip_confidence_ub(self.lg_k, self.num_coupons, self.hip_est_accum, kappa)
153-
} else {
154-
icon_confidence_ub(self.lg_k, self.num_coupons, kappa)
155-
}
158+
upper_bound(
159+
self.merge_flag,
160+
self.hip_est_accum,
161+
self.lg_k,
162+
self.num_coupons,
163+
kappa,
164+
)
156165
}
157166

158167
/// Returns true if the sketch is empty.
@@ -437,12 +446,6 @@ impl CpcSketch {
437446
}
438447
}
439448

440-
const SERIAL_VERSION: u8 = 1;
441-
const FLAG_COMPRESSED: u8 = 1;
442-
const FLAG_HAS_HIP: u8 = 2;
443-
const FLAG_HAS_TABLE: u8 = 3;
444-
const FLAG_HAS_WINDOW: u8 = 4;
445-
446449
impl CpcSketch {
447450
/// Serializes this CpcSketch to bytes.
448451
pub fn serialize(&self) -> Vec<u8> {
@@ -637,27 +640,6 @@ impl CpcSketch {
637640
}
638641
}
639642

640-
fn make_preamble_ints(num_coupons: u32, has_hip: bool, has_table: bool, has_window: bool) -> u8 {
641-
let mut preamble_ints = 2;
642-
if num_coupons > 0 {
643-
preamble_ints += 1; // number of coupons
644-
if has_hip {
645-
preamble_ints += 4; // HIP
646-
}
647-
if has_table {
648-
preamble_ints += 1; // table data length
649-
// number of values (if there is no window it is the same as number of coupons)
650-
if has_window {
651-
preamble_ints += 1;
652-
}
653-
}
654-
if has_window {
655-
preamble_ints += 1; // window length
656-
}
657-
}
658-
preamble_ints
659-
}
660-
661643
impl CpcSketch {
662644
/// Returns the estimated maximum compressed serialized size of a sketch.
663645
///

0 commit comments

Comments
 (0)