Skip to content

Commit b2fd93a

Browse files
authored
fix: collect stats fail with string starts with large unicodes. (#18523)
1 parent 80deeed commit b2fd93a

File tree

4 files changed

+58
-7
lines changed

4 files changed

+58
-7
lines changed

src/query/service/tests/it/storages/fuse/statistics.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ use databend_common_sql::ApproxDistinctColumns;
4343
use databend_common_storages_fuse::io::build_column_hlls;
4444
use databend_common_storages_fuse::statistics::reducers::reduce_block_metas;
4545
use databend_common_storages_fuse::statistics::Trim;
46-
use databend_common_storages_fuse::statistics::STATS_REPLACEMENT_CHAR;
46+
use databend_common_storages_fuse::statistics::END_OF_UNICODE_RANGE;
4747
use databend_common_storages_fuse::statistics::STATS_STRING_PREFIX_LEN;
4848
use databend_common_storages_fuse::FuseStorageFormat;
4949
use databend_query::storages::fuse::io::TableMetaLocationGenerator;
@@ -593,7 +593,7 @@ fn is_degenerated_case(value: &str) -> bool {
593593
let prefixed_with_irreplaceable_chars = value
594594
.chars()
595595
.take(STATS_STRING_PREFIX_LEN)
596-
.all(|c| c >= STATS_REPLACEMENT_CHAR);
596+
.all(|c| c >= END_OF_UNICODE_RANGE);
597597

598598
larger_than_prefix_len && prefixed_with_irreplaceable_chars
599599
}

src/query/storages/fuse/src/io/write/stream/column_statistics_builder.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -340,7 +340,9 @@ where
340340
if let Some(v) = T::upcast_scalar_with_type(v, &self.data_type).trim_max() {
341341
v
342342
} else {
343-
return Err(ErrorCode::Internal("Unable to trim string"));
343+
return Err(ErrorCode::Internal(
344+
"Unable to trim string: first 16 chars are all replacement_point".to_string(),
345+
));
344346
}
345347
} else {
346348
Scalar::Null

src/query/storages/fuse/src/statistics/column_statistic.rs

Lines changed: 52 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ pub trait Trim: Sized {
168168
fn may_be_trimmed(&self) -> bool;
169169
}
170170

171-
pub const STATS_REPLACEMENT_CHAR: char = '\u{FFFD}';
171+
pub const END_OF_UNICODE_RANGE: char = '\u{10FFFF}';
172172
pub const STATS_STRING_PREFIX_LEN: usize = 16;
173173

174174
impl Trim for Scalar {
@@ -220,7 +220,7 @@ impl Trim for Scalar {
220220
// in reversed order, break at the first one we met
221221
let mut idx = None;
222222
for (i, c) in sliced.char_indices().rev() {
223-
if c < STATS_REPLACEMENT_CHAR {
223+
if c < END_OF_UNICODE_RANGE {
224224
idx = Some(i);
225225
break;
226226
}
@@ -235,7 +235,7 @@ impl Trim for Scalar {
235235
if i < replacement_point {
236236
r.push(c)
237237
} else {
238-
r.push(STATS_REPLACEMENT_CHAR);
238+
r.push(END_OF_UNICODE_RANGE);
239239
}
240240
}
241241

@@ -253,3 +253,52 @@ impl Trim for Scalar {
253253
}
254254
}
255255
}
256+
257+
#[cfg(test)]
258+
mod tests {
259+
use databend_common_expression::Scalar;
260+
261+
use crate::statistics::Trim;
262+
use crate::statistics::END_OF_UNICODE_RANGE;
263+
use crate::statistics::STATS_STRING_PREFIX_LEN;
264+
265+
#[test]
266+
fn test_trim_max() {
267+
{
268+
let invalid = END_OF_UNICODE_RANGE
269+
.to_string()
270+
.repeat(STATS_STRING_PREFIX_LEN + 1);
271+
assert_eq!(Scalar::String(invalid).trim_max(), None);
272+
}
273+
274+
{
275+
let s = END_OF_UNICODE_RANGE
276+
.to_string()
277+
.repeat(STATS_STRING_PREFIX_LEN);
278+
let scalar = Scalar::String(s.clone());
279+
assert_eq!(scalar.clone().trim_max(), Some(scalar));
280+
}
281+
282+
{
283+
let s = '👍'.to_string().repeat(STATS_STRING_PREFIX_LEN + 1);
284+
let res = Scalar::String(s.clone()).trim_max();
285+
let mut exp = '👍'.to_string().repeat(STATS_STRING_PREFIX_LEN - 1);
286+
exp.push(END_OF_UNICODE_RANGE);
287+
assert_eq!(res, Some(Scalar::String(exp)));
288+
}
289+
290+
{
291+
let mut s = '👍'.to_string().repeat(STATS_STRING_PREFIX_LEN - 1);
292+
s.push(END_OF_UNICODE_RANGE);
293+
s.push(END_OF_UNICODE_RANGE);
294+
295+
let res = Scalar::String(s.clone()).trim_max();
296+
297+
let mut exp = '👍'.to_string().repeat(STATS_STRING_PREFIX_LEN - 2);
298+
exp.push(END_OF_UNICODE_RANGE);
299+
exp.push(END_OF_UNICODE_RANGE);
300+
301+
assert_eq!(res, Some(Scalar::String(exp)));
302+
}
303+
}
304+
}

src/query/storages/fuse/src/statistics/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ pub use column_statistic::calc_column_distinct_of_values;
2727
pub use column_statistic::gen_columns_statistics;
2828
pub use column_statistic::scalar_min_max;
2929
pub use column_statistic::Trim;
30-
pub use column_statistic::STATS_REPLACEMENT_CHAR;
30+
pub use column_statistic::END_OF_UNICODE_RANGE;
3131
pub use column_statistic::STATS_STRING_PREFIX_LEN;
3232
pub use reducers::merge_statistics;
3333
pub use reducers::reduce_block_metas;

0 commit comments

Comments
 (0)