Skip to content

Commit 1388625

Browse files
authored
fix: keep column statistics of all NULL column (#16753)
* fix: keep column statistics of all NULL column
* add logic test
* fix typos reported by typos-cli
1 parent 7f54d48 commit 1388625

File tree

6 files changed

+74
-11
lines changed

6 files changed

+74
-11
lines changed

src/common/arrow/src/arrow/compute/merge_sort/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -491,7 +491,7 @@ pub fn build_comparator<'a>(
491491
}
492492

493493
/// returns a comparison function between any two arrays of each pair of arrays, according to `SortOptions`.
494-
/// Implementing custom `build_compare_fn` for unsupportd data types.
494+
/// Implementing custom `build_compare_fn` for unsupported data types.
495495
pub fn build_comparator_impl<'a>(
496496
pairs: &'a [(&'a [&'a dyn Array], &SortOptions)],
497497
build_compare_fn: &dyn Fn(&dyn Array, &dyn Array) -> Result<DynComparator>,

src/common/arrow/src/arrow/compute/sort/lex_sort.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ pub fn lexsort_to_indices<I: Index>(
175175

176176
/// Sorts a list of [`SortColumn`] into a non-nullable [`PrimitiveArray`]
177177
/// representing the indices that would sort the columns.
178-
/// Implementing custom `build_compare_fn` for unsupportd data types.
178+
/// Implementing custom `build_compare_fn` for unsupported data types.
179179
pub fn lexsort_to_indices_impl<I: Index>(
180180
columns: &[SortColumn],
181181
limit: Option<usize>,

src/common/io/src/cursor_ext/cursor_read_bytes_ext.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,8 +112,8 @@ where T: AsRef<[u8]>
112112
if available.is_empty() {
113113
return 0;
114114
}
115-
for (index, byt) in available.iter().enumerate() {
116-
if !f(*byt) {
115+
for (index, bytes) in available.iter().enumerate() {
116+
if !f(*bytes) {
117117
self.consume(index);
118118
return index;
119119
}

src/query/script/src/ir.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ pub enum ScriptIR {
8484
condition: VarRef,
8585
to_label: LabelRef,
8686
},
87-
/// Uncoditionally jumps to a specified label.
87+
/// Unconditionally jumps to a specified label.
8888
Goto { to_label: LabelRef },
8989
/// Returns from the script.
9090
Return,

src/query/storages/common/table_meta/src/meta/v2/statistics.rs

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -393,14 +393,24 @@ impl<'de> serde::de::Visitor<'de> for ColStatsVisitor {
393393

394394
while let Some(key) = access.next_key::<ColumnId>()? {
395395
if let Ok(value) = access.next_value::<ColumnStatistics>() {
396-
let data_type = value.max.as_ref().infer_data_type();
397-
if supported_stat_type(&data_type) {
396+
if value.max.is_null() && value.min.is_null() {
397+
// If the min and max scalar values are both NULL, the column's statistics should be retained.
398+
//
399+
// This ensures that columns with only NULL values have their column statistics
400+
// recorded, which is essential for pruning on these columns, and without this,
401+
// column statistics like NDV (Number of Distinct Values) and null_count
402+
// would be missing as well.
398403
map.insert(key, value);
399404
} else {
400-
info!(
401-
"column of id {} is excluded from column statistics, unsupported data type {}",
402-
key, data_type
403-
);
405+
let data_type = value.max.as_ref().infer_data_type();
406+
if supported_stat_type(&data_type) {
407+
map.insert(key, value);
408+
} else {
409+
info!(
410+
"column of id {} is excluded from column statistics, unsupported data type {}",
411+
key, data_type
412+
);
413+
}
404414
}
405415
}
406416
}
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
statement ok
2+
create or replace database col_stats_all_null;
3+
4+
statement ok
5+
use col_stats_all_null;
6+
7+
8+
statement ok
9+
create or replace table t(c int) STORAGE_FORMAT=parquet;
10+
11+
statement ok
12+
insert into t values(NULL);
13+
14+
# segments should be pruned
15+
query T
16+
explain select * from t where c > 6;
17+
----
18+
Filter
19+
├── output columns: [t.c (#0)]
20+
├── filters: [is_true(t.c (#0) > 6)]
21+
├── estimated rows: 0.00
22+
└── TableScan
23+
├── table: default.col_stats_all_null.t
24+
├── output columns: [c (#0)]
25+
├── read rows: 0
26+
├── read size: 0
27+
├── partitions total: 1
28+
├── partitions scanned: 0
29+
├── pruning stats: [segments: <range pruning: 1 to 0>]
30+
├── push downs: [filters: [is_true(t.c (#0) > 6)], limit: NONE]
31+
└── estimated rows: 1.00
32+
33+
34+
statement ok
35+
create or replace table t(c int) STORAGE_FORMAT=native;
36+
37+
statement ok
38+
insert into t values(NULL);
39+
40+
# segments should be pruned
41+
query T
42+
explain select * from t where c > 6;
43+
----
44+
TableScan
45+
├── table: default.col_stats_all_null.t
46+
├── output columns: [c (#0)]
47+
├── read rows: 0
48+
├── read size: 0
49+
├── partitions total: 1
50+
├── partitions scanned: 0
51+
├── pruning stats: [segments: <range pruning: 1 to 0>]
52+
├── push downs: [filters: [is_true(t.c (#0) > 6)], limit: NONE]
53+
└── estimated rows: 0.00

0 commit comments

Comments
 (0)