fix: keep column statistics of all NULL column (#16753)

dantengsky · web-flow · commit 138862533014 · 2024-11-02T11:32:31.000Z
* fix: keep column statistics of all NULL column

* add logic test

* fix typos reported by typos-cli
diff --git a/src/common/arrow/src/arrow/compute/merge_sort/mod.rs b/src/common/arrow/src/arrow/compute/merge_sort/mod.rs
@@ -491,7 +491,7 @@ pub fn build_comparator<'a>(
 }
 
 /// returns a comparison function between any two arrays of each pair of arrays, according to `SortOptions`.
-/// Implementing custom `build_compare_fn` for unsupportd data types.
+/// Implementing custom `build_compare_fn` for unsupported data types.
 pub fn build_comparator_impl<'a>(
     pairs: &'a [(&'a [&'a dyn Array], &SortOptions)],
     build_compare_fn: &dyn Fn(&dyn Array, &dyn Array) -> Result<DynComparator>,
diff --git a/src/common/arrow/src/arrow/compute/sort/lex_sort.rs b/src/common/arrow/src/arrow/compute/sort/lex_sort.rs
@@ -175,7 +175,7 @@ pub fn lexsort_to_indices<I: Index>(
 
 /// Sorts a list of [`SortColumn`] into a non-nullable [`PrimitiveArray`]
 /// representing the indices that would sort the columns.
-/// Implementing custom `build_compare_fn` for unsupportd data types.
+/// Implementing custom `build_compare_fn` for unsupported data types.
 pub fn lexsort_to_indices_impl<I: Index>(
     columns: &[SortColumn],
     limit: Option<usize>,
diff --git a/src/common/io/src/cursor_ext/cursor_read_bytes_ext.rs b/src/common/io/src/cursor_ext/cursor_read_bytes_ext.rs
@@ -112,8 +112,8 @@ where T: AsRef<[u8]>
         if available.is_empty() {
             return 0;
         }
-        for (index, byt) in available.iter().enumerate() {
-            if !f(*byt) {
+        for (index, bytes) in available.iter().enumerate() {
+            if !f(*bytes) {
                 self.consume(index);
                 return index;
             }
diff --git a/src/query/script/src/ir.rs b/src/query/script/src/ir.rs
@@ -84,7 +84,7 @@ pub enum ScriptIR {
         condition: VarRef,
         to_label: LabelRef,
     },
-    /// Uncoditionally jumps to a specified label.
+    /// Unconditionally jumps to a specified label.
     Goto { to_label: LabelRef },
     /// Returns from the script.
     Return,
diff --git a/src/query/storages/common/table_meta/src/meta/v2/statistics.rs b/src/query/storages/common/table_meta/src/meta/v2/statistics.rs
@@ -393,14 +393,24 @@ impl<'de> serde::de::Visitor<'de> for ColStatsVisitor {
 
         while let Some(key) = access.next_key::<ColumnId>()? {
             if let Ok(value) = access.next_value::<ColumnStatistics>() {
-                let data_type = value.max.as_ref().infer_data_type();
-                if supported_stat_type(&data_type) {
+                if value.max.is_null() && value.min.is_null() {
+                    // If the min and max scalar values are both NULL, keep the statistics entry.
+                    //
+                    // This ensures that columns containing only NULL values still have their
+                    // column statistics recorded, which is essential for pruning on these
+                    // columns. Without this, statistics such as NDV (Number of Distinct Values)
+                    // and null_count would be missing as well.
                     map.insert(key, value);
                 } else {
-                    info!(
-                        "column of id {} is excluded from column statistics, unsupported data type {}",
-                        key, data_type
-                    );
+                    let data_type = value.max.as_ref().infer_data_type();
+                    if supported_stat_type(&data_type) {
+                        map.insert(key, value);
+                    } else {
+                        info!(
+                            "column of id {} is excluded from column statistics, unsupported data type {}",
+                            key, data_type
+                        );
+                    }
                 }
             }
         }
diff --git a/tests/sqllogictests/suites/no_table_meta_cache/col_stats_of_all_null.test b/tests/sqllogictests/suites/no_table_meta_cache/col_stats_of_all_null.test
@@ -0,0 +1,53 @@
+statement ok
+create or replace database col_stats_all_null;
+
+statement ok
+use col_stats_all_null;
+
+
+statement ok
+create or replace table t(c int) STORAGE_FORMAT=parquet;
+
+statement ok
+insert into t values(NULL);
+
+# the table holds a single all-NULL row, so its only segment should be range-pruned for `c > 6`
+query T
+explain select * from t where c > 6;
+----
+Filter
+├── output columns: [t.c (#0)]
+├── filters: [is_true(t.c (#0) > 6)]
+├── estimated rows: 0.00
+└── TableScan
+    ├── table: default.col_stats_all_null.t
+    ├── output columns: [c (#0)]
+    ├── read rows: 0
+    ├── read size: 0
+    ├── partitions total: 1
+    ├── partitions scanned: 0
+    ├── pruning stats: [segments: <range pruning: 1 to 0>]
+    ├── push downs: [filters: [is_true(t.c (#0) > 6)], limit: NONE]
+    └── estimated rows: 1.00
+
+
+statement ok
+create or replace table t(c int) STORAGE_FORMAT=native;
+
+statement ok
+insert into t values(NULL);
+
+# the table holds a single all-NULL row, so its only segment should be range-pruned for `c > 6`
+query T
+explain select * from t where c > 6;
+----
+TableScan
+├── table: default.col_stats_all_null.t
+├── output columns: [c (#0)]
+├── read rows: 0
+├── read size: 0
+├── partitions total: 1
+├── partitions scanned: 0
+├── pruning stats: [segments: <range pruning: 1 to 0>]
+├── push downs: [filters: [is_true(t.c (#0) > 6)], limit: NONE]
+└── estimated rows: 0.00

Original file line number	Diff line number	Diff line change
`@@ -491,7 +491,7 @@ pub fn build_comparator<'a>(`
`491`	`491`	`}`
`492`	`492`
`493`	`493`	/// returns a comparison function between any two arrays of each pair of arrays, according to `SortOptions`.
`494`		-/// Implementing custom `build_compare_fn` for unsupportd data types.
	`494`	+/// Implementing custom `build_compare_fn` for unsupported data types.
`495`	`495`	`pub fn build_comparator_impl<'a>(`
`496`	`496`	`pairs: &'a [(&'a [&'a dyn Array], &SortOptions)],`
`497`	`497`	`build_compare_fn: &dyn Fn(&dyn Array, &dyn Array) -> Result<DynComparator>,`
Original file line number	Diff line number	Diff line change
`@@ -112,8 +112,8 @@ where T: AsRef<[u8]>`
`112`	`112`	`if available.is_empty() {`
`113`	`113`	`return 0;`
`114`	`114`	`}`
`115`		`- for (index, byt) in available.iter().enumerate() {`
`116`		`- if !f(*byt) {`
	`115`	`+ for (index, bytes) in available.iter().enumerate() {`
	`116`	`+ if !f(*bytes) {`
`117`	`117`	`self.consume(index);`
`118`	`118`	`return index;`
`119`	`119`	`}`