Skip to content

Commit 003813a

Browse files
authored
Optimize Statistics::projection (apache#13225)
1 parent 9005585 commit 003813a

File tree

1 file changed

+73
-5
lines changed

1 file changed

+73
-5
lines changed

datafusion/common/src/stats.rs

Lines changed: 73 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -268,13 +268,35 @@ impl Statistics {
268268
return self;
269269
};
270270

271-
// todo: it would be nice to avoid cloning column statistics if
272-
// possible (e.g. if the projection did not contain duplicates)
273-
self.column_statistics = projection
274-
.iter()
275-
.map(|&i| self.column_statistics[i].clone())
271+
enum Slot {
272+
/// The column is taken and put into the specified statistics location
273+
Taken(usize),
274+
/// The original column is present
275+
Present(ColumnStatistics),
276+
}
277+
278+
// Convert to Vec<Slot> so we can avoid copying the statistics
279+
let mut columns: Vec<_> = std::mem::take(&mut self.column_statistics)
280+
.into_iter()
281+
.map(Slot::Present)
276282
.collect();
277283

284+
for idx in projection {
285+
let next_idx = self.column_statistics.len();
286+
let slot = std::mem::replace(
287+
columns.get_mut(*idx).expect("projection out of bounds"),
288+
Slot::Taken(next_idx),
289+
);
290+
match slot {
291+
// The column was there, so just move it
292+
Slot::Present(col) => self.column_statistics.push(col),
293+
// The column was taken, so copy from the previous location
294+
Slot::Taken(prev_idx) => self
295+
.column_statistics
296+
.push(self.column_statistics[prev_idx].clone()),
297+
}
298+
}
299+
278300
self
279301
}
280302

@@ -581,4 +603,50 @@ mod tests {
581603
let p2 = precision.clone();
582604
assert_eq!(precision, p2);
583605
}
606+
607+
#[test]
608+
fn test_project_none() {
609+
let projection = None;
610+
let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref());
611+
assert_eq!(stats, make_stats(vec![10, 20, 30]));
612+
}
613+
614+
#[test]
615+
fn test_project_empty() {
616+
let projection = Some(vec![]);
617+
let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref());
618+
assert_eq!(stats, make_stats(vec![]));
619+
}
620+
621+
#[test]
622+
fn test_project_swap() {
623+
let projection = Some(vec![2, 1]);
624+
let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref());
625+
assert_eq!(stats, make_stats(vec![30, 20]));
626+
}
627+
628+
#[test]
629+
fn test_project_repeated() {
630+
let projection = Some(vec![1, 2, 1, 1, 0, 2]);
631+
let stats = make_stats(vec![10, 20, 30]).project(projection.as_ref());
632+
assert_eq!(stats, make_stats(vec![20, 30, 20, 20, 10, 30]));
633+
}
634+
635+
// Make a Statistics structure with the specified null counts for each column
636+
fn make_stats(counts: impl IntoIterator<Item = usize>) -> Statistics {
637+
Statistics {
638+
num_rows: Precision::Exact(42),
639+
total_byte_size: Precision::Exact(500),
640+
column_statistics: counts.into_iter().map(col_stats_i64).collect(),
641+
}
642+
}
643+
644+
fn col_stats_i64(null_count: usize) -> ColumnStatistics {
645+
ColumnStatistics {
646+
null_count: Precision::Exact(null_count),
647+
max_value: Precision::Exact(ScalarValue::Int64(Some(42))),
648+
min_value: Precision::Exact(ScalarValue::Int64(Some(64))),
649+
distinct_count: Precision::Exact(100),
650+
}
651+
}
584652
}

0 commit comments

Comments
 (0)