Commit 19af083

Merge pull request #19 from massive-com/row_group_limit_pruning
Row group limit pruning
2 parents: 2809ef2 + e6dee7d

File tree: 9 files changed, +881 -18 lines


datafusion/core/tests/parquet/mod.rs

Lines changed: 47 additions & 5 deletions
```diff
@@ -150,6 +150,11 @@ impl TestOutput {
         self.metric_value("row_groups_matched_statistics")
     }
 
+    /// The number of row_groups fully matched by statistics
+    fn row_groups_fully_matched_statistics(&self) -> Option<usize> {
+        self.metric_value("row_groups_fully_matched_statistics")
+    }
+
     /// The number of row_groups pruned by statistics
     fn row_groups_pruned_statistics(&self) -> Option<usize> {
         self.metric_value("row_groups_pruned_statistics")
```
```diff
@@ -178,6 +183,11 @@ impl TestOutput {
         self.metric_value("page_index_rows_pruned")
     }
 
+    /// The number of row groups pruned by limit pruning
+    fn limit_pruned_row_groups(&self) -> Option<usize> {
+        self.metric_value("limit_pruned_row_groups")
+    }
+
     fn description(&self) -> String {
         format!(
             "Input:\n{}\nQuery:\n{}\nOutput:\n{}\nMetrics:\n{}",
```
```diff
@@ -191,20 +201,41 @@ impl TestOutput {
 /// and the appropriate scenario
 impl ContextWithParquet {
     async fn new(scenario: Scenario, unit: Unit) -> Self {
-        Self::with_config(scenario, unit, SessionConfig::new()).await
+        Self::with_config(scenario, unit, SessionConfig::new(), None, None).await
+    }
+
+    /// Set custom schema and batches for the test
+    pub async fn with_custom_data(
+        scenario: Scenario,
+        unit: Unit,
+        schema: Arc<Schema>,
+        batches: Vec<RecordBatch>,
+    ) -> Self {
+        Self::with_config(
+            scenario,
+            unit,
+            SessionConfig::new(),
+            Some(schema),
+            Some(batches),
+        )
+        .await
     }
 
     async fn with_config(
         scenario: Scenario,
         unit: Unit,
         mut config: SessionConfig,
+        custom_schema: Option<Arc<Schema>>,
+        custom_batches: Option<Vec<RecordBatch>>,
     ) -> Self {
         // Use a single partition for deterministic results no matter how many CPUs the host has
         config = config.with_target_partitions(1);
         let file = match unit {
             Unit::RowGroup(row_per_group) => {
                 config = config.with_parquet_bloom_filter_pruning(true);
-                make_test_file_rg(scenario, row_per_group).await
+                config.options_mut().execution.parquet.pushdown_filters = true;
+                make_test_file_rg(scenario, row_per_group, custom_schema, custom_batches)
+                    .await
             }
             Unit::Page(row_per_page) => {
                 config = config.with_parquet_page_index_pruning(true);
```
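With the two `Option` parameters threaded through `with_config`, tests can now drive the harness with hand-built data instead of a canned scenario. A sketch of a possible call site (the `Int64` column, its values, and the chosen `Scenario` variant are illustrative assumptions, not part of this commit):

```rust
use std::sync::Arc;

use arrow::array::Int64Array;
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;

// Build a one-column batch by hand instead of relying on create_data_batch().
let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int64, false)]));
let batch = RecordBatch::try_new(
    schema.clone(),
    vec![Arc::new(Int64Array::from(vec![1, 2, 3, 4, 5]))],
)
.unwrap();

// Unit::RowGroup(2) asks for two-row row groups, so five rows should land
// in three row groups (2 + 2 + 1), giving the scan something to prune.
let mut context = ContextWithParquet::with_custom_data(
    Scenario::Int, // any existing scenario variant; the data comes from `batch`
    Unit::RowGroup(2),
    schema,
    vec![batch],
)
.await;
```

Note that `with_custom_data` still takes a `Scenario` even though the data comes from the caller; as the final hunk below shows, `make_test_file_rg` simply ignores the scenario when custom batches are supplied.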
```diff
@@ -1030,7 +1061,12 @@ fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> {
 }
 
 /// Create a test parquet file with various data types
-async fn make_test_file_rg(scenario: Scenario, row_per_group: usize) -> NamedTempFile {
+async fn make_test_file_rg(
+    scenario: Scenario,
+    row_per_group: usize,
+    custom_schema: Option<Arc<Schema>>,
+    custom_batches: Option<Vec<RecordBatch>>,
+) -> NamedTempFile {
     let mut output_file = tempfile::Builder::new()
         .prefix("parquet_pruning")
         .suffix(".parquet")
```
```diff
@@ -1043,8 +1079,14 @@ async fn make_test_file_rg(scenario: Scenario, row_per_group: usize) -> NamedTempFile {
         .set_statistics_enabled(EnabledStatistics::Page)
         .build();
 
-    let batches = create_data_batch(scenario);
-    let schema = batches[0].schema();
+    let (batches, schema) =
+        if let (Some(schema), Some(batches)) = (custom_schema, custom_batches) {
+            (batches, schema)
+        } else {
+            let batches = create_data_batch(scenario);
+            let schema = batches[0].schema();
+            (batches, schema)
+        };
 
     let mut writer = ArrowWriter::try_new(&mut output_file, schema, Some(props)).unwrap();
 
```
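The `else` branch preserves the original behavior, so existing scenario-based tests are untouched. For context on why `row_per_group` matters to limit pruning, here is a minimal, self-contained sketch of the same write path using the standard arrow/parquet APIs this file already depends on (the file path, column, and values are made up for illustration):

```rust
use std::sync::Arc;

use arrow::array::Int64Array;
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;
use parquet::arrow::ArrowWriter;
use parquet::file::properties::WriterProperties;

fn write_example_file() -> Result<(), Box<dyn std::error::Error>> {
    // Five rows capped at two rows per row group yield three row groups
    // (2 + 2 + 1), so a `LIMIT 2` scan can stop after the first group and
    // count the remaining two in `limit_pruned_row_groups`.
    let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int64, false)]));
    let batch = RecordBatch::try_new(
        schema.clone(),
        vec![Arc::new(Int64Array::from(vec![1, 2, 3, 4, 5]))],
    )?;

    let props = WriterProperties::builder()
        .set_max_row_group_size(2) // presumably the role row_per_group plays in the harness
        .build();

    let file = std::fs::File::create("/tmp/limit_pruning_example.parquet")?;
    let mut writer = ArrowWriter::try_new(file, schema, Some(props))?;
    writer.write(&batch)?;
    writer.close()?;
    Ok(())
}
```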
