Skip to content

Commit 78853a8

Browse files
File pruning when set (#3363)
Add whole file pruning to the vortex-file/ffi Signed-off-by: Joe Isaacs <[email protected]> --------- Signed-off-by: Joe Isaacs <[email protected]>
1 parent 11554a4 commit 78853a8

File tree

11 files changed

+143
-108
lines changed

11 files changed

+143
-108
lines changed

duckdb-vortex/src/include/vortex_common.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,10 @@ struct FileReader {
6161
return duckdb::make_uniq<FileReader>(file);
6262
}
6363

64+
vx_array_iterator *Scan(const vx_file_scan_options *options) {
65+
return Try([&](auto err) { return vx_file_reader_scan(this->file, options, err); });
66+
}
67+
6468
uint64_t FileRowCount() {
6569
return Try([&](auto err) { return vx_file_row_count(file, err); });
6670
}

duckdb-vortex/src/include/vortex_layout_reader.hpp

Lines changed: 0 additions & 29 deletions
This file was deleted.

duckdb-vortex/src/vortex_scan.cpp

Lines changed: 21 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,6 @@
1818
#include "concurrentqueue.h"
1919

2020
#include "vortex.hpp"
21-
#include "vortex_extension.hpp"
22-
#include "vortex_layout_reader.hpp"
2321
#include "vortex_scan.hpp"
2422
#include "vortex_common.hpp"
2523
#include "vortex_expr.hpp"
@@ -42,7 +40,7 @@ struct BindData : public TableFunctionData {
4240

4341
// Used to read the schema during the bind phase and cached here to
4442
// avoid having to open the same file again during the scan phase.
45-
unique_ptr<FileReader> initial_file;
43+
shared_ptr<FileReader> initial_file;
4644

4745
// Used to create an arena for protobuf exprs, need a ptr since the bind arg is const.
4846
unique_ptr<google::protobuf::Arena> arena;
@@ -107,7 +105,7 @@ struct ScanGlobalState : public GlobalTableFunctionState {
107105
// Multi producer, multi consumer lockfree queue.
108106
duckdb_moodycamel::ConcurrentQueue<ScanPartition> scan_partitions {8192};
109107

110-
std::vector<std::shared_ptr<LayoutReader>> layout_readers;
108+
std::vector<shared_ptr<FileReader>> file_readers;
111109

112110
// The column idx that must be returned by the scan.
113111
vector<idx_t> column_ids;
@@ -252,9 +250,9 @@ static bool PinFileToThread(ScanGlobalState &global_state) {
252250
}
253251

254252
static void CreateScanPartitions(ClientContext &context, const BindData &bind, ScanGlobalState &global_state,
255-
ScanLocalState &local_state, uint64_t file_idx, unique_ptr<FileReader> &file_reader) {
253+
ScanLocalState &local_state, uint64_t file_idx, FileReader &file_reader) {
256254
const auto file_name = global_state.expanded_files[file_idx];
257-
const auto row_count = Try([&](auto err) { return vx_file_row_count(file_reader->file, err); });
255+
const auto row_count = Try([&](auto err) { return vx_file_row_count(file_reader.file, err); });
258256

259257
const auto thread_count = std::thread::hardware_concurrency();
260258
const auto file_count = global_state.expanded_files.size();
@@ -292,8 +290,7 @@ static void CreateScanPartitions(ClientContext &context, const BindData &bind, S
292290
D_ASSERT(global_state.files_partitioned <= global_state.expanded_files.size());
293291
}
294292

295-
static unique_ptr<ArrayIterator> OpenArrayIter(ScanGlobalState &global_state,
296-
std::shared_ptr<LayoutReader> &layout_reader,
293+
static unique_ptr<ArrayIterator> OpenArrayIter(ScanGlobalState &global_state, shared_ptr<FileReader> &file_reader,
297294
ScanPartition row_range_partition) {
298295
const auto options = vx_file_scan_options {
299296
.projection = global_state.projected_column_names.data(),
@@ -305,7 +302,7 @@ static unique_ptr<ArrayIterator> OpenArrayIter(ScanGlobalState &global_state,
305302
.row_range_end = row_range_partition.end_row,
306303
};
307304

308-
return make_uniq<ArrayIterator>(layout_reader->Scan(&options));
305+
return make_uniq<ArrayIterator>(file_reader->Scan(&options));
309306
}
310307

311308
// Assigns the next array from the array stream.
@@ -348,8 +345,8 @@ static bool GetNextArray(ClientContext &context, const BindData &bind_data, Scan
348345

349346
// File readers are safe to share across threads for reading. Further, they
350347
// are created before pushing partitions of the corresponding files into a queue.
351-
auto layout_reader = global_state.layout_readers[partition.file_idx];
352-
local_state.array_iterator = OpenArrayIter(global_state, layout_reader, partition);
348+
auto file_reader = global_state.file_readers[partition.file_idx];
349+
local_state.array_iterator = OpenArrayIter(global_state, file_reader, partition);
353350
}
354351

355352
local_state.currently_scanned_array = local_state.array_iterator->NextArray();
@@ -378,20 +375,25 @@ static void VortexScanFunction(ClientContext &context, TableFunctionInput &data,
378375
return;
379376
}
380377

381-
// Free layout readers as long as we pin files to threads.
378+
// Free file readers when owned by the thread.
382379
if (local_state.scan_partitions.empty() && local_state.thread_local_file_idx.has_value()) {
383-
global_state.layout_readers[local_state.thread_local_file_idx.value()] = nullptr;
380+
global_state.file_readers[local_state.thread_local_file_idx.value()] = nullptr;
384381
local_state.thread_local_file_idx.reset();
385382
}
386383

387384
// Create new scan partitions in case the queue is empty.
388385
if (auto file_idx = global_state.next_file_idx.fetch_add(1);
389386
file_idx < global_state.expanded_files.size()) {
390-
auto file_name = global_state.expanded_files[file_idx];
391-
auto vortex_file =
392-
OpenFileAndVerify(FileSystem::GetFileSystem(context), *bind_data.session, file_name, bind_data);
393-
global_state.layout_readers[file_idx] = LayoutReader::CreateFromFile(vortex_file.get());
394-
CreateScanPartitions(context, bind_data, global_state, local_state, file_idx, vortex_file);
387+
if (file_idx == 0) {
388+
global_state.file_readers[0] = bind_data.initial_file;
389+
} else {
390+
auto file_name = global_state.expanded_files[file_idx];
391+
global_state.file_readers[file_idx] =
392+
OpenFileAndVerify(FileSystem::GetFileSystem(context), *bind_data.session, file_name, bind_data);
393+
}
394+
395+
CreateScanPartitions(context, bind_data, global_state, local_state, file_idx,
396+
*global_state.file_readers[file_idx]);
395397
}
396398
}
397399
}
@@ -491,7 +493,7 @@ void RegisterScanFunction(DatabaseInstance &instance) {
491493
}
492494

493495
// Resizing the empty vector default-constructs a null shared_ptr at every index.
494-
global_state->layout_readers.resize(global_state->expanded_files.size());
496+
global_state->file_readers.resize(global_state->expanded_files.size());
495497

496498
bind.arena->Reset();
497499
return std::move(global_state);

vortex-expr/src/pruning.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -397,7 +397,7 @@ pub enum FieldOrIdentity {
397397
Identity,
398398
}
399399

400-
pub(crate) fn stat_field_name(field: &FieldName, stat: Stat) -> FieldName {
400+
pub fn stat_field_name(field: &FieldName, stat: Stat) -> FieldName {
401401
FieldName::from(stat_field_name_string(field, stat))
402402
}
403403

vortex-ffi/cinclude/vortex.h

Lines changed: 3 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -119,11 +119,6 @@ typedef struct vx_error vx_error;
119119
*/
120120
typedef struct vx_file_reader vx_file_reader;
121121

122-
/**
123-
* A Vortex layout reader.
124-
*/
125-
typedef struct vx_layout_reader vx_layout_reader;
126-
127122
/**
128123
* An object that stores registries and caches.
129124
* This should, if possible, be reused between queries in an interactive session.
@@ -404,23 +399,15 @@ struct vx_dtype *vx_file_dtype(const struct vx_file_reader *file);
404399
/**
405400
* Build a new `vx_array_iterator` that returns a series of `vx_array`s from a scan over a `vx_file_reader`.
406401
*/
407-
struct vx_array_iterator *vx_layout_reader_scan(const struct vx_layout_reader *layout_reader,
408-
const struct vx_file_scan_options *opts,
409-
struct vx_error **error);
402+
struct vx_array_iterator *vx_file_reader_scan(const struct vx_file_reader *file_reader,
403+
const struct vx_file_scan_options *opts,
404+
struct vx_error **error);
410405

411406
/**
412407
* Returns the row count for a given file reader.
413408
*/
414409
uint64_t vx_file_row_count(struct vx_file_reader *file_reader, struct vx_error **error);
415410

416-
/**
417-
* Creates a layout reader for a given file.
418-
*/
419-
struct vx_layout_reader *vx_layout_reader_create(struct vx_file_reader *file_reader,
420-
struct vx_error **error);
421-
422-
void vx_layout_reader_free(struct vx_layout_reader *layout_reader);
423-
424411
/**
425412
* Free the file and all associated resources.
426413
*

vortex-ffi/src/array.rs

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -29,15 +29,6 @@ pub struct vx_array_iterator {
2929
pub inner: Option<Box<dyn ArrayIterator>>,
3030
}
3131

32-
/// Creates a new ArrayIterator wrapper for FFI use.
33-
pub fn vx_array_iterator<I>(iter: I) -> vortex::error::VortexResult<*mut vx_array_iterator>
34-
where
35-
I: ArrayIterator + 'static,
36-
{
37-
let inner = Some(Box::new(iter) as Box<dyn ArrayIterator>);
38-
Ok(Box::into_raw(Box::new(vx_array_iterator { inner })))
39-
}
40-
4132
/// Attempt to advance the `current` pointer of the iterator.
4233
///
4334
/// A return value of `true` indicates that another element was pulled from the iterator, and a return

vortex-ffi/src/file.rs

Lines changed: 22 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
use std::ffi::{CStr, c_char, c_int, c_uint, c_ulong};
44
use std::str::FromStr;
55
use std::sync::Arc;
6-
use std::{ptr, slice};
6+
use std::{iter, ptr, slice};
77

88
use itertools::Itertools;
99
use object_store::aws::{AmazonS3Builder, AmazonS3ConfigKey};
@@ -18,7 +18,7 @@ use vortex::error::{VortexError, VortexExpect, VortexResult, vortex_bail, vortex
1818
use vortex::expr::{ExprRef, Identity, deserialize_expr, select};
1919
use vortex::file::scan::SplitBy;
2020
use vortex::file::{VortexFile, VortexOpenOptions, VortexWriteOptions};
21-
use vortex::layout::LayoutReader;
21+
use vortex::iter::ArrayIteratorAdapter;
2222
use vortex::layout::scan::ScanBuilder;
2323
use vortex::proto::expr::Expr;
2424

@@ -33,12 +33,6 @@ pub struct vx_file_reader {
3333
pub inner: VortexFile,
3434
}
3535

36-
/// A Vortex layout reader.
37-
#[allow(non_camel_case_types)]
38-
pub struct vx_layout_reader {
39-
pub inner: Arc<dyn LayoutReader>,
40-
}
41-
4236
/// Options supplied for opening a file.
4337
#[repr(C)]
4438
pub struct vx_file_open_options {
@@ -242,20 +236,33 @@ pub unsafe extern "C-unwind" fn vx_file_dtype(file: *const vx_file_reader) -> *m
242236

243237
/// Build a new `vx_array_iterator` that returns a series of `vx_array`s from a scan over a `vx_file_reader`.
244238
#[unsafe(no_mangle)]
245-
pub unsafe extern "C-unwind" fn vx_layout_reader_scan(
246-
layout_reader: *const vx_layout_reader,
239+
pub unsafe extern "C-unwind" fn vx_file_reader_scan(
240+
file_reader: *const vx_file_reader,
247241
opts: *const vx_file_scan_options,
248242
error: *mut *mut vx_error,
249243
) -> *mut vx_array_iterator {
250244
try_or(error, ptr::null_mut(), || {
251-
let layout_reader = unsafe { layout_reader.as_ref().vortex_expect("null layout reader") };
252-
let mut scan_builder = ScanBuilder::new(layout_reader.inner.clone());
245+
let file_reader = unsafe { file_reader.as_ref().vortex_expect("null file reader") };
253246

254247
let scan_options = unsafe { opts.as_ref() }.map_or_else(
255248
|| Ok(ScanOptions::default()),
256249
|options| options.process_scan_options(),
257250
)?;
258251

252+
if let Some(expr) = &scan_options.filter_expr {
253+
if file_reader.inner.can_prune(expr)? {
254+
let dtype = file_reader.inner.dtype().clone();
255+
let empty_iter = ArrayIteratorAdapter::new(dtype, iter::empty());
256+
257+
return Ok(Box::into_raw(Box::new(vx_array_iterator {
258+
inner: Some(Box::new(empty_iter)),
259+
})));
260+
}
261+
};
262+
263+
let layout_reader = file_reader.inner.layout_reader()?;
264+
let mut scan_builder = ScanBuilder::new(layout_reader.clone());
265+
259266
// Apply options if provided.
260267
if let Some(field_names) = scan_options.field_names {
261268
// Field names are allowed to be `Some` and empty.
@@ -274,7 +281,9 @@ pub unsafe extern "C-unwind" fn vx_layout_reader_scan(
274281
scan_builder = scan_builder.with_split_by(split_by_value);
275282
}
276283

277-
vx_array_iterator(scan_builder.into_array_iter()?)
284+
Ok(Box::into_raw(Box::new(vx_array_iterator {
285+
inner: Some(Box::new(scan_builder.into_array_iter()?)),
286+
})))
278287
})
279288
}
280289

@@ -290,27 +299,6 @@ pub extern "C-unwind" fn vx_file_row_count(
290299
})
291300
}
292301

293-
/// Creates a layout reader for a given file.
294-
#[unsafe(no_mangle)]
295-
pub extern "C-unwind" fn vx_layout_reader_create(
296-
file_reader: *mut vx_file_reader,
297-
error: *mut *mut vx_error,
298-
) -> *mut vx_layout_reader {
299-
try_or(error, ptr::null_mut(), || {
300-
let file_reader = unsafe { file_reader.as_ref().vortex_expect("null file_reader") };
301-
let inner = file_reader.inner.layout_reader()?;
302-
303-
Ok(Box::into_raw(Box::new(vx_layout_reader { inner })))
304-
})
305-
}
306-
307-
#[unsafe(no_mangle)]
308-
pub extern "C-unwind" fn vx_layout_reader_free(layout_reader: *mut vx_layout_reader) {
309-
if !layout_reader.is_null() {
310-
drop(unsafe { Box::from_raw(layout_reader) });
311-
}
312-
}
313-
314302
/// Free the file and all associated resources.
315303
///
316304
/// This function will not automatically free any :c:func:`vx_array_iterator` that were built from

vortex-file/src/file.rs

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,15 @@ use vortex_array::ArrayRef;
88
use vortex_array::stats::StatsSet;
99
use vortex_dtype::DType;
1010
use vortex_error::VortexResult;
11+
use vortex_expr::ExprRef;
12+
use vortex_expr::pruning::PruningPredicate;
1113
use vortex_layout::LayoutReader;
1214
use vortex_layout::scan::ScanBuilder;
1315
use vortex_layout::segments::SegmentSource;
1416
use vortex_metrics::VortexMetrics;
1517

1618
use crate::footer::Footer;
19+
use crate::pruning::extract_relevant_stat_as_struct_row;
1720

1821
/// Represents a Vortex file, providing access to its metadata and content.
1922
///
@@ -80,6 +83,32 @@ impl VortexFile {
8083
pub fn scan(&self) -> VortexResult<ScanBuilder<ArrayRef>> {
8184
Ok(ScanBuilder::new(self.layout_reader()?).with_metrics(self.metrics.clone()))
8285
}
86+
87+
/// Returns `true` if the file-level statistics prove that `filter` will never match any rows in the file.
88+
pub fn can_prune(&self, filter: &ExprRef) -> VortexResult<bool> {
89+
let Some((file_stats, struct_dtype)) = self
90+
.footer
91+
.statistics()
92+
.zip(self.footer.dtype().as_struct())
93+
else {
94+
return Ok(false);
95+
};
96+
97+
let Some(predicate) = PruningPredicate::try_new(filter) else {
98+
return Ok(false);
99+
};
100+
101+
let Some(struct_row) =
102+
extract_relevant_stat_as_struct_row(&predicate, file_stats, struct_dtype)?
103+
else {
104+
return Ok(false);
105+
};
106+
107+
Ok(predicate
108+
.evaluate(&struct_row)?
109+
.and_then(|p| p.as_constant())
110+
.is_some_and(|result| result.as_bool().value() == Some(true)))
111+
}
83112
}
84113

85114
/// A factory for creating segment sources that read data from a Vortex file.

vortex-file/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ mod footer;
9696
mod generic;
9797
mod memory;
9898
mod open;
99+
mod pruning;
99100
pub mod segments;
100101
mod strategy;
101102
#[cfg(test)]

0 commit comments

Comments
 (0)