|
| 1 | +//! Utilities for interacting with Kernel APIs using Arrow data structures. |
| 2 | +//! |
| 3 | +use delta_kernel::arrow::array::BooleanArray; |
| 4 | +use delta_kernel::arrow::compute::filter_record_batch; |
| 5 | +use delta_kernel::arrow::record_batch::RecordBatch; |
| 6 | +use delta_kernel::engine::arrow_data::ArrowEngineData; |
| 7 | +use delta_kernel::scan::{Scan, ScanMetadata}; |
| 8 | +use delta_kernel::{ |
| 9 | + DeltaResult, Engine, EngineData, ExpressionEvaluator, ExpressionRef, PredicateRef, Version, |
| 10 | +}; |
| 11 | +use itertools::Itertools; |
| 12 | + |
| 13 | +/// [`ScanMetadata`] contains (1) a [`RecordBatch`] specifying data files to be scanned |
| 14 | +/// and (2) a vector of transforms (one transform per scan file) that must be applied to the data read |
| 15 | +/// from those files. |
| 16 | +pub(crate) struct ScanMetadataArrow { |
| 17 | + /// Record batch with one row per file to scan |
| 18 | + pub scan_files: RecordBatch, |
| 19 | + |
| 20 | + /// Row-level transformations to apply to data read from files. |
| 21 | + /// |
| 22 | + /// Each entry in this vector corresponds to a row in the `scan_files` data. The entry is an |
| 23 | + /// expression that must be applied to convert the file's data into the logical schema |
| 24 | + /// expected by the scan: |
| 25 | + /// |
| 26 | + /// - `Some(expr)`: Apply this expression to transform the data to match [`Scan::schema()`]. |
| 27 | + /// - `None`: No transformation is needed; the data is already in the correct logical form. |
| 28 | + /// |
| 29 | + /// Note: This vector can be indexed by row number. |
| 30 | + pub scan_file_transforms: Vec<Option<ExpressionRef>>, |
| 31 | +} |
| 32 | + |
| 33 | +pub(crate) trait ScanExt { |
| 34 | + /// Get the metadata for a table scan. |
| 35 | + /// |
| 36 | + /// This method handles translation between `EngineData` and `RecordBatch` |
| 37 | + /// and will already apply any selection vectors to the data. |
| 38 | + /// See [`Scan::scan_metadata`] for details. |
| 39 | + fn scan_metadata_arrow( |
| 40 | + &self, |
| 41 | + engine: &dyn Engine, |
| 42 | + ) -> DeltaResult<impl Iterator<Item = DeltaResult<ScanMetadataArrow>>>; |
| 43 | + |
| 44 | + fn scan_metadata_from_arrow( |
| 45 | + &self, |
| 46 | + engine: &dyn Engine, |
| 47 | + existing_version: Version, |
| 48 | + existing_data: Box<dyn Iterator<Item = RecordBatch>>, |
| 49 | + existing_predicate: Option<PredicateRef>, |
| 50 | + ) -> DeltaResult<impl Iterator<Item = DeltaResult<ScanMetadataArrow>>>; |
| 51 | +} |
| 52 | + |
| 53 | +impl ScanExt for Scan { |
| 54 | + fn scan_metadata_arrow( |
| 55 | + &self, |
| 56 | + engine: &dyn Engine, |
| 57 | + ) -> DeltaResult<impl Iterator<Item = DeltaResult<ScanMetadataArrow>>> { |
| 58 | + Ok(self |
| 59 | + .scan_metadata(engine)? |
| 60 | + .map_ok(kernel_to_arrow) |
| 61 | + .flatten()) |
| 62 | + } |
| 63 | + |
| 64 | + fn scan_metadata_from_arrow( |
| 65 | + &self, |
| 66 | + engine: &dyn Engine, |
| 67 | + existing_version: Version, |
| 68 | + existing_data: Box<dyn Iterator<Item = RecordBatch>>, |
| 69 | + existing_predicate: Option<PredicateRef>, |
| 70 | + ) -> DeltaResult<impl Iterator<Item = DeltaResult<ScanMetadataArrow>>> { |
| 71 | + let engine_iter = |
| 72 | + existing_data.map(|batch| Box::new(ArrowEngineData::new(batch)) as Box<dyn EngineData>); |
| 73 | + Ok(self |
| 74 | + .scan_metadata_from(engine, existing_version, engine_iter, existing_predicate)? |
| 75 | + .map_ok(kernel_to_arrow) |
| 76 | + .flatten()) |
| 77 | + } |
| 78 | +} |
| 79 | + |
| 80 | +fn kernel_to_arrow(metadata: ScanMetadata) -> DeltaResult<ScanMetadataArrow> { |
| 81 | + let scan_file_transforms = metadata |
| 82 | + .scan_file_transforms |
| 83 | + .into_iter() |
| 84 | + .enumerate() |
| 85 | + .filter_map(|(i, v)| metadata.scan_files.selection_vector[i].then_some(v)) |
| 86 | + .collect(); |
| 87 | + let batch = ArrowEngineData::try_from_engine_data(metadata.scan_files.data)?.into(); |
| 88 | + let scan_files = filter_record_batch( |
| 89 | + &batch, |
| 90 | + &BooleanArray::from(metadata.scan_files.selection_vector), |
| 91 | + )?; |
| 92 | + Ok(ScanMetadataArrow { |
| 93 | + scan_files, |
| 94 | + scan_file_transforms, |
| 95 | + }) |
| 96 | +} |
| 97 | + |
| 98 | +pub(crate) trait ExpressionEvaluatorExt { |
| 99 | + fn evaluate_arrow(&self, batch: RecordBatch) -> DeltaResult<RecordBatch>; |
| 100 | +} |
| 101 | + |
| 102 | +impl<T: ExpressionEvaluator + ?Sized> ExpressionEvaluatorExt for T { |
| 103 | + fn evaluate_arrow(&self, batch: RecordBatch) -> DeltaResult<RecordBatch> { |
| 104 | + let engine_data = ArrowEngineData::new(batch); |
| 105 | + Ok(ArrowEngineData::try_from_engine_data(T::evaluate(self, &engine_data)?)?.into()) |
| 106 | + } |
| 107 | +} |
| 108 | + |
| 109 | +#[cfg(test)] |
| 110 | +mod tests { |
| 111 | + use std::sync::Arc; |
| 112 | + |
| 113 | + use super::ExpressionEvaluatorExt as _; |
| 114 | + |
| 115 | + use delta_kernel::arrow::array::Int32Array; |
| 116 | + use delta_kernel::arrow::datatypes::{DataType, Field, Schema}; |
| 117 | + use delta_kernel::arrow::record_batch::RecordBatch; |
| 118 | + use delta_kernel::engine::arrow_conversion::TryIntoKernel; |
| 119 | + use delta_kernel::engine::arrow_expression::ArrowEvaluationHandler; |
| 120 | + use delta_kernel::expressions::*; |
| 121 | + use delta_kernel::EvaluationHandler; |
| 122 | + |
| 123 | + #[test] |
| 124 | + fn test_evaluate_arrow() { |
| 125 | + let handler = ArrowEvaluationHandler; |
| 126 | + |
| 127 | + let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); |
| 128 | + let values = Int32Array::from(vec![1, 2, 3]); |
| 129 | + let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(values)]).unwrap(); |
| 130 | + |
| 131 | + let expression = column_expr!("a"); |
| 132 | + let expr = handler.new_expression_evaluator( |
| 133 | + Arc::new((&schema).try_into_kernel().unwrap()), |
| 134 | + expression, |
| 135 | + delta_kernel::schema::DataType::INTEGER, |
| 136 | + ); |
| 137 | + |
| 138 | + let result = expr.evaluate_arrow(batch); |
| 139 | + assert!(result.is_ok()); |
| 140 | + } |
| 141 | +} |
0 commit comments