diff --git a/src/lib.rs b/src/lib.rs index 238bbd7..fd6dd8e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -17,7 +17,37 @@ #![deny(missing_docs)] -//! `datafusion-materialized-views` implements algorithms and functionality for materialized views in DataFusion. +//! # datafusion-materialized-views +//! +//! `datafusion-materialized-views` provides robust algorithms and core functionality for working with materialized views in [DataFusion](https://datafusion.apache.org/). +//! +//! ## Key Features +//! +//! - **Incremental View Maintenance**: Efficiently tracks dependencies between Hive-partitioned tables and their materialized views, allowing users to determine which partitions need to be refreshed when source data changes. This is achieved via UDTFs such as `mv_dependencies` and `stale_files`. +//! - **Query Rewriting**: Implements a view matching optimizer that rewrites queries to automatically leverage materialized views when beneficial, based on the techniques described in the paper [Optimizing Queries Using Materialized Views: A Practical, Scalable Solution](https://dsg.uwaterloo.ca/seminars/notes/larson-paper.pdf). +//! - **Pluggable Metadata Sources**: Supports custom metadata sources for incremental view maintenance, with default support for object store metadata via the `FileMetadata` and `RowMetadataRegistry` components. +//! - **Extensible Table Abstractions**: Defines traits such as `ListingTableLike` and `Materialized` to abstract over Hive-partitioned tables and materialized views, enabling custom implementations and easy registration for use in the maintenance and rewriting logic. +//! +//! ## Typical Workflow +//! +//! 1. **Define and Register Views**: Define a custom table type that implements the `Materialized` trait, and register it using `register_materialized`. +//! 2. **Metadata Initialization**: Set up `FileMetadata` and `RowMetadataRegistry` to track file-level and row-level metadata. +//! 3. 
**Dependency Tracking**: Use the `mv_dependencies` UDTF to generate build graphs for materialized views, and `stale_files` to identify partitions that require recomputation. +//! 4. **Query Optimization**: Enable the query rewriting optimizer to transparently rewrite queries to use materialized views where possible. +//! +//! ## Example +//! +//! See the README and integration tests for a full walkthrough of setting up and maintaining a materialized view, including dependency tracking and query rewriting. +//! +//! ## Limitations +//! +//! - Currently supports only Hive-partitioned tables in object storage, with the smallest update unit being a file. +//! - Future work may generalize to other storage backends and partitioning schemes. +//! +//! ## References +//! +//! - [Optimizing Queries Using Materialized Views: A Practical, Scalable Solution](https://dsg.uwaterloo.ca/seminars/notes/larson-paper.pdf) +//! - [DataFusion documentation](https://datafusion.apache.org/) /// Code for incremental view maintenance against Hive-partitioned tables. /// diff --git a/src/materialized/dependencies.rs b/src/materialized/dependencies.rs index 42ddd35..060e1e0 100644 --- a/src/materialized/dependencies.rs +++ b/src/materialized/dependencies.rs @@ -62,7 +62,8 @@ use crate::materialized::META_COLUMN; use super::{cast_to_materialized, row_metadata::RowMetadataRegistry, util, Materialized}; -/// A table function that shows build targets and dependencies for a materialized view: +/// A table function that, for a given materialized view, lists all the output data objects (build targets) +/// generated during its construction or refresh, as well as all the source data objects (dependencies) it relies on. 
/// /// ```ignore /// fn mv_dependencies(table_ref: Utf8) -> Table diff --git a/src/materialized/util.rs b/src/materialized/util.rs index 7977f8d..cb4afad 100644 --- a/src/materialized/util.rs +++ b/src/materialized/util.rs @@ -21,6 +21,7 @@ use datafusion::catalog::{CatalogProviderList, TableProvider}; use datafusion_common::{DataFusionError, Result}; use datafusion_sql::ResolvedTableReference; +/// Retrieves a table from the catalog list given a resolved table reference. pub fn get_table( catalog_list: &dyn CatalogProviderList, table_ref: &ResolvedTableReference, @@ -35,6 +36,7 @@ pub fn get_table( // NOTE: this is bad, we are calling async code in a sync context. // We should file an issue about async in UDTFs. + // See: https://github.com/apache/datafusion/issues/17663 futures::executor::block_on(schema.table(table_ref.table.as_ref())) .map_err(|e| e.context(format!("couldn't get table '{}'", table_ref.table)))? .ok_or_else(|| DataFusionError::Plan(format!("no such table {}", table_ref.schema)))