Improve the doc (#95)

xudong963 · web-flow · commit f3d5eb1c7e3a · 2025-10-24T09:10:37.000Z
diff --git a/src/lib.rs b/src/lib.rs
@@ -17,7 +17,37 @@
 
 #![deny(missing_docs)]
 
-//! `datafusion-materialized-views` implements algorithms and functionality for materialized views in DataFusion.
+//! # datafusion-materialized-views
+//!
+//! `datafusion-materialized-views` provides robust algorithms and core functionality for working with materialized views in [DataFusion](https://datafusion.apache.org/).
+//!
+//! ## Key Features
+//!
+//! - **Incremental View Maintenance**: Efficiently tracks dependencies between Hive-partitioned tables and their materialized views, allowing users to determine which partitions need to be refreshed when source data changes. This is achieved via UDTFs such as `mv_dependencies` and `stale_files`.
+//! - **Query Rewriting**: Implements a view matching optimizer that rewrites queries to automatically leverage materialized views when beneficial, based on the techniques described in the paper [Optimizing Queries Using Materialized Views: A Practical, Scalable Solution](https://dsg.uwaterloo.ca/seminars/notes/larson-paper.pdf).
+//! - **Pluggable Metadata Sources**: Supports custom metadata sources for incremental view maintenance, with default support for object store metadata via the `FileMetadata` and `RowMetadataRegistry` components.
+//! - **Extensible Table Abstractions**: Defines traits such as `ListingTableLike` and `Materialized` to abstract over Hive-partitioned tables and materialized views, enabling custom implementations and easy registration for use in the maintenance and rewriting logic.
+//!
+//! ## Typical Workflow
+//!
+//! 1. **Define and Register Views**: Define a custom table type that implements the `Materialized` trait, and register it using `register_materialized`.
+//! 2. **Metadata Initialization**: Set up `FileMetadata` and `RowMetadataRegistry` to track file-level and row-level metadata.
+//! 3. **Dependency Tracking**: Use the `mv_dependencies` UDTF to generate build graphs for materialized views, and `stale_files` to identify partitions that require recomputation.
+//! 4. **Query Optimization**: Enable the query rewriting optimizer to transparently rewrite queries to use materialized views where possible.
+//!
+//! ## Example
+//!
+//! See the README and integration tests for a full walkthrough of setting up and maintaining a materialized view, including dependency tracking and query rewriting.
+//!
+//! ## Limitations
+//!
+//! - Currently supports only Hive-partitioned tables in object storage, with the smallest update unit being a file.
+//! - Future work may generalize to other storage backends and partitioning schemes.
+//!
+//! ## References
+//!
+//! - [Optimizing Queries Using Materialized Views: A Practical, Scalable Solution](https://dsg.uwaterloo.ca/seminars/notes/larson-paper.pdf)
+//! - [DataFusion documentation](https://datafusion.apache.org/)
 
 /// Code for incremental view maintenance against Hive-partitioned tables.
 ///
diff --git a/src/materialized/dependencies.rs b/src/materialized/dependencies.rs
@@ -62,7 +62,8 @@ use crate::materialized::META_COLUMN;
 
 use super::{cast_to_materialized, row_metadata::RowMetadataRegistry, util, Materialized};
 
-/// A table function that shows build targets and dependencies for a materialized view:
+/// A table function that, for a given materialized view, lists all the output data objects (build targets)
+/// generated during its construction or refresh, as well as all the source data objects (dependencies) it relies on.
 ///
 /// ```ignore
 /// fn mv_dependencies(table_ref: Utf8) -> Table
diff --git a/src/materialized/util.rs b/src/materialized/util.rs
@@ -21,6 +21,7 @@ use datafusion::catalog::{CatalogProviderList, TableProvider};
 use datafusion_common::{DataFusionError, Result};
 use datafusion_sql::ResolvedTableReference;
 
+/// Retrieves a table from the catalog list given a resolved table reference.
 pub fn get_table(
     catalog_list: &dyn CatalogProviderList,
     table_ref: &ResolvedTableReference,
@@ -35,6 +36,7 @@ pub fn get_table(
 
     // NOTE: this is bad, we are calling async code in a sync context.
     // We should file an issue about async in UDTFs.
+    // See: https://github.com/apache/datafusion/issues/17663
     futures::executor::block_on(schema.table(table_ref.table.as_ref()))
         .map_err(|e| e.context(format!("couldn't get table '{}'", table_ref.table)))?
         .ok_or_else(|| DataFusionError::Plan(format!("no such table {}", table_ref.schema)))