Skip to content

Commit b7974e8

Browse files
authored
Add FileMetadata table and RowMetadataRegistry (#2)
* port file metadata + hive_partition + row metadata + add tests * better docs * more comments * fix link * fix another comment * allow the same licenses as datafusion-orc * add unicode-3.0 license to allowlist
1 parent 759aa19 commit b7974e8

File tree

8 files changed

+1690
-11
lines changed

8 files changed

+1690
-11
lines changed

Cargo.toml

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,29 @@ authors = ["Matthew Cramerus <matt@polygon.io>"]
2525
license = "Apache-2.0"
2626
description = "Materialized Views & Query Rewriting in DataFusion"
2727
keywords = ["arrow", "arrow-rs", "datafusion"]
28-
rust-version = "1.73"
28+
rust-version = "1.80"
2929

3030
[dependencies]
31+
arrow = "53"
32+
arrow-schema = "53"
33+
async-trait = "0.1"
34+
dashmap = "6"
35+
datafusion = "43"
36+
datafusion-common = "43"
37+
datafusion-expr = "43"
38+
datafusion-functions = "43"
39+
datafusion-functions-aggregate = "43"
40+
datafusion-physical-expr = "43"
41+
datafusion-physical-plan = "43"
42+
datafusion-sql = "43"
43+
futures = "0.3"
44+
itertools = "0.13"
45+
log = "0.4"
46+
object_store = "0.11"
47+
48+
[dev-dependencies]
49+
anyhow = "1.0.95"
50+
env_logger = "0.11.6"
51+
tempfile = "3.14.0"
52+
tokio = "1.42.0"
53+
url = "2.5.4"

deny.toml

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,5 +16,13 @@
1616
# under the License.
1717

1818
[licenses]
19-
allow = ["Apache-2.0"]
19+
allow = [
20+
"Apache-2.0",
21+
"Apache-2.0 WITH LLVM-exception",
22+
"MIT",
23+
"BSD-2-Clause",
24+
"BSD-3-Clause",
25+
"CC0-1.0",
26+
"Unicode-3.0"
27+
]
2028
version = 2

src/lib.rs

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,22 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
/// Code for incremental view maintenance.
19-
mod materialized;
18+
#![deny(missing_docs)]
2019

21-
/// An implementation of Query Rewriting, an optimization that rewrites queries to make use of materialized views.
22-
mod rewrite;
20+
//! `datafusion-materialized-views` implements algorithms and functionality for materialized views in DataFusion.
21+
22+
/// Code for incremental view maintenance against Hive-partitioned tables.
23+
///
24+
/// An example of a Hive-partitioned table is the [`ListingTable`](datafusion::datasource::listing::ListingTable).
25+
/// By analyzing the fragment of the materialized view query pertaining to the partition columns,
26+
/// we can derive a build graph that relates the files of a materialized views and the files of the tables it depends on.
27+
///
28+
/// A central trait is defined for Hive-partitioned tables, [`ListingTableLike`](materialized::ListingTableLike). Note that
29+
/// all implementations of [`ListingTableLike`](materialized::ListingTableLike) must be registered using the
30+
/// [`register_listing_table`](materialized::register_listing_table) function, otherwise the tables may not be detected by
31+
/// the incremental view maintenance code, including components such as [`FileMetadata`](materialized::file_metadata::FileMetadata)
32+
/// or [`RowMetadataRegistry`](materialized::row_metadata::RowMetadataRegistry).
33+
pub mod materialized;
2334

24-
#[cfg(test)]
25-
mod test {
26-
#[test]
27-
fn test_it_works() {}
28-
}
35+
/// An implementation of Query Rewriting, an optimization that rewrites queries to make use of materialized views.
36+
pub mod rewrite;

src/materialized.rs

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,141 @@
1414
// KIND, either express or implied. See the License for the
1515
// specific language governing permissions and limitations
1616
// under the License.
17+
18+
/// Track dependencies of materialized data in object storage
19+
mod dependencies;
20+
21+
/// Pluggable metadata sources for incremental view maintenance
22+
pub mod row_metadata;
23+
24+
/// A virtual table that exposes files in object storage.
25+
pub mod file_metadata;
26+
27+
/// A UDF that parses Hive partition elements from object storage paths.
28+
mod hive_partition;
29+
30+
use std::{
31+
any::{type_name, Any, TypeId},
32+
fmt::Debug,
33+
sync::{Arc, LazyLock},
34+
};
35+
36+
use dashmap::DashMap;
37+
use datafusion::{
38+
catalog::TableProvider,
39+
datasource::listing::{ListingTable, ListingTableUrl},
40+
};
41+
use datafusion_expr::LogicalPlan;
42+
use itertools::Itertools;
43+
44+
/// The identifier of the column that [`RowMetadataSource`](row_metadata::RowMetadataSource) implementations should store row metadata in.
45+
pub const META_COLUMN: &str = "__meta";
46+
47+
static TABLE_TYPE_REGISTRY: LazyLock<TableTypeRegistry> = LazyLock::new(TableTypeRegistry::default);
48+
49+
/// A [`TableProvider`] whose data is backed by Hive-partitioned files in object storage.
50+
pub trait ListingTableLike: TableProvider + 'static {
51+
/// Object store URLs for this table
52+
fn table_paths(&self) -> Vec<ListingTableUrl>;
53+
54+
/// Hive partition columns
55+
fn partition_columns(&self) -> Vec<String>;
56+
57+
/// File extension used by this listing table
58+
fn file_ext(&self) -> String;
59+
}
60+
61+
impl ListingTableLike for ListingTable {
62+
fn table_paths(&self) -> Vec<ListingTableUrl> {
63+
self.table_paths().clone()
64+
}
65+
66+
fn partition_columns(&self) -> Vec<String> {
67+
self.options()
68+
.table_partition_cols
69+
.iter()
70+
.map(|(name, _data_type)| name.clone())
71+
.collect_vec()
72+
}
73+
74+
fn file_ext(&self) -> String {
75+
self.options().file_extension.clone()
76+
}
77+
}
78+
79+
/// Register a [`ListingTableLike`] implementation in this registry.
80+
/// This allows `cast_to_listing_table` to easily downcast a [`TableProvider`]
81+
/// into a [`ListingTableLike`] where possible.
82+
pub fn register_listing_table<T: ListingTableLike>() {
83+
TABLE_TYPE_REGISTRY.register_listing_table::<T>();
84+
}
85+
86+
/// Attempt to cast the given TableProvider into a [`ListingTableLike`].
87+
/// If the table's type has not been registered using [`register_listing_table`], will return `None`.
88+
pub fn cast_to_listing_table(table: &dyn TableProvider) -> Option<&dyn ListingTableLike> {
89+
TABLE_TYPE_REGISTRY.cast_to_listing_table(table)
90+
}
91+
92+
/// A hive-partitioned table in object storage that is defined by a user-provided query.
93+
pub trait Materialized: ListingTableLike {
94+
/// The query that defines this materialized view.
95+
fn query(&self) -> LogicalPlan;
96+
}
97+
98+
type Downcaster<T> = Arc<dyn Fn(&dyn Any) -> Option<&T> + Send + Sync>;
99+
100+
/// A registry for implementations of [`ListingTableLike`], used for downcasting
101+
/// arbitrary TableProviders into `dyn ListingTableLike` where possible.
102+
///
103+
/// This is used throughout the crate as a singleton to store all known implementations of `ListingTableLike`.
104+
/// By default, [`ListingTable`] is registered.
105+
struct TableTypeRegistry {
106+
listing_table_accessors: DashMap<TypeId, (&'static str, Downcaster<dyn ListingTableLike>)>,
107+
}
108+
109+
impl Debug for TableTypeRegistry {
110+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
111+
f.debug_struct("TableTypeRegistry")
112+
.field(
113+
"listing_table_accessors",
114+
&self
115+
.listing_table_accessors
116+
.iter()
117+
.map(|r| r.value().0)
118+
.collect_vec(),
119+
)
120+
.finish()
121+
}
122+
}
123+
124+
impl Default for TableTypeRegistry {
125+
fn default() -> Self {
126+
let new = Self {
127+
listing_table_accessors: DashMap::new(),
128+
};
129+
new.register_listing_table::<ListingTable>();
130+
131+
new
132+
}
133+
}
134+
135+
impl TableTypeRegistry {
136+
fn register_listing_table<T: ListingTableLike>(&self) {
137+
self.listing_table_accessors.insert(
138+
TypeId::of::<T>(),
139+
(
140+
type_name::<T>(),
141+
Arc::new(|any| any.downcast_ref::<T>().map(|t| t as &dyn ListingTableLike)),
142+
),
143+
);
144+
}
145+
146+
fn cast_to_listing_table<'a>(
147+
&'a self,
148+
table: &'a dyn TableProvider,
149+
) -> Option<&'a dyn ListingTableLike> {
150+
self.listing_table_accessors
151+
.get(&table.as_any().type_id())
152+
.and_then(|r| r.value().1(table.as_any()))
153+
}
154+
}

src/materialized/dependencies.rs

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.

0 commit comments

Comments
 (0)