Skip to content

Commit 0808f3a

Browse files
authored
Improvements to list_files_cache table function (apache#19703)
## Which issue does this PR close? - Follow on to apache#19616 ## Rationale for this change I had a few minor comments / suggestions while reviewing apache#19616 from @jizezhang but they weren't needed to do the initial merge, so I would like to propose them in a follow up PR ## What changes are included in this PR? 1. Improve documentation 2. Improve handling of `table_ref` in ListingTableURL 3. use Null rather than `"NULL"` in `list_files_cache` table function I can break this into separate PRs if that would help ## Are these changes tested? Yes by CI ## Are there any user-facing changes? The `list_files_cache` function now might return null
1 parent 154ab70 commit 0808f3a

File tree

5 files changed

+22
-13
lines changed

5 files changed

+22
-13
lines changed

datafusion-cli/src/functions.rs

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -703,10 +703,13 @@ impl TableFunctionImpl for StatisticsCacheFunc {
703703
}
704704
}
705705

706-
// Implementation of the `list_files_cache` table function in datafusion-cli.
706+
/// Implementation of the `list_files_cache` table function in datafusion-cli.
707+
///
708+
/// This function returns the cached results of running a LIST command on a
709+
/// particular object store path for a table. The object metadata is returned as
710+
/// a List of Structs, with one Struct for each object. DataFusion uses these
711+
/// cached results to plan queries against external tables.
707712
///
708-
/// This function returns the cached results of running a LIST command on a particular object store path for a table. The object metadata is returned as a List of Structs, with one Struct for each object.
709-
/// DataFusion uses these cached results to plan queries against external tables.
710713
/// # Schema
711714
/// ```sql
712715
/// > describe select * from list_files_cache();
@@ -788,7 +791,7 @@ impl TableFunctionImpl for ListFilesCacheFunc {
788791
Field::new("metadata", DataType::Struct(nested_fields.clone()), true);
789792

790793
let schema = Arc::new(Schema::new(vec![
791-
Field::new("table", DataType::Utf8, false),
794+
Field::new("table", DataType::Utf8, true),
792795
Field::new("path", DataType::Utf8, false),
793796
Field::new("metadata_size_bytes", DataType::UInt64, false),
794797
// expires field in ListFilesEntry has type Instant when set, from which we cannot get "the number of seconds", hence using Duration instead of Timestamp as data type.
@@ -821,7 +824,7 @@ impl TableFunctionImpl for ListFilesCacheFunc {
821824
let mut current_offset: i32 = 0;
822825

823826
for (path, entry) in list_files_cache.list_entries() {
824-
table_arr.push(path.table.map_or("NULL".to_string(), |t| t.to_string()));
827+
table_arr.push(path.table.map(|t| t.to_string()));
825828
path_arr.push(path.path.to_string());
826829
metadata_size_bytes_arr.push(entry.size_bytes as u64);
827830
// calculates time left before entry expires

datafusion/core/src/datasource/listing_table_factory.rs

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -161,9 +161,7 @@ impl TableProviderFactory for ListingTableFactory {
161161
}
162162
None => format!("*.{}", cmd.file_type.to_lowercase()),
163163
};
164-
table_path = table_path
165-
.with_glob(glob.as_ref())?
166-
.with_table_ref(cmd.name.clone());
164+
table_path = table_path.with_glob(glob.as_ref())?;
167165
}
168166
let schema = options.infer_schema(session_state, &table_path).await?;
169167
let df_schema = Arc::clone(&schema).to_dfschema()?;

datafusion/datasource/src/url.rs

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ pub struct ListingTableUrl {
4343
prefix: Path,
4444
/// An optional glob expression used to filter files
4545
glob: Option<Pattern>,
46-
46+
/// Optional table reference for the table this url belongs to
4747
table_ref: Option<TableReference>,
4848
}
4949

@@ -341,17 +341,19 @@ impl ListingTableUrl {
341341
}
342342

343343
/// Returns a copy of current [`ListingTableUrl`] with a specified `glob`
344-
pub fn with_glob(self, glob: &str) -> Result<Self> {
345-
let glob =
346-
Pattern::new(glob).map_err(|e| DataFusionError::External(Box::new(e)))?;
347-
Self::try_new(self.url, Some(glob))
344+
pub fn with_glob(mut self, glob: &str) -> Result<Self> {
345+
self.glob =
346+
Some(Pattern::new(glob).map_err(|e| DataFusionError::External(Box::new(e)))?);
347+
Ok(self)
348348
}
349349

350+
/// Set the table reference for this [`ListingTableUrl`]
350351
pub fn with_table_ref(mut self, table_ref: TableReference) -> Self {
351352
self.table_ref = Some(table_ref);
352353
self
353354
}
354355

356+
/// Return the table reference for this [`ListingTableUrl`]
355357
pub fn get_table_ref(&self) -> &Option<TableReference> {
356358
&self.table_ref
357359
}

datafusion/execution/src/cache/cache_manager.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,7 @@ pub trait ListFilesCache: CacheAccessor<TableScopedPath, CachedFileList> {
196196
/// Retrieves the information about the entries currently cached.
197197
fn list_entries(&self) -> HashMap<TableScopedPath, ListFilesEntry>;
198198

199+
/// Drop all entries for the given table reference.
199200
fn drop_table_entries(&self, table_ref: &Option<TableReference>) -> Result<()>;
200201
}
201202

datafusion/execution/src/cache/list_files_cache.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,11 @@ pub const DEFAULT_LIST_FILES_CACHE_MEMORY_LIMIT: usize = 1024 * 1024; // 1MiB
139139
/// The default cache TTL for the [`DefaultListFilesCache`]
140140
pub const DEFAULT_LIST_FILES_CACHE_TTL: Option<Duration> = None; // Infinite
141141

142+
/// Key for [`DefaultListFilesCache`]
143+
///
144+
/// Each entry is scoped to its use within a specific table so that the cache
145+
/// can differentiate between identical paths in different tables, and
146+
/// table-level cache invalidation.
142147
#[derive(PartialEq, Eq, Hash, Clone, Debug)]
143148
pub struct TableScopedPath {
144149
pub table: Option<TableReference>,

0 commit comments

Comments
 (0)