From 73c667216ee2fcb85196694cc5a36633f9928c19 Mon Sep 17 00:00:00 2001 From: Blake Orth Date: Tue, 16 Dec 2025 16:53:58 -0700 Subject: [PATCH 1/3] Enables DefaultListFilesCache by default - Sets the DefaultListFilesCache to be enabled by default - Adds additional object store access tests to show list caching behavior - Adds variable setting/reading sqllogic test cases - Updates tests to disable caching when they relied on COPY commands so changes can be detected for each query - Updates docs to help users upgrade --- .../tests/datasource/object_store_access.rs | 70 +++++++++++++++---- .../execution/src/cache/cache_manager.rs | 26 ++++--- .../sqllogictest/test_files/parquet.slt | 4 ++ .../test_files/repartition_scan.slt | 4 ++ .../sqllogictest/test_files/set_variable.slt | 18 +++++ docs/source/library-user-guide/upgrading.md | 14 +++- 6 files changed, 110 insertions(+), 26 deletions(-) diff --git a/datafusion/core/tests/datasource/object_store_access.rs b/datafusion/core/tests/datasource/object_store_access.rs index 2e1b1484076d9..f72cbc937383c 100644 --- a/datafusion/core/tests/datasource/object_store_access.rs +++ b/datafusion/core/tests/datasource/object_store_access.rs @@ -117,15 +117,40 @@ async fn multi_query_multi_file_csv_file() { +---------+-------+-------+ ------- Object Store Request Summary ------- RequestCountingObjectStore() - Total Requests: 4 - - LIST prefix=data + Total Requests: 3 - GET (opts) path=data/file_0.csv - GET (opts) path=data/file_1.csv - GET (opts) path=data/file_2.csv " ); - // the second query should re-use the cached LIST results and should not reissue LIST + // Force a cache eviction by removing the data limit for the cache + assert_snapshot!( + test.query("set datafusion.runtime.list_files_cache_limit=\"0K\"").await, + @r" + ------- Query Output (0 rows) ------- + ++ + ++ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 0 + " + ); + + // Then re-enable the cache + assert_snapshot!( + test.query("set datafusion.runtime.list_files_cache_limit=\"1M\"").await, + @r" + ------- Query Output (0 rows) ------- + ++ + ++ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 0 + " + ); + + // this query should list the table since the cache entries were evicted assert_snapshot!( test.query("select * from csv_table").await, @r" @@ -149,6 +174,30 @@ async fn multi_query_multi_file_csv_file() { - GET (opts) path=data/file_2.csv " ); + + // this query should not list the table since the entries were added in the previous query + assert_snapshot!( + test.query("select * from csv_table").await, + @r" + ------- Query Output (6 rows) ------- + +---------+-------+-------+ + | c1 | c2 | c3 | + +---------+-------+-------+ + | 0.0 | 0.0 | true | + | 0.00003 | 5e-12 | false | + | 0.00001 | 1e-12 | true | + | 0.00003 | 5e-12 | false | + | 0.00002 | 2e-12 | true | + | 0.00003 | 5e-12 | false | + +---------+-------+-------+ + ------- Object Store Request Summary ------- + RequestCountingObjectStore() + Total Requests: 3 + - GET (opts) path=data/file_0.csv + - GET (opts) path=data/file_1.csv + - GET (opts) path=data/file_2.csv + " + ); } #[tokio::test] @@ -170,8 +219,7 @@ async fn query_multi_csv_file() { +---------+-------+-------+ ------- Object Store Request Summary ------- RequestCountingObjectStore() - Total Requests: 4 - - LIST prefix=data + Total Requests: 3 - GET (opts) path=data/file_0.csv - GET (opts) path=data/file_1.csv - GET (opts) path=data/file_2.csv @@ -198,8 +246,7 @@ async fn 
query_partitioned_csv_file() { +---------+-------+-------+---+----+-----+ ------- Object Store Request Summary ------- RequestCountingObjectStore() - Total Requests: 4 - - LIST prefix=data + Total Requests: 3 - GET (opts) path=data/a=1/b=10/c=100/file_1.csv - GET (opts) path=data/a=2/b=20/c=200/file_2.csv - GET (opts) path=data/a=3/b=30/c=300/file_3.csv @@ -236,8 +283,7 @@ async fn query_partitioned_csv_file() { +---------+-------+-------+---+----+-----+ ------- Object Store Request Summary ------- RequestCountingObjectStore() - Total Requests: 2 - - LIST prefix=data + Total Requests: 1 - GET (opts) path=data/a=2/b=20/c=200/file_2.csv " ); @@ -254,8 +300,7 @@ async fn query_partitioned_csv_file() { +---------+-------+-------+---+----+-----+ ------- Object Store Request Summary ------- RequestCountingObjectStore() - Total Requests: 2 - - LIST prefix=data + Total Requests: 1 - GET (opts) path=data/a=2/b=20/c=200/file_2.csv " ); @@ -290,8 +335,7 @@ async fn query_partitioned_csv_file() { +---------+-------+-------+---+----+-----+ ------- Object Store Request Summary ------- RequestCountingObjectStore() - Total Requests: 2 - - LIST prefix=data + Total Requests: 1 - GET (opts) path=data/a=1/b=10/c=100/file_1.csv " ); diff --git a/datafusion/execution/src/cache/cache_manager.rs b/datafusion/execution/src/cache/cache_manager.rs index 2df5ef1b4458c..e45e226daf594 100644 --- a/datafusion/execution/src/cache/cache_manager.rs +++ b/datafusion/execution/src/cache/cache_manager.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. -use crate::cache::CacheAccessor; use crate::cache::cache_unit::DefaultFilesMetadataCache; +use crate::cache::{CacheAccessor, DefaultListFilesCache}; use datafusion_common::stats::Precision; use datafusion_common::{Result, Statistics}; use object_store::ObjectMeta; @@ -178,15 +178,21 @@ impl CacheManager { let file_statistic_cache = config.table_files_statistics_cache.as_ref().map(Arc::clone); - let list_files_cache = config - .list_files_cache - .as_ref() - .inspect(|c| { - // the cache memory limit or ttl might have changed, ensure they are updated - c.update_cache_limit(config.list_files_cache_limit); - c.update_cache_ttl(config.list_files_cache_ttl); - }) - .map(Arc::clone); + let list_files_cache = match &config.list_files_cache { + Some(lfc) if config.list_files_cache_limit > 0 => { + lfc.update_cache_limit(config.list_files_cache_limit); + lfc.update_cache_ttl(config.list_files_cache_ttl); + Some(Arc::clone(lfc)) + } + None if config.list_files_cache_limit > 0 => { + let lfc: Arc = Arc::new(DefaultListFilesCache::new( + config.list_files_cache_limit, + config.list_files_cache_ttl, + )); + Some(lfc) + } + _ => None, + }; let file_metadata_cache = config .file_metadata_cache diff --git a/datafusion/sqllogictest/test_files/parquet.slt b/datafusion/sqllogictest/test_files/parquet.slt index c786f7bdc77cb..be713b963b451 100644 --- a/datafusion/sqllogictest/test_files/parquet.slt +++ b/datafusion/sqllogictest/test_files/parquet.slt @@ -21,6 +21,10 @@ statement ok set datafusion.execution.target_partitions = 2; +# disable the listing cache so DataFusion picks up changes from COPY statements +statement ok +set datafusion.runtime.list_files_cache_limit = "0K"; + # Create a table as a data source statement ok CREATE TABLE src_table ( diff --git a/datafusion/sqllogictest/test_files/repartition_scan.slt b/datafusion/sqllogictest/test_files/repartition_scan.slt index 06ea22761d92b..c9c2f91257081 100644 --- 
a/datafusion/sqllogictest/test_files/repartition_scan.slt +++ b/datafusion/sqllogictest/test_files/repartition_scan.slt @@ -27,6 +27,10 @@ set datafusion.execution.target_partitions = 4; statement ok set datafusion.optimizer.repartition_file_min_size = 1; +# disable the listing cache so DataFusion picks up changes from COPY statements +statement ok +set datafusion.runtime.list_files_cache_limit = "0K"; + ################### ### Parquet tests ################### diff --git a/datafusion/sqllogictest/test_files/set_variable.slt b/datafusion/sqllogictest/test_files/set_variable.slt index 8957404799b73..c444128b18f4f 100644 --- a/datafusion/sqllogictest/test_files/set_variable.slt +++ b/datafusion/sqllogictest/test_files/set_variable.slt @@ -416,6 +416,24 @@ SHOW datafusion.runtime.metadata_cache_limit ---- datafusion.runtime.metadata_cache_limit 200M +# Test SET and SHOW runtime.list_files_cache_limit +statement ok +SET datafusion.runtime.list_files_cache_limit = '2M' + +query TT +SHOW datafusion.runtime.list_files_cache_limit +---- +datafusion.runtime.list_files_cache_limit 2M + +# Test SET and SHOW runtime.list_files_cache_ttl +statement ok +SET datafusion.runtime.list_files_cache_ttl = '90s' + +query TT +SHOW datafusion.runtime.list_files_cache_ttl +---- +datafusion.runtime.list_files_cache_ttl 1m30s + # Note: runtime.temp_directory shows the actual temp directory path with a unique suffix, # so we cannot test the exact value. We verify it exists in information_schema instead. diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 159bd3e4e790e..19467e28fa42d 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -45,15 +45,23 @@ directly on the `Field`. For example: In prior versions, `ListingTableProvider` would issue `LIST` commands to the underlying object store each time it needed to list files for a query. To improve performance, `ListingTableProvider` now caches the results of -`LIST` commands for the lifetime of the `ListingTableProvider` instance. +`LIST` commands for the lifetime of the `ListingTableProvider` instance or +until a cache entry expires. Note that by default the cache has no expiration time, so if files are added or removed from the underlying object store, the `ListingTableProvider` will not see those changes until the `ListingTableProvider` instance is dropped and recreated. -You will be able to configure the maximum cache size and cache expiration time via a configuration option: +You can configure the maximum cache size and cache entry expiration time via configuration options: -See for more details. +`datafusion.runtime.list_files_cache_limit` +`datafusion.runtime.list_files_cache_ttl` + +Caching can be disable by setting the limit to 0: + +```sql +SET datafusion.runtime.list_files_cache_limit TO "0K"; +``` Note that the internal API has changed to use a trait `ListFilesCache` instead of a type alias. 
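The upgrade note above lists the two new runtime settings; the sketch below (not part of the patch itself) shows them exercised from SQL, following the `set_variable.slt` and `parquet.slt` cases added in this commit. The concrete values `'2M'`, `'90s'`, and `'0K'` are only examples:

```sql
-- Cap the list-files cache size and give entries a time-to-live
SET datafusion.runtime.list_files_cache_limit = '2M';
SET datafusion.runtime.list_files_cache_ttl = '90s';

-- Inspect the effective values (the TTL is normalized, so '90s' is shown as 1m30s)
SHOW datafusion.runtime.list_files_cache_limit;
SHOW datafusion.runtime.list_files_cache_ttl;

-- Disable list caching entirely, e.g. when COPY statements change files between queries
SET datafusion.runtime.list_files_cache_limit = '0K';
```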
From da08e25c448b546c2a9fc69787f4266c46bf99fa Mon Sep 17 00:00:00 2001 From: Blake Orth Date: Wed, 17 Dec 2025 15:05:03 -0700 Subject: [PATCH 2/3] - Better formatting and additional information for docs - Fixes erroneous cache limit value being returned if the user has disabled the cache --- datafusion/execution/src/cache/cache_manager.rs | 2 +- docs/source/library-user-guide/upgrading.md | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/datafusion/execution/src/cache/cache_manager.rs b/datafusion/execution/src/cache/cache_manager.rs index e45e226daf594..473009f2badf6 100644 --- a/datafusion/execution/src/cache/cache_manager.rs +++ b/datafusion/execution/src/cache/cache_manager.rs @@ -226,7 +226,7 @@ impl CacheManager { pub fn get_list_files_cache_limit(&self) -> usize { self.list_files_cache .as_ref() - .map_or(DEFAULT_LIST_FILES_CACHE_MEMORY_LIMIT, |c| c.cache_limit()) + .map_or(0, |c| c.cache_limit()) } /// Get the TTL (time-to-live) of the list files cache. diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 19467e28fa42d..722d269ac68ad 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -54,10 +54,13 @@ those changes until the `ListingTableProvider` instance is dropped and recreated You can configure the maximum cache size and cache entry expiration time via configuration options: -`datafusion.runtime.list_files_cache_limit` -`datafusion.runtime.list_files_cache_ttl` +- `datafusion.runtime.list_files_cache_limit` - Limits the size of the cache in bytes +- `datafusion.runtime.list_files_cache_ttl` - Limits the TTL (time-to-live) of an entry in seconds -Caching can be disable by setting the limit to 0: +Detailed configuration information can be found in the [DataFusion Runtime +Configuration](https://datafusion.apache.org/user-guide/configs.html#runtime-configuration-settings) user's guide. + +Caching can be disabled by setting the limit to 0: ```sql SET datafusion.runtime.list_files_cache_limit TO "0K"; From de5ec9f6bb23ad5831ef7d45e6ed2abb46792b87 Mon Sep 17 00:00:00 2001 From: Blake Orth Date: Wed, 17 Dec 2025 17:07:54 -0700 Subject: [PATCH 3/3] - Updates more tests for caching improvements from upstream updates --- datafusion/core/tests/datasource/object_store_access.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/datafusion/core/tests/datasource/object_store_access.rs b/datafusion/core/tests/datasource/object_store_access.rs index f72cbc937383c..561de21520394 100644 --- a/datafusion/core/tests/datasource/object_store_access.rs +++ b/datafusion/core/tests/datasource/object_store_access.rs @@ -265,8 +265,7 @@ async fn query_partitioned_csv_file() { +---------+-------+-------+---+----+-----+ ------- Object Store Request Summary ------- RequestCountingObjectStore() - Total Requests: 2 - - LIST prefix=data/a=2 + Total Requests: 1 - GET (opts) path=data/a=2/b=20/c=200/file_2.csv " ); @@ -317,8 +316,7 @@ async fn query_partitioned_csv_file() { +---------+-------+-------+---+----+-----+ ------- Object Store Request Summary ------- RequestCountingObjectStore() - Total Requests: 2 - - LIST prefix=data/a=2/b=20 + Total Requests: 1 - GET (opts) path=data/a=2/b=20/c=200/file_2.csv " );
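For readers skimming the series, the behavioral core is the `cache_manager.rs` change in PATCH 1 (build a `DefaultListFilesCache` whenever no cache is configured and the limit is non-zero) plus the PATCH 2 follow-up (report a limit of 0 once caching is disabled). Below is a self-contained Rust sketch of that pattern; the trait and struct here are deliberately simplified stand-ins rather than DataFusion's actual `CacheManager` API:

```rust
use std::sync::Arc;
use std::time::Duration;

// Simplified stand-ins for the real `ListFilesCache` trait and
// `DefaultListFilesCache` implementation; only the pieces needed to
// illustrate the enable-by-default logic are modeled here.
trait ListFilesCache {
    fn cache_limit(&self) -> usize;
}

struct DefaultListFilesCache {
    limit: usize,
    _ttl: Option<Duration>,
}

impl DefaultListFilesCache {
    fn new(limit: usize, ttl: Option<Duration>) -> Self {
        Self { limit, _ttl: ttl }
    }
}

impl ListFilesCache for DefaultListFilesCache {
    fn cache_limit(&self) -> usize {
        self.limit
    }
}

fn resolve_list_files_cache(
    user_cache: Option<Arc<dyn ListFilesCache>>,
    limit: usize,
    ttl: Option<Duration>,
) -> Option<Arc<dyn ListFilesCache>> {
    match user_cache {
        // Keep a caller-supplied cache as long as caching is enabled.
        Some(cache) if limit > 0 => Some(cache),
        // No cache supplied: fall back to the default implementation.
        None if limit > 0 => Some(Arc::new(DefaultListFilesCache::new(limit, ttl))),
        // A limit of zero disables list-files caching entirely.
        _ => None,
    }
}

fn main() {
    // Default-enabled: a non-zero limit yields a cache even when none was supplied.
    assert!(resolve_list_files_cache(None, 1024 * 1024, None).is_some());
    // Disabled: as in the PATCH 2 fix, the reported limit is 0 when no cache is active.
    let reported = resolve_list_files_cache(None, 0, None).map_or(0, |c| c.cache_limit());
    assert_eq!(reported, 0);
}
```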