Skip to content

Commit 73c6672

Browse files
committed
Enables DefaultListFilesCache by default
- Sets the DefaultListFilesCache to be enabled by default - Adds additional object store access tests to show list caching behavior - Adds variable setting/reading sqllogic test cases - Updates tests to disable caching when they relied on COPY commands so changes can be detected for each query - Updates docs to help users upgrade
1 parent efd793b commit 73c6672

File tree

6 files changed

+110
-26
lines changed

6 files changed

+110
-26
lines changed

datafusion/core/tests/datasource/object_store_access.rs

Lines changed: 57 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -117,15 +117,40 @@ async fn multi_query_multi_file_csv_file() {
117117
+---------+-------+-------+
118118
------- Object Store Request Summary -------
119119
RequestCountingObjectStore()
120-
Total Requests: 4
121-
- LIST prefix=data
120+
Total Requests: 3
122121
- GET (opts) path=data/file_0.csv
123122
- GET (opts) path=data/file_1.csv
124123
- GET (opts) path=data/file_2.csv
125124
"
126125
);
127126

128-
// the second query should re-use the cached LIST results and should not reissue LIST
127+
// Force a cache eviction by setting the cache limit to zero
128+
assert_snapshot!(
129+
test.query("set datafusion.runtime.list_files_cache_limit=\"0K\"").await,
130+
@r"
131+
------- Query Output (0 rows) -------
132+
++
133+
++
134+
------- Object Store Request Summary -------
135+
RequestCountingObjectStore()
136+
Total Requests: 0
137+
"
138+
);
139+
140+
// Then re-enable the cache
141+
assert_snapshot!(
142+
test.query("set datafusion.runtime.list_files_cache_limit=\"1M\"").await,
143+
@r"
144+
------- Query Output (0 rows) -------
145+
++
146+
++
147+
------- Object Store Request Summary -------
148+
RequestCountingObjectStore()
149+
Total Requests: 0
150+
"
151+
);
152+
153+
// this query should list the table since the cache entries were evicted
129154
assert_snapshot!(
130155
test.query("select * from csv_table").await,
131156
@r"
@@ -149,6 +174,30 @@ async fn multi_query_multi_file_csv_file() {
149174
- GET (opts) path=data/file_2.csv
150175
"
151176
);
177+
178+
// this query should not list the table since the entries were added in the previous query
179+
assert_snapshot!(
180+
test.query("select * from csv_table").await,
181+
@r"
182+
------- Query Output (6 rows) -------
183+
+---------+-------+-------+
184+
| c1 | c2 | c3 |
185+
+---------+-------+-------+
186+
| 0.0 | 0.0 | true |
187+
| 0.00003 | 5e-12 | false |
188+
| 0.00001 | 1e-12 | true |
189+
| 0.00003 | 5e-12 | false |
190+
| 0.00002 | 2e-12 | true |
191+
| 0.00003 | 5e-12 | false |
192+
+---------+-------+-------+
193+
------- Object Store Request Summary -------
194+
RequestCountingObjectStore()
195+
Total Requests: 3
196+
- GET (opts) path=data/file_0.csv
197+
- GET (opts) path=data/file_1.csv
198+
- GET (opts) path=data/file_2.csv
199+
"
200+
);
152201
}
153202

154203
#[tokio::test]
@@ -170,8 +219,7 @@ async fn query_multi_csv_file() {
170219
+---------+-------+-------+
171220
------- Object Store Request Summary -------
172221
RequestCountingObjectStore()
173-
Total Requests: 4
174-
- LIST prefix=data
222+
Total Requests: 3
175223
- GET (opts) path=data/file_0.csv
176224
- GET (opts) path=data/file_1.csv
177225
- GET (opts) path=data/file_2.csv
@@ -198,8 +246,7 @@ async fn query_partitioned_csv_file() {
198246
+---------+-------+-------+---+----+-----+
199247
------- Object Store Request Summary -------
200248
RequestCountingObjectStore()
201-
Total Requests: 4
202-
- LIST prefix=data
249+
Total Requests: 3
203250
- GET (opts) path=data/a=1/b=10/c=100/file_1.csv
204251
- GET (opts) path=data/a=2/b=20/c=200/file_2.csv
205252
- GET (opts) path=data/a=3/b=30/c=300/file_3.csv
@@ -236,8 +283,7 @@ async fn query_partitioned_csv_file() {
236283
+---------+-------+-------+---+----+-----+
237284
------- Object Store Request Summary -------
238285
RequestCountingObjectStore()
239-
Total Requests: 2
240-
- LIST prefix=data
286+
Total Requests: 1
241287
- GET (opts) path=data/a=2/b=20/c=200/file_2.csv
242288
"
243289
);
@@ -254,8 +300,7 @@ async fn query_partitioned_csv_file() {
254300
+---------+-------+-------+---+----+-----+
255301
------- Object Store Request Summary -------
256302
RequestCountingObjectStore()
257-
Total Requests: 2
258-
- LIST prefix=data
303+
Total Requests: 1
259304
- GET (opts) path=data/a=2/b=20/c=200/file_2.csv
260305
"
261306
);
@@ -290,8 +335,7 @@ async fn query_partitioned_csv_file() {
290335
+---------+-------+-------+---+----+-----+
291336
------- Object Store Request Summary -------
292337
RequestCountingObjectStore()
293-
Total Requests: 2
294-
- LIST prefix=data
338+
Total Requests: 1
295339
- GET (opts) path=data/a=1/b=10/c=100/file_1.csv
296340
"
297341
);

datafusion/execution/src/cache/cache_manager.rs

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use crate::cache::CacheAccessor;
1918
use crate::cache::cache_unit::DefaultFilesMetadataCache;
19+
use crate::cache::{CacheAccessor, DefaultListFilesCache};
2020
use datafusion_common::stats::Precision;
2121
use datafusion_common::{Result, Statistics};
2222
use object_store::ObjectMeta;
@@ -178,15 +178,21 @@ impl CacheManager {
178178
let file_statistic_cache =
179179
config.table_files_statistics_cache.as_ref().map(Arc::clone);
180180

181-
let list_files_cache = config
182-
.list_files_cache
183-
.as_ref()
184-
.inspect(|c| {
185-
// the cache memory limit or ttl might have changed, ensure they are updated
186-
c.update_cache_limit(config.list_files_cache_limit);
187-
c.update_cache_ttl(config.list_files_cache_ttl);
188-
})
189-
.map(Arc::clone);
181+
let list_files_cache = match &config.list_files_cache {
182+
Some(lfc) if config.list_files_cache_limit > 0 => {
183+
lfc.update_cache_limit(config.list_files_cache_limit);
184+
lfc.update_cache_ttl(config.list_files_cache_ttl);
185+
Some(Arc::clone(lfc))
186+
}
187+
None if config.list_files_cache_limit > 0 => {
188+
let lfc: Arc<dyn ListFilesCache> = Arc::new(DefaultListFilesCache::new(
189+
config.list_files_cache_limit,
190+
config.list_files_cache_ttl,
191+
));
192+
Some(lfc)
193+
}
194+
_ => None,
195+
};
190196

191197
let file_metadata_cache = config
192198
.file_metadata_cache

datafusion/sqllogictest/test_files/parquet.slt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@
2121
statement ok
2222
set datafusion.execution.target_partitions = 2;
2323

24+
# disable the listing cache so DataFusion picks up changes from COPY statements
25+
statement ok
26+
set datafusion.runtime.list_files_cache_limit = "0K";
27+
2428
# Create a table as a data source
2529
statement ok
2630
CREATE TABLE src_table (

datafusion/sqllogictest/test_files/repartition_scan.slt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@ set datafusion.execution.target_partitions = 4;
2727
statement ok
2828
set datafusion.optimizer.repartition_file_min_size = 1;
2929

30+
# disable the listing cache so DataFusion picks up changes from COPY statements
31+
statement ok
32+
set datafusion.runtime.list_files_cache_limit = "0K";
33+
3034
###################
3135
### Parquet tests
3236
###################

datafusion/sqllogictest/test_files/set_variable.slt

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -416,6 +416,24 @@ SHOW datafusion.runtime.metadata_cache_limit
416416
----
417417
datafusion.runtime.metadata_cache_limit 200M
418418

419+
# Test SET and SHOW runtime.list_files_cache_limit
420+
statement ok
421+
SET datafusion.runtime.list_files_cache_limit = '2M'
422+
423+
query TT
424+
SHOW datafusion.runtime.list_files_cache_limit
425+
----
426+
datafusion.runtime.list_files_cache_limit 2M
427+
428+
# Test SET and SHOW runtime.list_files_cache_ttl
429+
statement ok
430+
SET datafusion.runtime.list_files_cache_ttl = '90s'
431+
432+
query TT
433+
SHOW datafusion.runtime.list_files_cache_ttl
434+
----
435+
datafusion.runtime.list_files_cache_ttl 1m30s
436+
419437
# Note: runtime.temp_directory shows the actual temp directory path with a unique suffix,
420438
# so we cannot test the exact value. We verify it exists in information_schema instead.
421439

docs/source/library-user-guide/upgrading.md

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,15 +45,23 @@ directly on the `Field`. For example:
4545
In prior versions, `ListingTableProvider` would issue `LIST` commands to
4646
the underlying object store each time it needed to list files for a query.
4747
To improve performance, `ListingTableProvider` now caches the results of
48-
`LIST` commands for the lifetime of the `ListingTableProvider` instance.
48+
`LIST` commands for the lifetime of the `ListingTableProvider` instance or
49+
until a cache entry expires.
4950

5051
Note that by default the cache has no expiration time, so if files are added or removed
5152
from the underlying object store, the `ListingTableProvider` will not see
5253
those changes until the `ListingTableProvider` instance is dropped and recreated.
5354

54-
You will be able to configure the maximum cache size and cache expiration time via a configuration option:
55+
You can configure the maximum cache size and cache entry expiration time via configuration options:
5556

56-
See <https://github.com/apache/datafusion/issues/19056> for more details.
57+
`datafusion.runtime.list_files_cache_limit`
58+
`datafusion.runtime.list_files_cache_ttl`
59+
60+
Caching can be disabled by setting the limit to 0:
61+
62+
```sql
63+
SET datafusion.runtime.list_files_cache_limit TO "0K";
64+
```
5765

5866
Note that the internal API has changed to use a trait `ListFilesCache` instead of a type alias.
5967

0 commit comments

Comments
 (0)