Skip to content

Commit 13acb2f

Browse files
committed
feat: introduce VacuumMode::Full for cleaning up orphaned files
This allows an optional but not-on-by-default mode of removing untracked files in the delta table directory. Delta/Spark supports a "lite" and "full" mode for [vacuum]. This change is intentionally not making "full" the default as it is for Delta/Spark since that may have unintended consequences for our users who have become accustomed to "lite" being the default. Fixes #2349 [vacuum]: https://docs.delta.io/latest/delta-utility.html#remove-files-no-longer-referenced-by-a-delta-table Signed-off-by: R. Tyler Croy <[email protected]>
1 parent e47bba5 commit 13acb2f

File tree

4 files changed

+99
-13
lines changed

4 files changed

+99
-13
lines changed

crates/core/src/operations/vacuum.rs

Lines changed: 86 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ use futures::{StreamExt, TryStreamExt};
3131
use object_store::Error;
3232
use object_store::{path::Path, ObjectStore};
3333
use serde::Serialize;
34+
use tracing::log::*;
3435

3536
use super::{CustomExecuteHandler, Operation};
3637
use crate::errors::{DeltaResult, DeltaTableError};
@@ -76,6 +77,20 @@ pub trait Clock: Debug + Send + Sync {
7677
fn current_timestamp_millis(&self) -> i64;
7778
}
7879

80+
/// Type of Vacuum operation to perform
81+
#[derive(Debug, Default, Clone, PartialEq)]
82+
pub enum VacuumMode {
83+
/// The `lite` mode will only remove files which are referenced in the `_delta_log` associated
84+
/// with `remove` action
85+
#[default]
86+
Lite,
87+
/// A `full` mode vacuum will remove _all_ data files no longer actively referenced in the
88+
/// `_delta_log` table. For example, if parquet files exist in the table directory but are no
89+
/// longer mentioned as `add` actions in the transaction log, then this mode will scan storage
90+
/// and remove those files.
91+
Full,
92+
}
93+
7994
/// Vacuum a Delta table with the given options
8095
/// See this module's documentation for more information
8196
pub struct VacuumBuilder {
@@ -89,6 +104,8 @@ pub struct VacuumBuilder {
89104
enforce_retention_duration: bool,
90105
/// Don't delete the files. Just determine which files can be deleted
91106
dry_run: bool,
107+
/// Mode of vacuum that should be run
108+
mode: VacuumMode,
92109
/// Override the source of time
93110
clock: Option<Arc<dyn Clock>>,
94111
/// Additional information to add to the commit
@@ -144,6 +161,7 @@ impl VacuumBuilder {
144161
retention_period: None,
145162
enforce_retention_duration: true,
146163
dry_run: false,
164+
mode: VacuumMode::Lite,
147165
clock: None,
148166
commit_properties: CommitProperties::default(),
149167
custom_execute_handler: None,
@@ -156,6 +174,12 @@ impl VacuumBuilder {
156174
self
157175
}
158176

177+
/// Override the default vacuum mode (lite)
178+
pub fn with_mode(mut self, mode: VacuumMode) -> Self {
179+
self.mode = mode;
180+
self
181+
}
182+
159183
/// Only determine which files should be deleted
160184
pub fn with_dry_run(mut self, dry_run: bool) -> Self {
161185
self.dry_run = dry_run;
@@ -189,6 +213,10 @@ impl VacuumBuilder {
189213

190214
/// Determine which files can be deleted. Does not actually perform the deletion
191215
async fn create_vacuum_plan(&self) -> Result<VacuumPlan, VacuumError> {
216+
if self.mode == VacuumMode::Full {
217+
info!("Vacuum configured to run with 'VacuumMode::Full'. It will scan for orphaned parquet files in the Delta table directory and remove those as well!");
218+
}
219+
192220
let min_retention = Duration::milliseconds(
193221
self.snapshot
194222
.table_config()
@@ -228,12 +256,24 @@ impl VacuumBuilder {
228256
while let Some(obj_meta) = all_files.next().await {
229257
// TODO should we allow NotFound here in case we have a temporary commit file in the list
230258
let obj_meta = obj_meta.map_err(DeltaTableError::from)?;
231-
if valid_files.contains(&obj_meta.location) // file is still being tracked in table
232-
|| !expired_tombstones.contains(obj_meta.location.as_ref()) // file is not an expired tombstone
233-
|| is_hidden_directory(partition_columns, &obj_meta.location)?
234-
{
259+
// file is still being tracked in table
260+
if valid_files.contains(&obj_meta.location) {
235261
continue;
236262
}
263+
if is_hidden_directory(partition_columns, &obj_meta.location)? {
264+
continue;
265+
}
266+
// file is not an expired tombstone _and_ this is a "Lite" vacuum
267+
// If the file is not an expired tombstone and we have gotten to here with a
268+
// VacuumMode::Full then it should be added to the deletion plan
269+
if !expired_tombstones.contains(obj_meta.location.as_ref()) {
270+
if self.mode == VacuumMode::Lite {
271+
debug!("The file {:?} was not referenced in a log file, but VacuumMode::Lite means it will not be vacuumed", &obj_meta.location);
272+
continue;
273+
} else {
274+
debug!("The file {:?} was not referenced in a log file, but VacuumMode::Full means it *will be vacuumed*", &obj_meta.location);
275+
}
276+
}
237277

238278
files_to_delete.push(obj_meta.location);
239279
file_sizes.push(obj_meta.size as i64);
@@ -436,7 +476,44 @@ mod tests {
436476
use std::time::SystemTime;
437477

438478
#[tokio::test]
439-
async fn vacuum_delta_8_0_table() {
479+
async fn test_vacuum_full() -> DeltaResult<()> {
480+
let table = open_table("../test/tests/data/simple_commit").await?;
481+
482+
let (_table, result) = VacuumBuilder::new(table.log_store(), table.snapshot()?.clone())
483+
.with_retention_period(Duration::hours(0))
484+
.with_dry_run(true)
485+
.with_mode(VacuumMode::Lite)
486+
.with_enforce_retention_duration(false)
487+
.await?;
488+
// When running lite, this table with superfluous parquet files should not have anything to
489+
// delete
490+
assert!(result.files_deleted.is_empty());
491+
492+
let (_table, result) = VacuumBuilder::new(table.log_store(), table.snapshot()?.clone())
493+
.with_retention_period(Duration::hours(0))
494+
.with_dry_run(true)
495+
.with_mode(VacuumMode::Full)
496+
.with_enforce_retention_duration(false)
497+
.await?;
498+
let mut files_deleted = result.files_deleted.clone();
499+
files_deleted.sort();
500+
// When running with full, these superfluous parquet files which are not actually
501+
// referenced in the _delta_log commits should be considered for the
502+
// low-orbit ion-cannon
503+
assert_eq!(
504+
files_deleted,
505+
vec![
506+
"part-00000-512e1537-8aaa-4193-b8b4-bef3de0de409-c000.snappy.parquet",
507+
"part-00000-b44fcdb0-8b06-4f3a-8606-f8311a96f6dc-c000.snappy.parquet",
508+
"part-00001-185eca06-e017-4dea-ae49-fc48b973e37e-c000.snappy.parquet",
509+
"part-00001-4327c977-2734-4477-9507-7ccf67924649-c000.snappy.parquet",
510+
]
511+
);
512+
Ok(())
513+
}
514+
515+
#[tokio::test]
516+
async fn vacuum_delta_8_0_table() -> DeltaResult<()> {
440517
let table = open_table("../test/tests/data/delta-0.8.0").await.unwrap();
441518

442519
let result = VacuumBuilder::new(table.log_store(), table.snapshot().unwrap().clone())
@@ -453,8 +530,7 @@ mod tests {
453530
.with_retention_period(Duration::hours(0))
454531
.with_dry_run(true)
455532
.with_enforce_retention_duration(false)
456-
.await
457-
.unwrap();
533+
.await?;
458534
// do not enforce retention duration check with 0 hour will purge all files
459535
assert_eq!(
460536
result.files_deleted,
@@ -465,8 +541,7 @@ mod tests {
465541
VacuumBuilder::new(table.log_store(), table.snapshot().unwrap().clone())
466542
.with_retention_period(Duration::hours(169))
467543
.with_dry_run(true)
468-
.await
469-
.unwrap();
544+
.await?;
470545

471546
assert_eq!(
472547
result.files_deleted,
@@ -483,9 +558,9 @@ mod tests {
483558
VacuumBuilder::new(table.log_store(), table.snapshot().unwrap().clone())
484559
.with_retention_period(Duration::hours(retention_hours as i64))
485560
.with_dry_run(true)
486-
.await
487-
.unwrap();
561+
.await?;
488562

489563
assert_eq!(result.files_deleted, empty);
564+
Ok(())
490565
}
491566
}

python/deltalake/_internal.pyi

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ class RawDeltaTable:
104104
enforce_retention_duration: bool,
105105
commit_properties: CommitProperties | None,
106106
post_commithook_properties: PostCommitHookProperties | None,
107+
full: bool,
107108
) -> list[str]: ...
108109
def compact_optimize(
109110
self,

python/deltalake/table.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -516,6 +516,7 @@ def vacuum(
516516
enforce_retention_duration: bool = True,
517517
post_commithook_properties: PostCommitHookProperties | None = None,
518518
commit_properties: CommitProperties | None = None,
519+
full: bool = False,
519520
) -> list[str]:
520521
"""
521522
Run the Vacuum command on the Delta Table: list and delete files no longer referenced by the Delta table and are older than the retention threshold.
@@ -526,6 +527,7 @@ def vacuum(
526527
enforce_retention_duration: when disabled, accepts retention hours smaller than the value from `delta.deletedFileRetentionDuration`.
527528
post_commithook_properties: properties for the post commit hook. If None, default values are used.
528529
commit_properties: properties of the transaction commit. If None, default values are used.
530+
full: when set to True, will perform a "full" vacuum and remove all files not referenced in the transaction log
529531
Returns:
530532
the list of files no longer referenced by the Delta Table and are older than the retention threshold.
531533
"""
@@ -539,6 +541,7 @@ def vacuum(
539541
enforce_retention_duration,
540542
commit_properties,
541543
post_commithook_properties,
544+
full,
542545
)
543546

544547
def update(

python/src/lib.rs

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ use deltalake::operations::restore::RestoreBuilder;
5353
use deltalake::operations::set_tbl_properties::SetTablePropertiesBuilder;
5454
use deltalake::operations::update::UpdateBuilder;
5555
use deltalake::operations::update_field_metadata::UpdateFieldMetadataBuilder;
56-
use deltalake::operations::vacuum::VacuumBuilder;
56+
use deltalake::operations::vacuum::{VacuumBuilder, VacuumMode};
5757
use deltalake::operations::write::WriteBuilder;
5858
use deltalake::operations::CustomExecuteHandler;
5959
use deltalake::parquet::basic::Compression;
@@ -477,7 +477,8 @@ impl RawDeltaTable {
477477

478478
/// Run the Vacuum command on the Delta Table: list and delete files no longer referenced
479479
/// by the Delta table and are older than the retention threshold.
480-
#[pyo3(signature = (dry_run, retention_hours = None, enforce_retention_duration = true, commit_properties=None, post_commithook_properties=None))]
480+
#[pyo3(signature = (dry_run, retention_hours = None, enforce_retention_duration = true, commit_properties=None, post_commithook_properties=None, full = false))]
481+
#[allow(clippy::too_many_arguments)]
481482
pub fn vacuum(
482483
&self,
483484
py: Python,
@@ -486,6 +487,7 @@ impl RawDeltaTable {
486487
enforce_retention_duration: bool,
487488
commit_properties: Option<PyCommitProperties>,
488489
post_commithook_properties: Option<PyPostCommitHookProperties>,
490+
full: bool,
489491
) -> PyResult<Vec<String>> {
490492
let (table, metrics) = py.allow_threads(|| {
491493
let snapshot = match self._table.lock() {
@@ -499,6 +501,11 @@ impl RawDeltaTable {
499501
let mut cmd = VacuumBuilder::new(self.log_store()?, snapshot)
500502
.with_enforce_retention_duration(enforce_retention_duration)
501503
.with_dry_run(dry_run);
504+
505+
if full {
506+
cmd = cmd.with_mode(VacuumMode::Full);
507+
}
508+
502509
if let Some(retention_period) = retention_hours {
503510
cmd = cmd.with_retention_period(Duration::hours(retention_period as i64));
504511
}

0 commit comments

Comments
 (0)