
Commit 4b4801c

Add Delta Compact Function (#142)
1 parent 2320dc6 commit 4b4801c

4 files changed: +101 −1 lines changed


spark_utils/delta_lake/functions.py

Lines changed: 65 additions & 1 deletion
@@ -24,7 +24,8 @@
 Helper functions for Delta Lake
 """
 import re
-from typing import Iterator
+from typing import Iterator, Optional
+
 
 from delta import DeltaTable
 from pyspark.sql import SparkSession
@@ -157,3 +158,66 @@ def get_table_info(
         return [table_col for table_col in cols if table_col.name not in parts_names], parts, table_path
 
     return cols, [], table_path
+
+
+def delta_compact(
+    spark_session: SparkSession,
+    path: str,
+    retain_hours: float = 48,
+    compact_from_predicate: Optional[str] = None,
+    target_file_size_bytes: Optional[int] = None,
+    vacuum_only: bool = True,
+    refresh_cache: bool = False,
+) -> None:
+    """
+    Runs bin-packing optimization to reduce the number of files and increase the average file size in the table's physical storage.
+    Refreshes the Delta cache after optimize/vacuum have finished.
+    https://docs.delta.io/latest/optimizations-oss.html#optimizations
+
+    :param spark_session: Spark session that will perform the operation.
+    :param path: Path to the Delta table, either a filesystem path or a Hive table name.
+    :param retain_hours: Age of data to retain; defaults to 48 hours.
+    :param compact_from_predicate: Optional SQL predicate string selecting the subset of data to compact.
+    :param target_file_size_bytes: Optional target file size in bytes. Defaults to the system default (1 GB for Delta 2.1) if not provided.
+    :param vacuum_only: If True, only run a vacuum operation, without compaction.
+    :param refresh_cache: If True, refresh the table cache for this Spark session.
+    :return:
+    """
+    spark_session.conf.set("spark.databricks.delta.optimize.repartition.enabled", "true")
+
+    table_to_compact = (
+        DeltaTable.forPath(sparkSession=spark_session, path=path)
+        if "://" in path
+        else DeltaTable.forName(sparkSession=spark_session, tableOrViewName=path)
+    )
+
+    if not vacuum_only:
+        if target_file_size_bytes:
+            spark_session.conf.set("spark.databricks.delta.optimize.minFileSize", str(target_file_size_bytes))
+            spark_session.conf.set("spark.databricks.delta.optimize.maxFileSize", str(target_file_size_bytes))
+
+        if compact_from_predicate:
+            table_to_compact.optimize().where(compact_from_predicate).executeCompaction()
+        else:
+            table_to_compact.optimize().executeCompaction()
+
+    table_path = f"delta.`{path}`" if "://" in path else path
+    current_interval = int(
+        re.search(
+            r"\b\d+\b",
+            table_to_compact.detail().head().properties.get("delta.logRetentionDuration", "interval 168 hours"),
+        ).group()
+    )
+
+    if current_interval != round(retain_hours):
+        spark_session.sql(
+            f"ALTER TABLE {table_path} SET TBLPROPERTIES ('delta.logRetentionDuration'='interval {round(retain_hours)} hours')"
+        )
+
+    table_to_compact.vacuum(retentionHours=retain_hours)
+
+    if refresh_cache:
+        if "://" in path:
+            spark_session.sql(f"refresh {path}")
+        else:
+            spark_session.sql(f"refresh table {path}")

test/common.py

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+import pathlib
+
+from pyspark.sql import SparkSession
+
+
+def generate_table(spark_session: SparkSession, suffix: str, dir=None) -> str:
+    test_data_path = f"{pathlib.Path(__file__).parent.resolve()}/{suffix}" if not dir else dir
+    df = spark_session.range(100)
+
+    for _ in range(10):
+        df.write.format("delta").mode("overwrite").save(test_data_path)
+
+    return test_data_path

test/conftest.py

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@ def spark_session():
         additional_configs={
             "spark.driver.extraJavaOptions": java_17_launch_options,
             "spark.executor.extraJavaOptions": java_17_launch_options,
+            "spark.databricks.delta.retentionDurationCheck.enabled": "false",
         }
     ).get_session()
 

test/test_delta_lake_functions.py

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
+from spark_utils.delta_lake.functions import delta_compact
+from glob import glob
+from pyspark.sql import SparkSession
+
+from test.common import generate_table
+
+
+def test_delta_compact(spark_session: SparkSession):
+    test_data_path = generate_table(spark_session, "compact")
+
+    delta_compact(
+        spark_session=spark_session,
+        path=f"file://{test_data_path}",
+        retain_hours=0,
+        vacuum_only=False,
+    )
+
+    num_parquet_files = len(glob(f"{test_data_path}/*.parquet"))
+    num_log_files = len(glob(f"{test_data_path}/_delta_log/*.json"))
+
+    # logs are cleaned on a daily basis, so we cannot test the log retention
+    assert num_parquet_files == 1 and num_log_files == 14
