|
| 1 | +# Databricks notebook source |
| 2 | +# MAGIC %md |
| 3 | +# MAGIC # Detect tables with too many small files |
| 4 | +# MAGIC |
| 5 | +# MAGIC Delta tables are composed of multiple `parquet` files. A table with too many small files might lead to performance degradation. The optimal file size depends on the workload, but it generally ranges between `10 MB` and `1000 MB`. |
| 6 | +# MAGIC |
| 7 | +# MAGIC As a rule of thumb, if a table has more than `100` files and an average file size smaller than `10 MB`, we can consider it to have too many small files. |
| 8 | +# MAGIC |
| 9 | +# MAGIC Some common causes of too many small files are: |
| 10 | +# MAGIC * Overpartitioning: the cardinality of the partition columns is too high |
| 11 | +# MAGIC * Lack of scheduled maintenance operations like `OPTIMIZE` |
| 12 | +# MAGIC * Auto optimize (optimized writes and auto compaction) not enabled on the table |
| 13 | +# MAGIC |
| 14 | +# MAGIC This notebook helps you identify the tables that might require a review. |
| 15 | + |
| 16 | +# COMMAND ---------- |
| 17 | + |
| 18 | +# MAGIC %pip install dbl-discoverx |
| 19 | + |
| 20 | +# COMMAND ---------- |
| 21 | + |
| 22 | +dbutils.widgets.text("from_tables", "*.*.*") |
| 23 | +from_tables = dbutils.widgets.get("from_tables") |
| 24 | + |
| 25 | +# Define how small is too small |
| 26 | +small_file_max_size_MB = 10 |
| 27 | + |
| 28 | +# It's okay to have small files as long as there are not too many |
| 29 | +min_file_number = 100 |
| 30 | + |
| 31 | +# COMMAND ---------- |
| 32 | + |
| 33 | +from discoverx import DX |
| 34 | +# DiscoverX entry point; the detection cell calls `dx.from_tables(...)` on it. |
| 35 | +dx = DX() |
| 36 | + |
| 37 | +# COMMAND ---------- |
| 38 | + |
| 39 | +from pyspark.sql.functions import col, lit |
| 40 | + |
| 41 | +dx.from_tables(from_tables)\ |
| 42 | + .apply_sql("DESCRIBE DETAIL {full_table_name}")\ |
| 43 | + .to_union_dataframe()\ |
| 44 | + .withColumn("average_file_size_MB", col("sizeInBytes") / col("numFiles") / 1024 / 1024)\ |
| 45 | + .withColumn("has_too_many_small_files", |
| 46 | + (col("average_file_size_MB") < small_file_max_size_MB) & |
| 47 | + (col("numFiles") > min_file_number))\ |
| 48 | + .filter("has_too_many_small_files")\ |
| 49 | + .display() |
| 50 | + |
| 51 | +# COMMAND ---------- |
| 52 | + |
| 53 | + |
0 commit comments