
Commit aadc433

Merge pull request ClickHouse#84745 from ClickHouse/delta-lake-fix-column-pruning
Fix column pruning with delta-kernel
2 parents: 0d53a65 + 2337840

File tree: 3 files changed, +106 −11 lines

src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.cpp

Lines changed: 16 additions & 5 deletions
@@ -63,17 +63,28 @@ ReadFromFormatInfo DeltaLakeMetadataDeltaKernel::prepareReadingFromFormat(
 {
     auto info = DB::prepareReadingFromFormat(requested_columns, storage_snapshot, context, supports_subset_of_columns);
 
-    info.format_header.clear();
-    for (const auto & [column_name, column_type] : table_snapshot->getReadSchema())
-        info.format_header.insert({column_type->createColumn(), column_type, column_name});
-
     /// Read schema is different from table schema in case:
     /// 1. we have partition columns (they are not stored in the actual data)
     /// 2. columnMapping.mode = 'name' or 'id'.
     /// So we add partition columns to read schema and put it together into format_header.
     /// Partition values will be added to result data right after data is read.
-
     const auto & physical_names_map = table_snapshot->getPhysicalNamesMap();
+    const auto read_columns = table_snapshot->getReadSchema().getNameSet();
+
+    Block format_header;
+    for (auto && column_with_type_and_name : info.format_header)
+    {
+        auto physical_name = DeltaLake::getPhysicalName(column_with_type_and_name.name, physical_names_map);
+        if (!read_columns.contains(physical_name))
+        {
+            LOG_TEST(log, "Filtering out non-readable column: {}", column_with_type_and_name.name);
+            continue;
+        }
+        column_with_type_and_name.name = physical_name;
+        format_header.insert(std::move(column_with_type_and_name));
+    }
+    info.format_header = std::move(format_header);
+
     /// Update requested columns to reference actual physical column names.
     if (!physical_names_map.empty())
     {
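
The new loop starts from the header produced by prepareReadingFromFormat and keeps only the columns that the delta-kernel read schema can actually serve, renaming each survivor to its physical name. Below is a minimal, self-contained sketch of that pruning step, using hypothetical stand-in types (plain structs and standard containers rather than ClickHouse's Block/IDataType API):

// Sketch only: hypothetical stand-ins for the header, read schema and
// physical-name map; not the actual ClickHouse API.
#include <iostream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

struct Column { std::string name; std::string type; };

std::string getPhysicalName(
    const std::string & logical,
    const std::unordered_map<std::string, std::string> & physical_names_map)
{
    auto it = physical_names_map.find(logical);
    return it == physical_names_map.end() ? logical : it->second;
}

std::vector<Column> pruneToReadSchema(
    std::vector<Column> header,
    const std::unordered_set<std::string> & read_columns,
    const std::unordered_map<std::string, std::string> & physical_names_map)
{
    std::vector<Column> result;
    for (auto & column : header)
    {
        auto physical_name = getPhysicalName(column.name, physical_names_map);
        if (!read_columns.contains(physical_name))
            continue;                    /// not readable from the data files, drop it
        column.name = physical_name;     /// reference the physical column name
        result.push_back(std::move(column));
    }
    return result;
}

int main()
{
    /// "year" plays the role of a partition column absent from the read schema.
    std::vector<Column> header = {{"id", "Int32"}, {"name", "String"}, {"year", "String"}};
    std::unordered_set<std::string> read_columns = {"col-1", "col-2"};
    std::unordered_map<std::string, std::string> physical_names_map = {{"id", "col-1"}, {"name", "col-2"}};

    for (const auto & column : pruneToReadSchema(header, read_columns, physical_names_map))
        std::cout << column.name << '\n';   /// prints: col-1, col-2
}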

src/Storages/ObjectStorage/StorageObjectStorageSource.cpp

Lines changed: 15 additions & 6 deletions
@@ -550,15 +550,24 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade
 
     builder.init(Pipe(input_format));
 
-    std::shared_ptr<const ActionsDAG> transformer;
-    if (object_info->data_lake_metadata)
-        transformer = object_info->data_lake_metadata->transform;
+    std::optional<ActionsDAG> transformer;
+    if (object_info->data_lake_metadata && object_info->data_lake_metadata->transform)
+    {
+        transformer = object_info->data_lake_metadata->transform->clone();
+        /// FIXME: This is currently not done for the below case (configuration->getSchemaTransformer())
+        /// because it is an iceberg case where transformer contains columns ids (just increasing numbers)
+        /// which do not match requested_columns (while here requested_columns were adjusted to match physical columns).
+        transformer->removeUnusedActions(read_from_format_info.requested_columns.getNames());
+    }
     if (!transformer)
-        transformer = configuration->getSchemaTransformer(context_, object_info->getPath());
+    {
+        if (auto schema_transformer = configuration->getSchemaTransformer(context_, object_info->getPath()))
+            transformer = schema_transformer->clone();
+    }
 
-    if (transformer)
+    if (transformer.has_value())
     {
-        auto schema_modifying_actions = std::make_shared<ExpressionActions>(transformer->clone());
+        auto schema_modifying_actions = std::make_shared<ExpressionActions>(std::move(transformer.value()));
         builder.addSimpleTransform([&](const SharedHeader & header)
         {
             return std::make_shared<ExpressionTransform>(header, schema_modifying_actions);
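
The transformer is now held as a locally owned std::optional cloned from the shared data-lake metadata, so removeUnusedActions can prune it down to the requested columns without mutating shared state, and the pruned copy is then moved into the expression step. A compact sketch of that ownership pattern follows, again with hypothetical stand-in types instead of the real ActionsDAG/ExpressionActions classes:

// Sketch only: "Dag" is a hypothetical stand-in for a shared, immutable
// transformer that is cloned, pruned locally, and moved into the consumer.
#include <algorithm>
#include <iostream>
#include <memory>
#include <optional>
#include <string>
#include <vector>

struct Dag
{
    std::vector<std::string> outputs;

    Dag clone() const { return *this; }

    /// Keep only the outputs that were actually requested.
    void removeUnusedActions(const std::vector<std::string> & requested)
    {
        std::erase_if(outputs, [&](const std::string & name)
        {
            return std::find(requested.begin(), requested.end(), name) == requested.end();
        });
    }
};

int main()
{
    /// Shared, immutable transformer coming from the data lake metadata.
    auto shared_transform = std::make_shared<const Dag>(Dag{{"id", "name", "age", "country", "year"}});
    std::vector<std::string> requested_columns = {"id"};

    std::optional<Dag> transformer;
    if (shared_transform)
    {
        transformer = shared_transform->clone();              /// local, mutable copy
        transformer->removeUnusedActions(requested_columns);  /// prune without touching shared state
    }

    if (transformer.has_value())
    {
        Dag actions = std::move(transformer.value());         /// hand ownership to the consumer
        for (const auto & name : actions.outputs)
            std::cout << name << '\n';                        /// prints: id
    }
}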

tests/integration/test_storage_delta/test.py

Lines changed: 75 additions & 0 deletions
@@ -2208,3 +2208,78 @@ def test_filtering_by_virtual_columns(started_cluster):
             f"SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id = '{query_id}' and type = 'QueryFinish'"
         )
     )
+
+
+def test_column_pruning(started_cluster):
+    instance = started_cluster.instances["node1"]
+    spark = started_cluster.spark_session
+    minio_client = started_cluster.minio_client
+    bucket = started_cluster.minio_bucket
+    TABLE_NAME = randomize_table_name("test_column_pruning")
+    result_file = f"{TABLE_NAME}"
+    partition_columns = []
+
+    schema = StructType(
+        [
+            StructField("id", IntegerType(), nullable=False),
+            StructField("name", StringType(), nullable=False),
+            StructField("age", IntegerType(), nullable=False),
+            StructField("country", StringType(), nullable=False),
+            StructField("year", StringType(), nullable=False),
+        ]
+    )
+
+    num_rows = 10000
+    now = datetime.now()
+    data = [
+        (i, f"name_{i}", 32, "".join("a" for _ in range(100)), "2025")
+        for i in range(num_rows)
+    ]
+    df = spark.createDataFrame(data=data, schema=schema)
+    df.printSchema()
+    df.write.mode("append").format("delta").partitionBy(partition_columns).save(
+        f"/{TABLE_NAME}"
+    )
+
+    minio_client = started_cluster.minio_client
+    bucket = started_cluster.minio_bucket
+
+    files = upload_directory(minio_client, bucket, f"/{TABLE_NAME}", "")
+    assert len(files) > 0
+    print(f"Uploaded files: {files}")
+
+    table_function = f"deltaLake('http://{started_cluster.minio_ip}:{started_cluster.minio_port}/{bucket}/{result_file}/', 'minio', '{minio_secret_key}')"
+
+    query_id = f"query_{TABLE_NAME}_1"
+    sum = int(
+        instance.query(
+            f"SELECT sum(id) FROM {table_function} SETTINGS allow_experimental_delta_kernel_rs=0, max_read_buffer_size_remote_fs=100",
+            query_id=query_id,
+        )
+    )
+    instance.query("SYSTEM FLUSH LOGS")
+    assert 107220 == int(
+        instance.query(
+            f"SELECT ProfileEvents['ReadBufferFromS3Bytes'] FROM system.query_log WHERE query_id = '{query_id}' and type = 'QueryFinish'"
+        )
+    )
+
+    query_id = f"query_{TABLE_NAME}_2"
+    assert sum == int(
+        instance.query(
+            f"SELECT sum(id) FROM {table_function} SETTINGS enable_filesystem_cache=0, max_read_buffer_size_remote_fs=100",
+            query_id=query_id,
+        )
+    )
+    instance.query("SYSTEM FLUSH LOGS")
+    assert 1 == int(
+        instance.query(
+            f"SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id = '{query_id}' and type = 'QueryFinish'"
+        )
+    )
+    # Small diff because in case of delta-kernel metadata reading is not counted in the metric.
+    assert 105677 == int(
+        instance.query(
+            f"SELECT ProfileEvents['ReadBufferFromS3Bytes'] FROM system.query_log WHERE query_id = '{query_id}' and type = 'QueryFinish'"
+        )
+    )
