
Commit d122461

Backport ClickHouse#86748 to 25.8: Fix incompatible exception about wildcard partition strategy
1 parent 25a47e3 commit d122461

File tree

5 files changed: +139 additions, -5 deletions

src/Storages/IPartitionStrategy.cpp

Lines changed: 8 additions & 0 deletions
@@ -232,8 +232,16 @@ std::shared_ptr<IPartitionStrategy> PartitionStrategyFactory::get(StrategyType s
                 globbed_path,
                 partition_columns_in_data_file);
         case StrategyType::NONE:
+        {
+            if (!partition_columns_in_data_file && strategy == PartitionStrategyFactory::StrategyType::NONE)
+            {
+                throw Exception(
+                    ErrorCodes::BAD_ARGUMENTS,
+                    "Partition strategy `none` cannot be used with partition_columns_in_data_file=0");
+            }
             /// Unreachable for plain object storage, used only by Data Lakes for now
             return nullptr;
+        }
     }
 }
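
To make the contradiction this guard rejects concrete: the `none` strategy applies no partitioning layer on write, so partition column values can only live inside the data files themselves; combining it with `partition_columns_in_data_file=0` leaves the columns with nowhere to go. A minimal SQL sketch of the rejected combination (hypothetical URL and table name; note the `none` strategy is currently reached only via data lakes, so whether the plain S3 engine accepts this spelling directly may vary):

-- Expected to fail with BAD_ARGUMENTS:
-- "Partition strategy `none` cannot be used with partition_columns_in_data_file=0"
CREATE TABLE t_none (a Int32, b Int32)
ENGINE = S3('http://minio:9000/bucket/t_none', format = 'Parquet',
            partition_strategy = 'none', partition_columns_in_data_file = 0)
PARTITION BY b;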

src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp

Lines changed: 5 additions & 2 deletions
@@ -82,8 +82,11 @@ void StorageObjectStorageConfiguration::initialize(
     }
     else if (configuration_to_initialize.partition_strategy_type == PartitionStrategyFactory::StrategyType::NONE)
     {
-        // Promote to wildcard in case it is not data lake to make it backwards compatible
-        configuration_to_initialize.partition_strategy_type = PartitionStrategyFactory::StrategyType::WILDCARD;
+        if (configuration_to_initialize.getRawPath().hasPartitionWildcard())
+        {
+            // Promote to wildcard in case it is not data lake to make it backwards compatible
+            configuration_to_initialize.partition_strategy_type = PartitionStrategyFactory::StrategyType::WILDCARD;
+        }
     }
 
     if (configuration_to_initialize.format == "auto")
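
This is the behavioral core of the backport. Previously a `none` strategy on plain object storage was unconditionally promoted to `wildcard`, and the wildcard strategy in turn rejects any path lacking a `{_partition_id}` placeholder, so a plain `PARTITION BY` table without the placeholder could not even be created. Promotion now happens only when the raw path actually contains the wildcard. Roughly, with a hypothetical MinIO endpoint:

-- Path contains {_partition_id}: still promoted to the wildcard strategy.
CREATE TABLE t_wildcard (a Int32, b Int32)
ENGINE = S3('http://minio:9000/bucket/t/{_partition_id}.parquet', format = 'Parquet')
PARTITION BY b;

-- Path without the placeholder: before this fix, creation failed with
-- "Partition strategy wildcard can not be used without a '_partition_id' wildcard";
-- after it, the strategy stays `none` and the statement succeeds.
CREATE TABLE t_plain (a Int32, b Int32)
ENGINE = S3('http://minio:9000/bucket/t_plain', format = 'Parquet')
PARTITION BY b;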

tests/integration/test_storage_s3/test.py

Lines changed: 126 additions & 0 deletions
@@ -2791,3 +2791,129 @@ def test_key_value_args(started_cluster):
         f"S3(\\'{url}\\', \\'TSVRaw\\', format = \\'TSVRaw\\', access_key_id = \\'minio\\', secret_access_key = \\'[HIDDEN]\\', compression_method = \\'gzip\\')"
         in node.query(f"SHOW CREATE TABLE {table_name}")
     )
+
+
+def test_file_pruning_with_hive_style_partitioning(started_cluster):
+    node = started_cluster.instances["dummy"]
+    table_name = f"test_pruning_with_hive_style_partitioning_{generate_random_string()}"
+    bucket = started_cluster.minio_bucket
+    minio = started_cluster.minio_client
+
+    url = f"http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{table_name}"
+    node.query(
+        f"""
+        CREATE TABLE {table_name} (a Int32, b Int32, c String) ENGINE = S3('{url}', format = 'Parquet', partition_strategy = 'hive')
+        PARTITION BY (b, c)
+        """
+    )
+    node.query(
+        f"INSERT INTO {table_name} SELECT number, number % 5, toString(number % 2) FROM numbers(20)",
+        settings={"use_hive_partitioning": True},
+    )
+
+    objects = []
+    for obj in list(
+        minio.list_objects(
+            started_cluster.minio_bucket,
+            prefix=table_name,
+            recursive=True,
+        )
+    ):
+        objects.append(obj.object_name)
+
+    objects.sort()
+    assert len(objects) == 10
+
+    prefixes = []
+    for object in objects:
+        assert object.endswith(".parquet")
+        path = Path(object)
+        prefixes.append(str(path.parent))
+
+    assert len(prefixes) == 10
+    assert prefixes == [
+        f"{table_name}/b=0/c=0",
+        f"{table_name}/b=0/c=1",
+        f"{table_name}/b=1/c=0",
+        f"{table_name}/b=1/c=1",
+        f"{table_name}/b=2/c=0",
+        f"{table_name}/b=2/c=1",
+        f"{table_name}/b=3/c=0",
+        f"{table_name}/b=3/c=1",
+        f"{table_name}/b=4/c=0",
+        f"{table_name}/b=4/c=1",
+    ]
+
+    def check_read_files(expected, query_id):
+        node.query("SYSTEM FLUSH LOGS")
+        assert expected == int(
+            node.query(
+                f"SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id = '{query_id}' AND type='QueryFinish'"
+            )
+        )
+
+    # 5 files, each file contains 2 rows
+    assert 5 == int(
+        node.query(f"SELECT uniqExact(_path) FROM {table_name} WHERE c == '0'")
+    )
+
+    query_id = f"{table_name}_query_1"
+    assert 10 == int(
+        node.query(
+            f"SELECT count() FROM {table_name} WHERE c == '0'", query_id=query_id
+        )
+    )
+    # Check files are pruned.
+    check_read_files(5, query_id)
+
+    # 2 files, each contains 2 rows
+    assert 2 == int(
+        node.query(f"SELECT uniqExact(_path) FROM {table_name} WHERE b == 3")
+    )
+
+    query_id = f"{table_name}_query_2"
+    assert 4 == int(
+        node.query(f"SELECT count() FROM {table_name} WHERE b == 3", query_id=query_id)
+    )
+    # Check files are pruned.
+    check_read_files(2, query_id)
+
+    # 1 file with 2 rows.
+    assert 1 == int(
+        node.query(
+            f"SELECT uniqExact(_path) FROM {table_name} WHERE b == 3 AND c == '1'"
+        )
+    )
+
+    query_id = f"{table_name}_query_3"
+    assert 2 == int(
+        node.query(
+            f"SELECT count() FROM {table_name} WHERE b == 3 AND c == '1'",
+            query_id=query_id,
+        )
+    )
+    # Check files are pruned.
+    check_read_files(1, query_id)
+
+    query_id = f"{table_name}_query_4"
+    assert 1 == int(
+        node.query(f"SELECT count() FROM {table_name} WHERE a == 1", query_id=query_id)
+    )
+    # Nothing is pruned, because `a` is not a partition column.
+    check_read_files(10, query_id)
+
+
+def test_partition_by_without_wildcard(started_cluster):
+    node = started_cluster.instances["dummy"]
+    table_name = f"test_partition_by_without_wildcard_{generate_random_string()}"
+    bucket = started_cluster.minio_bucket
+
+    url = f"http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{table_name}"
+    # An exception "Partition strategy wildcard can not be used without a '_partition_id' wildcard"
+    # should not be thrown.
+    node.query(
+        f"""
+        CREATE TABLE {table_name} (a Int32, b Int32, c String) ENGINE = S3('{url}', format = 'Parquet')
+        PARTITION BY (b, c)
+        """
+    )
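
The pruning assertions above reduce to one pattern: compare the number of distinct files a filter can touch (`uniqExact(_path)`) with the number of files the engine actually opened (the `EngineFileLikeReadFiles` profile event for that query). The same check can be run by hand; a sketch, with a placeholder table name and a query_id supplied by the client (e.g. clickhouse-client --query_id=pruning_check):

SELECT count() FROM test_table WHERE b = 3;

SYSTEM FLUSH LOGS;

SELECT ProfileEvents['EngineFileLikeReadFiles']
FROM system.query_log
WHERE query_id = 'pruning_check' AND type = 'QueryFinish';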

tests/queries/0_stateless/03547_s3_partition_by_require_partition_wildcard.reference

Whitespace-only changes.

tests/queries/0_stateless/03547_s3_partition_by_require_partition_wildcard.sql

Lines changed: 0 additions & 3 deletions
This file was deleted.
