@@ -2791,3 +2791,129 @@ def test_key_value_args(started_cluster):
27912791 f"S3(\\ '{ url } \\ ', \\ 'TSVRaw\\ ', format = \\ 'TSVRaw\\ ', access_key_id = \\ 'minio\\ ', secret_access_key = \\ '[HIDDEN]\\ ', compression_method = \\ 'gzip\\ ')"
27922792 in node .query (f"SHOW CREATE TABLE { table_name } " )
27932793 )


def test_file_pruning_with_hive_style_partitioning(started_cluster):
    node = started_cluster.instances["dummy"]
    table_name = f"test_pruning_with_hive_style_partitioning_{generate_random_string()}"
    bucket = started_cluster.minio_bucket
    minio = started_cluster.minio_client

    url = f"http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{table_name}"
    node.query(
        f"""
        CREATE TABLE {table_name} (a Int32, b Int32, c String) ENGINE = S3('{url}', format = 'Parquet', partition_strategy = 'hive')
        PARTITION BY (b, c)
        """
    )
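    # 20 rows: b = number % 5 takes values 0..4 and c = number % 2 takes '0'/'1',
    # so the insert produces 10 distinct (b, c) partitions with 2 rows each.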
    node.query(
        f"INSERT INTO {table_name} SELECT number, number % 5, toString(number % 2) FROM numbers(20)",
        settings={"use_hive_partitioning": True},
    )

    objects = []
    for obj in list(
        minio.list_objects(
            started_cluster.minio_bucket,
            prefix=table_name,
            recursive=True,
        )
    ):
        objects.append(obj.object_name)

    objects.sort()
    assert len(objects) == 10

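    # The parent directory of each object encodes the partition values in hive
    # "key=value" form. `Path` here is pathlib.Path, assumed to be imported at the
    # top of this test module.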
    prefixes = []
    for object in objects:
        assert object.endswith(".parquet")
        path = Path(object)
        prefixes.append(str(path.parent))

    assert len(prefixes) == 10
    assert prefixes == [
        f"{table_name}/b=0/c=0",
        f"{table_name}/b=0/c=1",
        f"{table_name}/b=1/c=0",
        f"{table_name}/b=1/c=1",
        f"{table_name}/b=2/c=0",
        f"{table_name}/b=2/c=1",
        f"{table_name}/b=3/c=0",
        f"{table_name}/b=3/c=1",
        f"{table_name}/b=4/c=0",
        f"{table_name}/b=4/c=1",
    ]

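    # Helper: read ProfileEvents['EngineFileLikeReadFiles'] from system.query_log to
    # see how many files a finished query actually opened; SYSTEM FLUSH LOGS makes
    # sure the query_log entry is written before we read it.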
    def check_read_files(expected, query_id):
        node.query("SYSTEM FLUSH LOGS")
        assert expected == int(
            node.query(
                f"SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id = '{query_id}' AND type='QueryFinish'"
            )
        )

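    # Pattern for each predicate below: first count the distinct matching files via
    # the _path virtual column, then run the query with a known query_id and confirm
    # that only that many files were read, i.e. partition pruning kicked in.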
    # 5 files, each file contains 2 rows
    assert 5 == int(
        node.query(f"SELECT uniqExact(_path) FROM {table_name} WHERE c == '0'")
    )

    query_id = f"{table_name}_query_1"
    assert 10 == int(
        node.query(
            f"SELECT count() FROM {table_name} WHERE c == '0'", query_id=query_id
        )
    )
    # Check files are pruned.
    check_read_files(5, query_id)

    # 2 files, each contains 2 rows
    assert 2 == int(
        node.query(f"SELECT uniqExact(_path) FROM {table_name} WHERE b == 3")
    )

    query_id = f"{table_name}_query_2"
    assert 4 == int(
        node.query(f"SELECT count() FROM {table_name} WHERE b == 3", query_id=query_id)
    )
    # Check files are pruned.
    check_read_files(2, query_id)

    # 1 file with 2 rows.
    assert 1 == int(
        node.query(
            f"SELECT uniqExact(_path) FROM {table_name} WHERE b == 3 AND c == '1'"
        )
    )

    query_id = f"{table_name}_query_3"
    assert 2 == int(
        node.query(
            f"SELECT count() FROM {table_name} WHERE b == 3 AND c == '1'",
            query_id=query_id,
        )
    )
    # Check files are pruned.
    check_read_files(1, query_id)

    query_id = f"{table_name}_query_4"
    assert 1 == int(
        node.query(f"SELECT count() FROM {table_name} WHERE a == 1", query_id=query_id)
    )
    # Nothing is pruned, because `a` is not a partition column.
    check_read_files(10, query_id)


def test_partition_by_without_wildcard(started_cluster):
    node = started_cluster.instances["dummy"]
    table_name = f"test_partition_by_without_wildcard_{generate_random_string()}"
    bucket = started_cluster.minio_bucket

    url = f"http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{table_name}"
    # An exception "Partition strategy wildcard can not be used without a '_partition_id' wildcard"
    # should not be thrown.
    node.query(
        f"""
        CREATE TABLE {table_name} (a Int32, b Int32, c String) ENGINE = S3('{url}', format = 'Parquet')
        PARTITION BY (b, c)
        """
    )