
Commit 948b085

scanhex12 authored and zvonand committed
Merge pull request ClickHouse#87508 from scanhex12/distributed_execution_better_spread
Distributed execution: better split tasks
1 parent 02ac2fb commit 948b085

29 files changed: +521 −40 lines changed

src/Core/ProtocolDefines.h

Lines changed: 4 additions & 2 deletions
@@ -35,8 +35,10 @@ static constexpr auto DBMS_MIN_REVISION_WITH_AGGREGATE_FUNCTIONS_VERSIONING = 54
 
 static constexpr auto DBMS_CLUSTER_INITIAL_PROCESSING_PROTOCOL_VERSION = 1;
 static constexpr auto DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION_WITH_DATA_LAKE_METADATA = 2;
-static constexpr auto DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION_WITH_DATA_LAKE_COLUMNS_METADATA = 3;
-static constexpr auto DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION = 3;
+static constexpr auto DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION_WITH_ICEBERG_METADATA = 3;
+static constexpr auto DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION_WITH_FILE_BUCKETS_INFO = 4;
+static constexpr auto DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION_WITH_DATA_LAKE_COLUMNS_METADATA = 5;
+static constexpr auto DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION = DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION_WITH_FILE_BUCKETS_INFO;
 
 static constexpr auto DBMS_MIN_SUPPORTED_PARALLEL_REPLICAS_PROTOCOL_VERSION = 3;
 static constexpr auto DBMS_PARALLEL_REPLICAS_MIN_VERSION_WITH_MARK_SEGMENT_SIZE_FIELD = 4;

src/Core/Settings.cpp

Lines changed: 13 additions & 0 deletions
@@ -6789,6 +6789,19 @@ Both database and table names have to be unquoted - only simple identifiers are
 )", 0) \
     DECLARE(Bool, allow_general_join_planning, true, R"(
 Allows a more general join planning algorithm that can handle more complex conditions, but only works with hash join. If hash join is not enabled, then the usual join planning algorithm is used regardless of the value of this setting.
+)", 0) \
+    DECLARE(ObjectStorageGranularityLevel, cluster_table_function_split_granularity, ObjectStorageGranularityLevel::FILE, R"(
+Controls how data is split into tasks when executing a CLUSTER TABLE FUNCTION.
+
+This setting defines the granularity of work distribution across the cluster:
+- `file` — each task processes an entire file.
+- `bucket` — tasks are created per internal data block within a file (for example, Parquet row groups).
+
+Choosing finer granularity (like `bucket`) can improve parallelism when working with a small number of large files.
+For instance, if a Parquet file contains multiple row groups, enabling `bucket` granularity allows each group to be processed independently by different workers.
+)", 0) \
+    DECLARE(UInt64, cluster_table_function_buckets_batch_size, 0, R"(
+Defines the approximate size of a batch (in bytes) used in distributed processing of tasks in cluster table functions with `bucket` split granularity. The system accumulates data until at least this amount is reached. The actual size may be slightly larger to align with data boundaries.
 )", 0) \
     DECLARE(UInt64, merge_table_max_tables_to_look_for_schema_inference, 1000, R"(
 When creating a `Merge` table without an explicit schema or when using the `merge` table function, infer schema as a union of not more than the specified number of matching tables.
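
The batching rule described for `cluster_table_function_buckets_batch_size` — keep adding buckets to a task until the byte threshold is reached, overshooting slightly rather than splitting a bucket — can be illustrated with a small self-contained sketch. This is not code from the commit; `Bucket` and `Task` are hypothetical stand-ins for the real task-splitting types.

    #include <cstddef>
    #include <utility>
    #include <vector>

    /// Hypothetical stand-ins used only for illustration.
    struct Bucket { size_t bytes = 0; };          /// e.g. one Parquet row group
    struct Task { std::vector<Bucket> buckets; }; /// unit of work sent to one worker

    /// Group per-file buckets into tasks of roughly `batch_size_bytes` each.
    /// A bucket is never split across tasks, so a task may end up slightly larger than the threshold.
    std::vector<Task> groupBucketsIntoTasks(const std::vector<Bucket> & buckets, size_t batch_size_bytes)
    {
        std::vector<Task> tasks;
        Task current;
        size_t accumulated = 0;
        for (const auto & bucket : buckets)
        {
            current.buckets.push_back(bucket);
            accumulated += bucket.bytes;
            if (accumulated >= batch_size_bytes)
            {
                tasks.push_back(std::move(current));
                current = {};
                accumulated = 0;
            }
        }
        if (!current.buckets.empty())
            tasks.push_back(std::move(current));
        return tasks;
    }

With the default value of 0, every bucket would close a task immediately in this sketch (one task per bucket); the actual boundary handling inside the storage layer may differ in details.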

src/Core/Settings.h

Lines changed: 3 additions & 1 deletion
@@ -109,7 +109,9 @@ class WriteBuffer;
     M(CLASS_NAME, UInt64Auto) \
     M(CLASS_NAME, URI) \
     M(CLASS_NAME, VectorSearchFilterStrategy) \
-    M(CLASS_NAME, GeoToH3ArgumentOrder)
+    M(CLASS_NAME, GeoToH3ArgumentOrder) \
+    M(CLASS_NAME, ObjectStorageGranularityLevel) \
+    M(CLASS_NAME, DecorrelationJoinKind)
 
 
 COMMON_SETTINGS_SUPPORTED_TYPES(Settings, DECLARE_SETTING_TRAIT)

src/Core/SettingsChangesHistory.cpp

Lines changed: 88 additions & 0 deletions
@@ -58,6 +58,94 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory()
         {"allow_experimental_hybrid_table", false, false, "Added new setting to allow the Hybrid table engine."},
         {"export_merge_tree_part_max_bytes_per_file", 0, 0, "New setting."},
         {"export_merge_tree_part_max_rows_per_file", 0, 0, "New setting."},
+        {"allow_experimental_hybrid_table", false, false, "Added new setting to allow the Hybrid table engine."},
+        {"cluster_table_function_split_granularity", "file", "file", "New setting."},
+        {"cluster_table_function_buckets_batch_size", 0, 0, "New setting."},
+        {"arrow_flight_request_descriptor_type", "path", "path", "New setting. Type of descriptor to use for Arrow Flight requests: 'path' or 'command'. Dremio requires 'command'."},
+        {"send_profile_events", true, true, "New setting. Whether to send profile events to the clients."},
+        {"into_outfile_create_parent_directories", false, false, "New setting"},
+        {"correlated_subqueries_default_join_kind", "left", "right", "New setting. Default join kind for decorrelated query plan."},
+        {"use_statistics_cache", 0, 0, "New setting"},
+        {"input_format_parquet_use_native_reader_v3", false, true, "Seems stable"},
+        {"max_projection_rows_to_use_projection_index", 1'000'000, 1'000'000, "New setting"},
+        {"min_table_rows_to_use_projection_index", 1'000'000, 1'000'000, "New setting"},
+        {"use_text_index_dictionary_cache", false, false, "New setting"},
+        {"use_text_index_header_cache", false, false, "New setting"},
+        {"use_text_index_postings_cache", false, false, "New setting"},
+        {"s3_retry_attempts", 500, 500, "Changed the value of the obsolete setting"},
+        {"http_write_exception_in_output_format", true, false, "Changed for consistency across formats"},
+        {"optimize_const_name_size", -1, 256, "Replace with scalar and use hash as a name for large constants (size is estimated by name length)"},
+        {"enable_lazy_columns_replication", false, true, "Enable lazy columns replication in JOIN and ARRAY JOIN by default"},
+        {"allow_special_serialization_kinds_in_output_formats", false, true, "Enable direct output of special columns representations like Sparse/Replicated in some output formats"},
+        {"allow_experimental_alias_table_engine", false, false, "New setting"},
+        {"input_format_parquet_local_time_as_utc", false, true, "Use more appropriate type DateTime64(..., 'UTC') for parquet 'local time without timezone' type."},
+        {"input_format_parquet_verify_checksums", true, true, "New setting."},
+        {"output_format_parquet_write_checksums", false, true, "New setting."},
+        {"database_shared_drop_table_delay_seconds", 8 * 60 * 60, 8 * 60 * 60, "New setting."},
+        {"filesystem_cache_allow_background_download", true, true, "New setting to control background downloads in filesystem cache per query."},
+    });
+    addSettingsChanges(settings_changes_history, "25.10",
+    {
+        {"allow_special_serialization_kinds_in_output_formats", false, false, "Add a setting to allow output of special columns representations like Sparse/Replicated without converting them to full columns"},
+        {"enable_lazy_columns_replication", false, false, "Add a setting to enable lazy columns replication in JOIN and ARRAY JOIN"},
+        {"correlated_subqueries_default_join_kind", "left", "right", "New setting. Default join kind for decorrelated query plan."},
+        {"show_data_lake_catalogs_in_system_tables", true, false, "Disable catalogs in system tables by default"},
+        {"optimize_rewrite_like_perfect_affix", false, true, "New setting"},
+        {"allow_dynamic_type_in_join_keys", true, false, "Disallow using Dynamic type in JOIN keys by default"},
+        {"s3queue_keeper_fault_injection_probability", 0, 0, "New setting."},
+        {"enable_join_runtime_filters", false, false, "New setting"},
+        {"join_runtime_filter_exact_values_limit", 10000, 10000, "New setting"},
+        {"join_runtime_bloom_filter_bytes", 512_KiB, 512_KiB, "New setting"},
+        {"join_runtime_bloom_filter_hash_functions", 3, 3, "New setting"},
+        {"use_join_disjunctions_push_down", false, false, "New setting."},
+        {"joined_block_split_single_row", false, false, "New setting"},
+        {"temporary_files_buffer_size", DBMS_DEFAULT_BUFFER_SIZE, DBMS_DEFAULT_BUFFER_SIZE, "New setting"},
+        {"rewrite_in_to_join", false, false, "New experimental setting"},
+        {"delta_lake_log_metadata", false, false, "New setting."},
+        {"distributed_cache_prefer_bigger_buffer_size", false, false, "New setting."},
+        {"allow_experimental_qbit_type", false, false, "New experimental setting"},
+        {"optimize_qbit_distance_function_reads", true, true, "New setting"},
+        {"read_from_distributed_cache_if_exists_otherwise_bypass_cache", false, false, "New setting"},
+        {"s3_slow_all_threads_after_retryable_error", false, false, "Disable the setting by default"},
+        {"backup_slow_all_threads_after_retryable_s3_error", false, false, "Disable the setting by default"},
+        {"enable_http_compression", false, true, "It should be beneficial in general"},
+        {"inject_random_order_for_select_without_order_by", false, false, "New setting"},
+        {"exclude_materialize_skip_indexes_on_insert", "", "", "New setting."},
+        {"optimize_empty_string_comparisons", false, true, "A new setting."},
+        {"query_plan_use_logical_join_step", true, true, "Added alias"},
+        {"schema_inference_make_columns_nullable", 1, 3, "Take nullability information from Parquet/ORC/Arrow metadata by default, instead of making everything nullable."},
+        {"materialized_views_squash_parallel_inserts", false, true, "Added setting to preserve old behavior if needed."},
+        {"distributed_cache_connect_timeout_ms", 50, 50, "New setting"},
+        {"distributed_cache_receive_timeout_ms", 3000, 3000, "New setting"},
+        {"distributed_cache_send_timeout_ms", 3000, 3000, "New setting"},
+        {"distributed_cache_tcp_keep_alive_timeout_ms", 2900, 2900, "New setting"},
+    });
+    addSettingsChanges(settings_changes_history, "25.9",
+    {
+        {"input_format_protobuf_oneof_presence", false, false, "New setting"},
+        {"iceberg_delete_data_on_drop", false, false, "New setting"},
+        {"use_skip_indexes_on_data_read", false, false, "New setting"},
+        {"s3_slow_all_threads_after_retryable_error", false, false, "Added an alias for setting `backup_slow_all_threads_after_retryable_s3_error`"},
+        {"iceberg_metadata_log_level", "none", "none", "New setting."},
+        {"iceberg_insert_max_rows_in_data_file", 1000000, 1000000, "New setting."},
+        {"iceberg_insert_max_bytes_in_data_file", 1_GiB, 1_GiB, "New setting."},
+        {"query_plan_optimize_join_order_limit", 1, 1, "New setting"},
+        {"query_plan_display_internal_aliases", false, false, "New setting"},
+        {"query_plan_max_step_description_length", 1000000000, 500, "New setting"},
+        {"allow_experimental_delta_lake_writes", false, false, "New setting."},
+        {"query_plan_convert_any_join_to_semi_or_anti_join", true, true, "New setting."},
+        {"text_index_use_bloom_filter", true, true, "New setting."},
+        {"query_plan_direct_read_from_text_index", true, true, "New setting."},
+        {"enable_producing_buckets_out_of_order_in_aggregation", false, true, "New setting"},
+        {"jemalloc_enable_profiler", false, false, "New setting"},
+        {"jemalloc_collect_profile_samples_in_trace_log", false, false, "New setting"},
+        {"delta_lake_insert_max_bytes_in_data_file", 1_GiB, 1_GiB, "New setting."},
+        {"delta_lake_insert_max_rows_in_data_file", 1000000, 1000000, "New setting."},
+        {"promql_evaluation_time", Field{"auto"}, Field{"auto"}, "The setting was renamed. The previous name is `evaluation_time`."},
+        {"evaluation_time", 0, 0, "Old setting which popped up here being renamed."},
+        {"os_threads_nice_value_query", 0, 0, "New setting."},
+        {"os_threads_nice_value_materialized_view", 0, 0, "New setting."},
+        {"os_thread_priority", 0, 0, "Alias for os_threads_nice_value_query."},
     });
     addSettingsChanges(settings_changes_history, "25.8",
     {

src/Core/SettingsEnums.cpp

Lines changed: 6 additions & 0 deletions
@@ -372,4 +372,10 @@ IMPLEMENT_SETTING_ENUM(
     {"manifest_file_entry", IcebergMetadataLogLevel::ManifestFileEntry}})
 
 IMPLEMENT_SETTING_AUTO_ENUM(MergeTreePartExportFileAlreadyExistsPolicy, ErrorCodes::BAD_ARGUMENTS);
+
+IMPLEMENT_SETTING_ENUM(
+    ObjectStorageGranularityLevel,
+    ErrorCodes::BAD_ARGUMENTS,
+    {{"file", ObjectStorageGranularityLevel::FILE},
+     {"bucket", ObjectStorageGranularityLevel::BUCKET}})
 }

src/Core/SettingsEnums.h

Lines changed: 8 additions & 0 deletions
@@ -481,6 +481,14 @@ enum class IcebergMetadataLogLevel : uint8_t
 
 DECLARE_SETTING_ENUM(IcebergMetadataLogLevel)
 
+enum class ObjectStorageGranularityLevel : uint8_t
+{
+    FILE = 0,
+    BUCKET = 1,
+};
+
+DECLARE_SETTING_ENUM(ObjectStorageGranularityLevel)
+
 enum class MergeTreePartExportFileAlreadyExistsPolicy : uint8_t
 {
     skip,

src/Disks/ObjectStorages/IObjectStorage.h

Lines changed: 1 addition & 0 deletions
@@ -29,6 +29,7 @@
 #include <Disks/WriteMode.h>
 
 #include <Processors/ISimpleTransform.h>
+#include <Processors/Formats/IInputFormat.h>
 #include <Storages/ObjectStorage/DataLakes/DataLakeObjectMetadata.h>
 
 #include <Interpreters/Context_fwd.h>

src/Formats/FormatFactory.cpp

Lines changed: 29 additions & 0 deletions
@@ -374,6 +374,20 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
     return format_settings;
 }
 
+FileBucketInfoPtr FormatFactory::getFileBucketInfo(const String & format)
+{
+    auto creator = getCreators(format);
+    return creator.file_bucket_info_creator();
+}
+
+void FormatFactory::registerFileBucketInfo(const String & format, FileBucketInfoCreator bucket_info)
+{
+    chassert(bucket_info);
+    auto & creators = getOrCreateCreators(format);
+    if (creators.file_bucket_info_creator)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Bucket splitter for format {} is already registered", format);
+    creators.file_bucket_info_creator = std::move(bucket_info);
+}
 
 InputFormatPtr FormatFactory::getInput(
     const String & name,
@@ -694,6 +708,21 @@ void FormatFactory::registerInputFormat(const String & name, InputCreator input_
     KnownFormatNames::instance().add(name, /* case_insensitive = */ true);
 }
 
+void FormatFactory::registerSplitter(const String & format, BucketSplitterCreator splitter)
+{
+    chassert(splitter);
+    auto & creators = getOrCreateCreators(format);
+    if (creators.bucket_splitter_creator)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Bucket splitter for format {} is already registered", format);
+    creators.bucket_splitter_creator = std::move(splitter);
+}
+
+BucketSplitter FormatFactory::getSplitter(const String & format)
+{
+    auto creator = getCreators(format);
+    return creator.bucket_splitter_creator();
+}
+
 void FormatFactory::registerRandomAccessInputFormat(const String & name, RandomAccessInputCreator input_creator)
 {
     chassert(input_creator);

src/Formats/FormatFactory.h

Lines changed: 13 additions & 0 deletions
@@ -10,6 +10,8 @@
 #include <base/types.h>
 #include <Common/Allocator.h>
 
+#include <Processors/Formats/IInputFormat.h>
+
 #include <boost/noncopyable.hpp>
 
 #include <functional>
@@ -94,6 +96,10 @@ class FormatFactory final : private boost::noncopyable
         const RowInputFormatParams & params,
         const FormatSettings & settings)>;
 
+    using FileBucketInfoCreator = std::function<FileBucketInfoPtr()>;
+
+    using BucketSplitterCreator = std::function<BucketSplitter()>;
+
     // Incompatible with FileSegmentationEngine.
     using RandomAccessInputCreator = std::function<InputFormatPtr(
         ReadBuffer & buf,
@@ -142,6 +148,8 @@
     {
         String name;
         InputCreator input_creator;
+        FileBucketInfoCreator file_bucket_info_creator;
+        BucketSplitterCreator bucket_splitter_creator;
         RandomAccessInputCreator random_access_input_creator;
        OutputCreator output_creator;
        FileSegmentationEngineCreator file_segmentation_engine_creator;
@@ -286,6 +294,11 @@
     void checkFormatName(const String & name) const;
     bool exists(const String & name) const;
 
+    FileBucketInfoPtr getFileBucketInfo(const String & format);
+    void registerFileBucketInfo(const String & format, FileBucketInfoCreator bucket_info);
+    void registerSplitter(const String & format, BucketSplitterCreator splitter);
+    BucketSplitter getSplitter(const String & format);
+
 private:
     FormatsDictionary dict;
     FileExtensionFormats file_extension_formats;
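
For orientation, the new creator hooks are expected to be filled in per format, analogous to the existing registerInputFormat calls. The following is only a rough sketch of what such a registration could look like; ParquetFileBucketInfo and createParquetBucketSplitter are hypothetical names not introduced by this commit, and it assumes FileBucketInfoPtr is a shared pointer to the FileBucketInfo interface.

    /// Illustrative sketch, not part of this commit.
    void registerParquetBucketSplitting(FormatFactory & factory)
    {
        /// Hypothetical FileBucketInfo implementation that knows how to (de)serialize
        /// Parquet row-group metadata for a single task.
        factory.registerFileBucketInfo("Parquet", [] { return std::make_shared<ParquetFileBucketInfo>(); });

        /// Hypothetical helper returning a BucketSplitter that cuts a Parquet file into row-group buckets.
        factory.registerSplitter("Parquet", [] { return createParquetBucketSplitter(); });
    }

After such a registration, FormatFactory::getFileBucketInfo("Parquet") and FormatFactory::getSplitter("Parquet") become available, which is how the receiving side re-creates bucket info in ClusterFunctionReadTask.cpp below.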

src/Interpreters/ClusterFunctionReadTask.cpp

Lines changed: 33 additions & 1 deletion
@@ -7,7 +7,10 @@
 #include <IO/ReadHelpers.h>
 #include <Interpreters/ActionsDAG.h>
 #include <Storages/ObjectStorage/StorageObjectStorageSource.h>
+#include <Common/Exception.h>
 #include <Common/logger_useful.h>
+#include <Formats/FormatFactory.h>
+#include <Processors/Formats/Impl/ParquetBlockInputFormat.h>
 
 namespace DB
 {
@@ -40,6 +43,7 @@ ClusterFunctionReadTaskResponse::ClusterFunctionReadTaskResponse(ObjectInfoPtr o
         const bool send_over_whole_archive = !context->getSettingsRef()[Setting::cluster_function_process_archive_on_multiple_nodes];
         path = send_over_whole_archive ? object->getPathOrPathToArchiveIfArchive() : object->getPath();
         absolute_path = object->getAbsolutePath();
+        file_bucket_info = object->file_bucket_info;
     }
 }
 
@@ -58,7 +62,9 @@
     object->file_meta_info = file_meta_info;
     if (absolute_path.has_value() && !absolute_path.value().empty())
         object->absolute_path = absolute_path;
-
+
+    object->file_bucket_info = file_bucket_info;
+
     return object;
 }
 
@@ -76,6 +82,21 @@ void ClusterFunctionReadTaskResponse::serialize(WriteBuffer & out, size_t protoc
         ActionsDAG().serialize(out, registry);
     }
 
+    if (protocol_version >= DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION_WITH_FILE_BUCKETS_INFO)
+    {
+        if (file_bucket_info)
+        {
+            /// Write format name so we can create appropriate file bucket info during deserialization.
+            writeStringBinary(file_bucket_info->getFormatName(), out);
+            file_bucket_info->serialize(out);
+        }
+        else
+        {
+            /// Write empty string as format name if file_bucket_info is not set.
+            writeStringBinary("", out);
+        }
+    }
+
     if (protocol_version >= DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION_WITH_DATA_LAKE_COLUMNS_METADATA)
     {
         /// This info is not used when optimization is disabled, so there is no need to send it.
@@ -111,6 +132,17 @@ void ClusterFunctionReadTaskResponse::deserialize(ReadBuffer & in)
         }
     }
 
+    if (protocol_version >= DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION_WITH_FILE_BUCKETS_INFO)
+    {
+        String format;
+        readStringBinary(format, in);
+        if (!format.empty())
+        {
+            file_bucket_info = FormatFactory::instance().getFileBucketInfo(format);
+            file_bucket_info->deserialize(in);
+        }
+    }
+
     if (protocol_version >= DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION_WITH_DATA_LAKE_COLUMNS_METADATA)
     {
         auto info = std::make_shared<DataFileMetaInfo>(DataFileMetaInfo::deserialize(in));
