patch: split Resize/StrictResize to avoid lock contention (#349)

wudidapaopao · web-flow · commit dda44b831c32 · 2025-07-15T11:35:39.000+08:00
diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp
@@ -6547,6 +6547,27 @@ When the query prioritization mechanism is employed (see setting `priority`), lo
 )", BETA) \
     DECLARE(Float, min_os_cpu_wait_time_ratio_to_throw, 0.0, "Min ratio between OS CPU wait (OSCPUWaitMicroseconds metric) and busy (OSCPUVirtualTimeMicroseconds metric) times to consider rejecting queries. Linear interpolation between min and max ratio is used to calculate the probability, the probability is 0 at this point.", 0) \
     DECLARE(Float, max_os_cpu_wait_time_ratio_to_throw, 0.0, "Max ratio between OS CPU wait (OSCPUWaitMicroseconds metric) and busy (OSCPUVirtualTimeMicroseconds metric) times to consider rejecting queries. Linear interpolation between min and max ratio is used to calculate the probability, the probability is 1 at this point.", 0) \
+    DECLARE(UInt64, min_outstreams_per_resize_after_split, 24, R"(
+Specifies the minimum number of output streams of a `Resize` or `StrictResize` processor after the split is performed during pipeline generation. If the resulting number of streams is less than this value, the split operation will not occur.
+### What is a Resize Node
+A `Resize` node is a processor in the query pipeline that adjusts the number of data streams flowing through the pipeline. It can either increase or decrease the number of streams to balance the workload across multiple threads or processors. For example, if a query requires more parallelism, the `Resize` node can split a single stream into multiple streams. Conversely, it can merge multiple streams into fewer streams to consolidate data processing.
+The `Resize` node ensures that data is evenly distributed across streams, maintaining the structure of the data blocks. This helps optimize resource utilization and improve query performance.
+### Why the Resize Node Needs to Be Split
+During pipeline execution, ExecutingGraph::Node::status_mutex of the centrally-hubbed `Resize` node is heavily contended especially in high-core-count environments, and this contention leads to:
+1. Increased latency for ExecutingGraph::updateNode, directly impacting query performance.
+2. Excessive CPU cycles wasted in spin-lock contention (native_queued_spin_lock_slowpath), degrading efficiency.
+3. Reduced CPU utilization, limiting parallelism and throughput.
+### How the Resize Node Gets Split
+1. The number of output streams is checked to ensure the split could be performed: the output streams of each split processor meet or exceed the `min_outstreams_per_resize_after_split` threshold.
+2. The `Resize` node is divided into smaller `Resize` nodes with equal count of ports, each handling a subset of input and output streams.
+3. Each group is processed independently, reducing the lock contention.
+### Splitting Resize Node with Arbitrary Inputs/Outputs
+In some cases, where the number of inputs/outputs is not divisible by the number of split `Resize` nodes, some inputs are connected to `NullSource`s and some outputs are connected to `NullSink`s. This allows the split to occur without affecting the overall data flow.
+### Purpose of the Setting
+The `min_outstreams_per_resize_after_split` setting ensures that the splitting of `Resize` nodes is meaningful and avoids creating too few streams, which could lead to inefficient parallel processing. By enforcing a minimum number of output streams, this setting helps maintain a balance between parallelism and overhead, optimizing query execution in scenarios involving stream splitting and merging.
+### Disabling the Setting
+To disable the split of `Resize` nodes, set this setting to 0. This will prevent the splitting of `Resize` nodes during pipeline generation, allowing them to retain their original structure without division into smaller nodes.
+)", 0) \
     \
     /* ####################################################### */ \
     /* ########### START OF EXPERIMENTAL FEATURES ############ */ \
diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp
@@ -108,6 +108,7 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory()
             {"input_format_parquet_allow_geoparquet_parser", false, true, "A new setting to use geo columns in parquet file"},
             {"enable_url_encoding", true, false, "Changed existing setting's default value"},
             {"s3_slow_all_threads_after_network_error", false, true, "New setting"},
+            {"min_outstreams_per_resize_after_split", 0, 24, "New setting."},
         });
         addSettingsChanges(settings_changes_history, "25.4",
         {
diff --git a/src/Processors/QueryPlan/AggregatingStep.cpp b/src/Processors/QueryPlan/AggregatingStep.cpp
@@ -495,7 +495,7 @@ void AggregatingStep::transformPipeline(QueryPipelineBuilder & pipeline, const B
         /// Add resize transform to uniformly distribute data between aggregating streams.
         /// But not if we execute aggregation over partitioned data in which case data streams shouldn't be mixed.
         if (!storage_has_evenly_distributed_read && !skip_merging)
-            pipeline.resize(pipeline.getNumStreams(), true);
+            pipeline.resize(pipeline.getNumStreams(), true, settings.min_outstreams_per_resize_after_split);
 
         auto many_data = std::make_shared<ManyAggregatedData>(pipeline.getNumStreams());
 
@@ -514,7 +514,7 @@ void AggregatingStep::transformPipeline(QueryPipelineBuilder & pipeline, const B
                     skip_merging);
             });
 
-        pipeline.resize(should_produce_results_in_order_of_bucket_number ? 1 : params.max_threads);
+        pipeline.resize(should_produce_results_in_order_of_bucket_number ? 1 : params.max_threads, false, settings.min_outstreams_per_resize_after_split);
 
         aggregating = collector.detachProcessors(0);
     }
diff --git a/src/Processors/QueryPlan/BuildQueryPipelineSettings.cpp b/src/Processors/QueryPlan/BuildQueryPipelineSettings.cpp
@@ -10,6 +10,7 @@ namespace Setting
     extern const SettingsBool query_plan_merge_filters;
     extern const SettingsMaxThreads max_threads;
     extern const SettingsUInt64 aggregation_memory_efficient_merge_threads;
+    extern const SettingsUInt64 min_outstreams_per_resize_after_split;
 }
 
 BuildQueryPipelineSettings::BuildQueryPipelineSettings(ContextPtr from)
@@ -22,6 +23,7 @@ BuildQueryPipelineSettings::BuildQueryPipelineSettings(ContextPtr from)
 
     max_threads = from->getSettingsRef()[Setting::max_threads];
     aggregation_memory_efficient_merge_threads = from->getSettingsRef()[Setting::aggregation_memory_efficient_merge_threads];
+    min_outstreams_per_resize_after_split = from->getSettingsRef()[Setting::min_outstreams_per_resize_after_split];
 
     /// Setting query_plan_merge_filters is enabled by default.
     /// But it can brake short-circuit without splitting filter step into smaller steps.
diff --git a/src/Processors/QueryPlan/BuildQueryPipelineSettings.h b/src/Processors/QueryPlan/BuildQueryPipelineSettings.h
@@ -28,6 +28,7 @@ struct BuildQueryPipelineSettings
 
     size_t max_threads;
     size_t aggregation_memory_efficient_merge_threads;
+    size_t min_outstreams_per_resize_after_split;
 
     const ExpressionActionsSettings & getActionsSettings() const { return actions_settings; }
 };
diff --git a/src/QueryPipeline/Pipe.cpp b/src/QueryPipeline/Pipe.cpp
@@ -683,21 +683,122 @@ void Pipe::addChains(std::vector<Chain> chains)
     max_parallel_streams = std::max(max_parallel_streams, max_parallel_streams_for_chains);
 }
 
-void Pipe::resize(size_t num_streams, bool strict)
+/// Replaces a single (Strict)ResizeProcessor by `groups` smaller ones. Each of them is connected to its own subset
+/// of the current output ports, and their outputs become the new output ports of the pipe (`num_streams` in total).
+/// When the inputs or outputs do not divide evenly, the spare last port of a group goes to a NullSource / NullSink.
+/// Expects a non-zero threshold that yields at least two groups; Pipe::resize checks this before calling.
+void Pipe::addSplitResizeTransform(size_t num_streams, size_t min_outstreams_per_resize_after_split, bool strict)
+{
+    chassert(min_outstreams_per_resize_after_split != 0);
+
+    const size_t num_inputs = numOutputPorts();
+    const size_t groups = std::min<size_t>(num_inputs, num_streams / min_outstreams_per_resize_after_split);
+    /// Validate before `groups` is used as a divisor.
+    chassert(groups > 1);
+
+    /// Every group gets ceil(n / groups) ports. If n % groups != 0, only the first (n % groups) groups use their
+    /// last port for a real stream; in the remaining groups that port is spare.
+    const size_t instream_per_group = (num_inputs + groups - 1) / groups;
+    const size_t groups_with_extra_instream = num_inputs % groups;
+    const size_t outstreams_per_group = (num_streams + groups - 1) / groups;
+    const size_t groups_with_extra_outstream = num_streams % groups;
+
+    /// Registers a processor in the pipe and, when `collected_processors` is set, there as well, so that the
+    /// padding NullSource / NullSink are recorded the same way as the resize processors.
+    auto add_processor = [this](ProcessorPtr processor)
+    {
+        if (collected_processors)
+            collected_processors->emplace_back(processor);
+        processors->emplace_back(std::move(processor));
+    };
+
+    OutputPortRawPtrs resize_output_ports(num_streams);
+
+    for (size_t i = 0, next_input = 0, next_output = 0; i < groups; ++i)
+    {
+        ProcessorPtr resize;
+        if (strict)
+            resize = std::make_shared<StrictResizeProcessor>(getHeader(), instream_per_group, outstreams_per_group);
+        else
+            resize = std::make_shared<ResizeProcessor>(getHeader(), instream_per_group, outstreams_per_group);
+
+        for (auto it = resize->getInputs().begin(); it != resize->getInputs().end(); ++it)
+        {
+            if (std::next(it) != resize->getInputs().end() || groups_with_extra_instream == 0 || i < groups_with_extra_instream)
+            {
+                connect(*output_ports[next_input], *it);
+                ++next_input;
+            }
+            else
+            {
+                /// Spare input port of this group: connect it to a NullSource.
+                auto null_source = std::make_shared<NullSource>(getHeader());
+                connect(null_source->getPort(), *it);
+                add_processor(std::move(null_source));
+            }
+        }
+
+        for (auto it = resize->getOutputs().begin(); it != resize->getOutputs().end(); ++it)
+        {
+            if (std::next(it) != resize->getOutputs().end() || groups_with_extra_outstream == 0 || i < groups_with_extra_outstream)
+            {
+                resize_output_ports[next_output] = &*it;
+                ++next_output;
+            }
+            else
+            {
+                /// Spare output port of this group: connect it to a NullSink.
+                auto null_sink = std::make_shared<NullSink>(getHeader());
+                connect(*it, null_sink->getPort());
+                add_processor(std::move(null_sink));
+            }
+        }
+
+        add_processor(std::move(resize));
+    }
+
+    output_ports = std::move(resize_output_ports);
+
+    /// All new output ports must agree on the block structure.
+    header = output_ports.front()->getHeader();
+    for (size_t i = 1; i < output_ports.size(); ++i)
+        assertBlocksHaveEqualStructure(header, output_ports[i]->getHeader(), "Pipes");
+
+    max_parallel_streams = std::max<size_t>(max_parallel_streams, output_ports.size());
+}
+
+void Pipe::resize(size_t num_streams, bool strict, UInt64 min_outstreams_per_resize_after_split)
 {
     if (output_ports.empty())
         throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot resize an empty Pipe");
 
-    if (strict && num_streams == numOutputPorts())
-        return;
-
     /// We need to not add the resize in case of 1-1 because in case
     /// it is not force resize and we have n outputs (look at the code above),
     /// and one of the outputs is dead, we can push all data to n-1 outputs,
     /// which doesn't make sense for 1-1 scenario
     if (numOutputPorts() == 1 && num_streams == 1)
         return;
 
+    /// Performance bottleneck identified: Severe lock contention for ExecutingGraph::Node::status_mutex.
+    /// Issues observed:
+    /// 1. Increased latency of ExecutingGraph::updateNode, lengthening overall query latency.
+    /// 2. Unnecessary CPU cycle consumption in native_queued_spin_lock_slowpath (kernel).
+    /// 3. Decreased CPU utilization.
+    ///
+    /// Proposed solution: Split ResizeProcessor when multiple threads are allocated to execute a query.
+    /// Benefits:
+    /// 1. Mitigates lock contention.
+    /// 2. Maintains ResizeProcessor's benefit of balancing data flow among multiple streams.
+    ///
+    /// Disable this optimization when min_outstreams_per_resize_after_split is 0
+    if (output_ports.size() > 1 && min_outstreams_per_resize_after_split != 0 && num_streams / min_outstreams_per_resize_after_split > 1)
+    {
+        addSplitResizeTransform(num_streams, min_outstreams_per_resize_after_split, strict);
+        return;
+    }
+    if (strict && num_streams == numOutputPorts())
+        return;
+
     ProcessorPtr resize;
 
     if (strict)
diff --git a/src/QueryPipeline/Pipe.h b/src/QueryPipeline/Pipe.h
@@ -90,7 +90,7 @@ class Pipe
     void addChains(std::vector<Chain> chains);
 
     /// Changes the number of output ports if needed. Adds (Strict)ResizeProcessor.
-    void resize(size_t num_streams, bool strict = false);
+    void resize(size_t num_streams, bool strict = false, UInt64 min_outstreams_per_resize_after_split = 0);
 
     using Transformer = std::function<Processors(OutputPortRawPtrs ports)>;
 
@@ -130,6 +130,7 @@ class Pipe
     bool isCompleted() const { return !empty() && output_ports.empty(); }
     static Pipe unitePipes(Pipes pipes, Processors * collected_processors, bool allow_empty_header);
     void setSinks(const Pipe::ProcessorGetterWithStreamKind & getter);
+    void addSplitResizeTransform(size_t num_streams, size_t min_outstreams_per_resize_after_split, bool strict);
 
     friend class QueryPipelineBuilder;
     friend class QueryPipeline;
diff --git a/src/QueryPipeline/QueryPipelineBuilder.cpp b/src/QueryPipeline/QueryPipelineBuilder.cpp
@@ -193,10 +193,10 @@ void QueryPipelineBuilder::addMergingAggregatedMemoryEfficientTransform(Aggregat
     DB::addMergingAggregatedMemoryEfficientTransform(pipe, std::move(params), num_merging_processors);
 }
 
-void QueryPipelineBuilder::resize(size_t num_streams, bool strict)
+void QueryPipelineBuilder::resize(size_t num_streams, bool strict, UInt64 min_outstreams_per_resize_after_split)
 {
     checkInitializedAndNotCompleted();
-    pipe.resize(num_streams, strict);
+    pipe.resize(num_streams, strict, min_outstreams_per_resize_after_split); /// Passed through unchanged; Pipe::resize does not split when it is 0.
 }
 
 void QueryPipelineBuilder::narrow(size_t size)
diff --git a/src/QueryPipeline/QueryPipelineBuilder.h b/src/QueryPipeline/QueryPipelineBuilder.h
@@ -97,7 +97,7 @@ class QueryPipelineBuilder
     void addMergingAggregatedMemoryEfficientTransform(AggregatingTransformParamsPtr params, size_t num_merging_processors);
 
     /// Changes the number of output ports if needed. Adds ResizeTransform.
-    void resize(size_t num_streams, bool strict = false);
+    void resize(size_t num_streams, bool strict = false, UInt64 min_outstreams_per_resize_after_split = 0);
 
     /// Concat some ports to have no more then size outputs.
     /// This method is needed for Merge table engine in case of reading from many tables.

Original file line number	Diff line number	Diff line change
`@@ -108,6 +108,7 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory()`
`108`	`108`	`{"input_format_parquet_allow_geoparquet_parser", false, true, "A new setting to use geo columns in parquet file"},`
`109`	`109`	`{"enable_url_encoding", true, false, "Changed existing setting's default value"},`
`110`	`110`	`{"s3_slow_all_threads_after_network_error", false, true, "New setting"},`
	`111`	`+ {"min_outstreams_per_resize_after_split", 0, 24, "New setting."},`
`111`	`112`	`});`
`112`	`113`	`addSettingsChanges(settings_changes_history, "25.4",`
`113`	`114`	`{`
Original file line number	Diff line number	Diff line change
`@@ -193,10 +193,10 @@ void QueryPipelineBuilder::addMergingAggregatedMemoryEfficientTransform(Aggregat`
`193`	`193`	`DB::addMergingAggregatedMemoryEfficientTransform(pipe, std::move(params), num_merging_processors);`
`194`	`194`	`}`
`195`	`195`
`196`		`-void QueryPipelineBuilder::resize(size_t num_streams, bool strict)`
	`196`	`+void QueryPipelineBuilder::resize(size_t num_streams, bool strict, UInt64 min_outstreams_per_resize_after_split)`
`197`	`197`	`{`
`198`	`198`	`checkInitializedAndNotCompleted();`
`199`		`- pipe.resize(num_streams, strict);`
	`199`	`+ pipe.resize(num_streams, strict, min_outstreams_per_resize_after_split);`
`200`	`200`	`}`
`201`	`201`
`202`	`202`	`void QueryPipelineBuilder::narrow(size_t size)`