Fix: Prevent excessive sampling on QEs by restricting ComputeExtStatisticsRows to QD

yjhjstz · yjhjstz · commit 3ead998707df · 2025-08-13T23:31:29.000+08:00
In `do_analyze_rel`, the function `ComputeExtStatisticsRows` calculates the minimum
number of sample rows needed for extended statistics (e.g., dependencies, ndistinct).

This calculation is only meaningful and required on the Query Dispatcher (QD), since
only the QD is responsible for coordinating the final extended statistics generation.

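Previously, the Query Executors (QEs) on the segments executed this logic as well, so
each of them raised its sample target (`targrows`) to at least `minrows`, resulting in
excessive sampling. For large tables, this caused the QD to receive more rows than it
could handle, leading to the error: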

    ERROR: too many sample rows received from gp_acquire_sample_rows
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
@@ -717,8 +717,10 @@ do_analyze_rel(Relation onerel, VacuumParams *params,
 	 * statistics target. So we may need to sample more rows and then build
 	 * the statistics with enough detail.
 	 */
-	minrows = ComputeExtStatisticsRows(onerel, attr_cnt, vacattrstats);
-	
+	if (IS_QD_OR_SINGLENODE())
+		minrows = ComputeExtStatisticsRows(onerel, attr_cnt, vacattrstats);
+	else
+		minrows = 0;
 
 	if (targrows < minrows)
 		targrows = minrows;
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
@@ -3240,3 +3240,16 @@ NOTICE:  drop cascades to 2 other objects
 DETAIL:  drop cascades to table tststats.priv_test_tbl
 drop cascades to view tststats.priv_test_view
 DROP USER regress_stats_user1;
+-- test analyze with extended statistics 
+CREATE TABLE tbl_issue1293 (col1 int, col2 int);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'col1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+INSERT INTO tbl_issue1293
+SELECT i / 10000, i / 100000
+FROM generate_series(1, 1000000) s(i);
+ANALYZE tbl_issue1293;
+-- Create extended statistics on col1, col2
+CREATE STATISTICS s1 (dependencies) ON col1, col2 FROM tbl_issue1293;
+-- Trigger extended stats collection
+ANALYZE tbl_issue1293;
+DROP TABLE tbl_issue1293;
diff --git a/src/test/regress/expected/stats_ext_optimizer.out b/src/test/regress/expected/stats_ext_optimizer.out
@@ -3275,3 +3275,16 @@ NOTICE:  drop cascades to 2 other objects
 DETAIL:  drop cascades to table tststats.priv_test_tbl
 drop cascades to view tststats.priv_test_view
 DROP USER regress_stats_user1;
+-- test analyze with extended statistics 
+CREATE TABLE tbl_issue1293 (col1 int, col2 int);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'col1' as the Apache Cloudberry data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+INSERT INTO tbl_issue1293
+SELECT i / 10000, i / 100000
+FROM generate_series(1, 1000000) s(i);
+ANALYZE tbl_issue1293;
+-- Create extended statistics on col1, col2
+CREATE STATISTICS s1 (dependencies) ON col1, col2 FROM tbl_issue1293;
+-- Trigger extended stats collection
+ANALYZE tbl_issue1293;
+DROP TABLE tbl_issue1293;
diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql
@@ -1651,3 +1651,15 @@ DROP FUNCTION op_leak(int, int);
 RESET SESSION AUTHORIZATION;
 DROP SCHEMA tststats CASCADE;
 DROP USER regress_stats_user1;
+
+-- test analyze with extended statistics 
+CREATE TABLE tbl_issue1293 (col1 int, col2 int);
+INSERT INTO tbl_issue1293
+SELECT i / 10000, i / 100000
+FROM generate_series(1, 1000000) s(i);
+ANALYZE tbl_issue1293;
+-- Create extended statistics on col1, col2
+CREATE STATISTICS s1 (dependencies) ON col1, col2 FROM tbl_issue1293;
+-- Trigger extended stats collection
+ANALYZE tbl_issue1293;
+DROP TABLE tbl_issue1293;