Skip to content

Commit db9a344

Browse files
committed
sql/stats: merge all arbitrary partial and full table statistics
This commit generalizes partial and full stat merging in the stats cache to support partial stats collected over any range of values in a column. Non-extreme partial stats are treated as covering a single span, while extreme partial stats continue to keep the lower and upper extreme buckets separate, as before. Additionally, merged stats now apply all partial stats collected after the latest full stat, in order of creation. Previously, only partial stats collected at the extremes of the index could be merged, and only the latest one was merged.

Part of: #93998

Release note (sql change): The optimizer can now use table statistics that merge the latest full statistic with all newer partial statistics, including partial statistics collected with an arbitrary constraint that covers a single span.
1 parent 552cfcb commit db9a344

File tree

17 files changed

+1468
-515
lines changed

17 files changed

+1468
-515
lines changed

pkg/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2338,6 +2338,7 @@ GO_TARGETS = [
23382338
"//pkg/sql/sem/plpgsqltree:plpgsqltree",
23392339
"//pkg/sql/sem/semenumpb:semenumpb",
23402340
"//pkg/sql/sem/transform:transform",
2341+
"//pkg/sql/sem/tree/datumrange:datumrange",
23412342
"//pkg/sql/sem/tree/evalgen:evalgen",
23422343
"//pkg/sql/sem/tree/evalgen:evalgen_lib",
23432344
"//pkg/sql/sem/tree/treebin:treebin",

pkg/sql/distsql_plan_stats.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,10 @@ func (dsp *DistSQLPlanner) createPartialStatsPlan(
304304
return nil, pgerror.Newf(pgcode.FeatureNotSupported, "multi-column partial statistics are not currently supported")
305305
}
306306

307+
if !reqStat.histogram {
308+
return nil, pgerror.Newf(pgcode.FeatureNotSupported, "partial statistics without histograms are not supported")
309+
}
310+
307311
var typeResolver *descs.DistSQLTypeResolver
308312
if p := planCtx.planner; p != nil {
309313
r := descs.NewDistSQLTypeResolver(p.Descriptors(), p.Txn())
@@ -378,6 +382,13 @@ func (dsp *DistSQLPlanner) createPartialStatsPlan(
378382
"column %s does not have a prior statistic",
379383
column.GetName())
380384
}
385+
if len(stat.Histogram) == 1 && stat.Histogram[0].UpperBound == tree.DNull {
386+
return nil, pgerror.Newf(
387+
pgcode.ObjectNotInPrerequisiteState,
388+
"the latest full statistic histogram for column %s has only NULL values",
389+
column.GetName(),
390+
)
391+
}
381392

382393
var predicate string
383394
var prevLowerBound tree.Datum

pkg/sql/logictest/testdata/logic_test/constrained_stats

Lines changed: 6 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -64,31 +64,6 @@ upper_bound range_rows distinct_range_rows equal_rows
6464
50.00 0 0 1
6565
55.00 0 0 1
6666

67-
statement ok
68-
CREATE STATISTICS stat_price_ranges ON price FROM products WHERE price <= 20.00 OR price > 40.00
69-
70-
query TTIIIT colnames
71-
SELECT statistics_name, column_names, row_count, distinct_count, null_count, partial_predicate
72-
FROM [SHOW STATISTICS FOR TABLE products]
73-
WHERE statistics_name = 'stat_price_ranges'
74-
----
75-
statistics_name column_names row_count distinct_count null_count partial_predicate
76-
stat_price_ranges {price} 6 6 0 (price <= 20.00) OR (price > 40.00)
77-
78-
let $hist_stat_price_ranges
79-
SELECT histogram_id FROM [SHOW STATISTICS FOR TABLE products] WHERE statistics_name = 'stat_price_ranges';
80-
81-
query TIRI colnames,nosort
82-
SHOW HISTOGRAM $hist_stat_price_ranges
83-
----
84-
upper_bound range_rows distinct_range_rows equal_rows
85-
10.00 0 0 1
86-
15.00 0 0 1
87-
20.00 0 0 1
88-
45.00 0 0 1
89-
50.00 0 0 1
90-
55.00 0 0 1
91-
9267
statement ok
9368
CREATE STATISTICS stat_active_products ON status FROM products WHERE status = 'active'
9469

@@ -111,17 +86,6 @@ WHERE statistics_name = 'stat_mid_range_products'
11186
statistics_name column_names row_count distinct_count null_count partial_predicate
11287
stat_mid_range_products {category_id} 8 2 0 category_id BETWEEN 1 AND 2
11388

114-
statement ok
115-
CREATE STATISTICS stat_specific_categories ON category_id FROM products WHERE category_id IN (1, 3)
116-
117-
query TTIIIT colnames
118-
SELECT statistics_name, column_names, row_count, distinct_count, null_count, partial_predicate
119-
FROM [SHOW STATISTICS FOR TABLE products]
120-
WHERE statistics_name = 'stat_specific_categories'
121-
----
122-
statistics_name column_names row_count distinct_count null_count partial_predicate
123-
stat_specific_categories {category_id} 6 2 0 category_id IN (1, 3)
124-
12589
# Verify error cases
12690
statement error pq: partial statistics with WHERE must be on a single column
12791
CREATE STATISTICS stat_multi_col ON category_id, price FROM products WHERE status = 'active'
@@ -147,6 +111,12 @@ CREATE STATISTICS stat_subquery ON category_id FROM products WHERE category_id I
147111
statement error pq: predicate could not become a constrained scan of an index
148112
CREATE STATISTICS stat_func ON status FROM products WHERE length(status) > 6
149113

114+
statement error pq: unimplemented: CREATE STATISTICS with a WHERE clause that produces multiple spans is not yet supported
115+
CREATE STATISTICS stat_func ON price FROM products WHERE price < 20.00 OR price > 30.00
116+
117+
statement error pq: unimplemented: CREATE STATISTICS with a WHERE clause that produces multiple spans is not yet supported
118+
CREATE STATISTICS stat_specific_categories ON category_id FROM products WHERE category_id IN (1, 3)
119+
150120
statement error syntax error
151121
CREATE STATISTICS stat_error2 ON price FROM products WHERE price >
152122

pkg/sql/logictest/testdata/logic_test/distsql_stats

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2675,6 +2675,18 @@ RESET enable_create_stats_using_extremes
26752675
statement error pq: multi-column partial statistics are not currently supported
26762676
CREATE STATISTICS abcd_a_b ON a, c FROM abcd USING EXTREMES;
26772677

2678+
statement ok
2679+
SET CLUSTER SETTING sql.stats.histogram_collection.enabled = false
2680+
2681+
statement error pq: partial statistics without histograms are not supported
2682+
CREATE STATISTICS abcd_a_b ON a FROM abcd USING EXTREMES;
2683+
2684+
statement error pq: partial statistics without histograms are not supported
2685+
CREATE STATISTICS abcd_a_b ON a FROM abcd WHERE a > 5;
2686+
2687+
statement ok
2688+
RESET CLUSTER SETTING sql.stats.histogram_collection.enabled
2689+
26782690
# Verify that a non-inverted index string column with a string histogram
26792691
# can have partial statistics
26802692
statement ok
@@ -2730,9 +2742,15 @@ CREATE TABLE xyz (x INT, y INT, z INT, INDEX (x, y));
27302742
statement error pq: column x does not have a prior statistic
27312743
EXPLAIN ANALYZE CREATE STATISTICS xyz_x ON x FROM xyz USING EXTREMES;
27322744

2745+
statement error pq: column x does not have a prior statistic
2746+
EXPLAIN ANALYZE CREATE STATISTICS xyz_x ON x FROM xyz WHERE x > 5;
2747+
27332748
statement error pq: the latest full statistic for column a has no histogram
27342749
EXPLAIN ANALYZE CREATE STATISTICS u_partial ON a FROM u USING EXTREMES;
27352750

2751+
statement error pq: the latest full statistic for column a has no histogram
2752+
EXPLAIN ANALYZE CREATE STATISTICS u_partial ON a FROM u WHERE a > 5;
2753+
27362754
statement error pq: table xy does not contain a non-partial forward index with y as a prefix column
27372755
CREATE STATISTICS xy_y_partial ON y FROM xy USING EXTREMES;
27382756

@@ -2750,9 +2768,12 @@ CREATE STATISTICS only_null_stat ON a FROM only_null;
27502768
statement ok
27512769
SELECT crdb_internal.clear_table_stats_cache();
27522770

2753-
statement error pq: only outer or NULL bounded buckets exist in only_null@only_null_a_idx \(table ID \d+, column IDs \[1\]\), so partial stats cannot be collected
2771+
statement error pq: the latest full statistic histogram for column a has only NULL values
27542772
EXPLAIN ANALYZE CREATE STATISTICS only_null_partial ON a FROM only_null USING EXTREMES;
27552773

2774+
statement error pq: the latest full statistic histogram for column a has only NULL values
2775+
EXPLAIN ANALYZE CREATE STATISTICS only_null_partial ON a FROM only_null WHERE a > 5;
2776+
27562777
statement ok
27572778
CREATE INDEX ON xy (y) WHERE y > 5;
27582779

pkg/sql/opt/cat/table.go

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -271,10 +271,8 @@ type TableStatistic interface {
271271
// inverted index histograms, this will always return types.Bytes.
272272
HistogramType() *types.T
273273

274-
// IsPartial returns true if this statistic was collected with a where
275-
// clause. (If the where clause was something like "WHERE 1 = 1" or "WHERE
276-
// true" this could technically be a full statistic rather than a partial
277-
// statistic, but this function does not check for this.)
274+
// IsPartial returns true if this statistic was collected with USING EXTREMES
275+
// or with a WHERE clause.
278276
IsPartial() bool
279277

280278
// IsMerged returns true if this statistic was created by merging a partial

pkg/sql/opt/exec/execbuilder/testdata/partial_stats

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1478,3 +1478,125 @@ vectorized: true
14781478
spans: [/10 - /10]
14791479

14801480
subtest end
1481+
1482+
# Ensure that constrained partial statistics are merged into full statistics
1483+
# and used by the optimizer.
1484+
1485+
statement ok
1486+
CREATE TABLE m (b INT PRIMARY KEY)
1487+
1488+
statement ok
1489+
ALTER TABLE m INJECT STATISTICS '[
1490+
{
1491+
"id": 1,
1492+
"avg_size": 1,
1493+
"columns": ["b"],
1494+
"created_at": "2000-01-01 00:00:00.000000",
1495+
"distinct_count": 4,
1496+
"histo_buckets": [
1497+
{"distinct_range": 0, "num_eq": 1, "num_range": 0, "upper_bound": "10"},
1498+
{"distinct_range": 0, "num_eq": 1, "num_range": 0, "upper_bound": "20"},
1499+
{"distinct_range": 0, "num_eq": 1, "num_range": 0, "upper_bound": "30"},
1500+
{"distinct_range": 0, "num_eq": 1, "num_range": 0, "upper_bound": "40"}
1501+
],
1502+
"histo_col_type": "INT8",
1503+
"histo_version": 2,
1504+
"name": "full",
1505+
"null_count": 0,
1506+
"row_count": 4
1507+
},
1508+
{
1509+
"id": 2,
1510+
"avg_size": 1,
1511+
"columns": ["b"],
1512+
"created_at": "2000-01-02 00:00:00.000000",
1513+
"distinct_count": 4,
1514+
"histo_buckets": [
1515+
{"distinct_range": 0, "num_eq": 3, "num_range": 0, "upper_bound": "20"},
1516+
{"distinct_range": 2, "num_eq": 2, "num_range": 3, "upper_bound": "30"}
1517+
],
1518+
"histo_col_type": "INT8",
1519+
"histo_version": 2,
1520+
"name": "partial",
1521+
"null_count": 0,
1522+
"partial_predicate": "((b > 15:::INT8) AND (b < 35:::INT8))",
1523+
"row_count": 8,
1524+
"full_statistic_id": 1
1525+
}
1526+
]'
1527+
1528+
query TTTIIII colnames
1529+
SELECT statistics_name, column_names, created, row_count, distinct_count, null_count, avg_size
1530+
FROM [SHOW STATISTICS FOR TABLE m WITH MERGE]
1531+
ORDER BY created
1532+
----
1533+
statistics_name column_names created row_count distinct_count null_count avg_size
1534+
full {b} 2000-01-01 00:00:00 +0000 UTC 4 4 0 1
1535+
partial {b} 2000-01-02 00:00:00 +0000 UTC 8 4 0 1
1536+
__merged__ {b} 2000-01-02 00:00:00 +0000 UTC 10 6 0 1
1537+
1538+
query T
1539+
SELECT jsonb_pretty(stat)
1540+
FROM (
1541+
SELECT jsonb_array_elements(statistics) AS stat FROM [SHOW STATISTICS USING JSON FOR TABLE m WITH MERGE]
1542+
)
1543+
WHERE stat->>'name' = '__merged__';
1544+
----
1545+
{
1546+
"avg_size": 1,
1547+
"columns": [
1548+
"b"
1549+
],
1550+
"created_at": "2000-01-02 00:00:00",
1551+
"distinct_count": 6,
1552+
"histo_buckets": [
1553+
{
1554+
"distinct_range": 0,
1555+
"num_eq": 1,
1556+
"num_range": 0,
1557+
"upper_bound": "10"
1558+
},
1559+
{
1560+
"distinct_range": 0,
1561+
"num_eq": 3,
1562+
"num_range": 0,
1563+
"upper_bound": "20"
1564+
},
1565+
{
1566+
"distinct_range": 2,
1567+
"num_eq": 2,
1568+
"num_range": 3,
1569+
"upper_bound": "30"
1570+
},
1571+
{
1572+
"distinct_range": 0,
1573+
"num_eq": 1,
1574+
"num_range": 0,
1575+
"upper_bound": "40"
1576+
}
1577+
],
1578+
"histo_col_type": "INT8",
1579+
"histo_version": 3,
1580+
"name": "__merged__",
1581+
"null_count": 0,
1582+
"row_count": 10
1583+
}
1584+
1585+
# Now clear the stats cache so that the query below is guaranteed to pick up the
1586+
# new stats (partial and merged).
1587+
statement ok
1588+
SELECT crdb_internal.clear_table_stats_cache();
1589+
1590+
query T
1591+
EXPLAIN (OPT, VERBOSE) SELECT * FROM m WHERE b < 35
1592+
----
1593+
scan m
1594+
├── columns: b:1
1595+
├── constraint: /1: [ - /34]
1596+
├── stats: [rows=9, distinct(1)=5, null(1)=0]
1597+
│ histogram(1)= 0 1 0 3 3 2 0 0
1598+
│ <--- 10 --- 20 --- 30 --- 34
1599+
├── cost: 27.11
1600+
├── cost-flags: unbounded-cardinality
1601+
├── key: (1)
1602+
└── distribution: test

pkg/sql/opt/optbuilder/misc_statements.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,5 +323,10 @@ func (b *Builder) buildWhereForStatistics(
323323
"predicate is a contradiction"))
324324
}
325325

326+
if cons.Spans.Count() > 1 {
327+
panic(unimplemented.NewWithIssue(154040,
328+
"CREATE STATISTICS with a WHERE clause that produces multiple spans is not yet supported"))
329+
}
330+
326331
return indexOrd, &cons
327332
}

pkg/sql/opt/props/BUILD.bazel

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,12 @@ go_library(
2323
"//pkg/sql/opt",
2424
"//pkg/sql/opt/cat",
2525
"//pkg/sql/opt/constraint",
26-
"//pkg/sql/rowenc/keyside",
2726
"//pkg/sql/sem/eval",
2827
"//pkg/sql/sem/tree",
28+
"//pkg/sql/sem/tree/datumrange",
2929
"//pkg/sql/sem/volatility",
3030
"//pkg/sql/types",
3131
"//pkg/util/buildutil",
32-
"//pkg/util/encoding",
33-
"//pkg/util/timetz",
3432
"@com_github_cockroachdb_errors//:errors",
3533
"@com_github_cockroachdb_redact//:redact",
3634
"@com_github_olekukonko_tablewriter//:tablewriter",

0 commit comments

Comments
 (0)