Skip to content

Commit f4bccfc

Browse files
committed
opt: estimate worst-case selectivity of placeholder equalities
Previously, we calculated the selectivity of placeholder equality filters, e.g., `x = $1`, using the distinct count of a column and total row count. This represented an average-case selectivity. Now, we instead estimate the worst-case selectivity using the maximum frequency of the histogram of the constrained column. This helps avoid choosing a generic query plan under `plan_cache_mode=auto` that performs poorly for heavy-hitter placeholder values.

Fixes #151373

Release note (performance improvement): The cost of generic query plans is now calculated based on worst-case selectivities for placeholder equalities (e.g., `x = $1`). This reduces the chance of suboptimal generic query plans being chosen when `plan_cache_mode=auto`.
1 parent e7cb6d4 commit f4bccfc

File tree

15 files changed

+523
-80
lines changed

15 files changed

+523
-80
lines changed

pkg/sql/opt/exec/execbuilder/testdata/explain_redact

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -697,7 +697,7 @@ upsert bc
697697
query T
698698
EXPLAIN (OPT, MEMO, REDACT) INSERT INTO bc SELECT a::float + 1 FROM a ON CONFLICT (b) DO UPDATE SET b = bc.b + 100
699699
----
700-
memo (optimized, ~36KB, required=[presentation: info:25] [distribution: test])
700+
memo (optimized, ~37KB, required=[presentation: info:25] [distribution: test])
701701
├── G1: (explain G2 [distribution: test])
702702
│ └── [presentation: info:25] [distribution: test]
703703
│ ├── best: (explain G2="[distribution: test]" [distribution: test])
@@ -2497,7 +2497,7 @@ project
24972497
query T
24982498
EXPLAIN (OPT, MEMO, REDACT) SELECT * FROM bc JOIN f ON b = f + 1
24992499
----
2500-
memo (optimized, ~28KB, required=[presentation: info:14] [distribution: test])
2500+
memo (optimized, ~29KB, required=[presentation: info:14] [distribution: test])
25012501
├── G1: (explain G2 [presentation: b:1,c:2,f:7] [distribution: test])
25022502
│ └── [presentation: info:14] [distribution: test]
25032503
│ ├── best: (explain G2="[presentation: b:1,c:2,f:7] [distribution: test]" [presentation: b:1,c:2,f:7] [distribution: test])

pkg/sql/opt/memo/expr_format.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -885,12 +885,20 @@ func (f *ExprFmtCtx) formatRelational(e RelExpr, tp treeprinter.Node) {
885885
if relational.HasPlaceholder {
886886
writeFlag("has-placeholder")
887887
}
888+
if p, ok := e.Private().(*JoinPrivate); ok {
889+
if !p.ParameterizedCols.Empty() {
890+
tp.Childf("parameterized columns: %s", p.ParameterizedCols)
891+
}
892+
}
888893
if lookupJoin, ok := e.(*LookupJoinExpr); ok {
889894
// For lookup joins, indicate whether reverse scans are required to
890895
// satisfy the ordering.
891896
if lookupJoinMustUseReverseScans(md, lookupJoin, &required.Ordering) {
892897
writeFlag("reverse-scans")
893898
}
899+
if !lookupJoin.ParameterizedCols.Empty() {
900+
tp.Childf("parameterized columns: %s", lookupJoin.ParameterizedCols)
901+
}
894902
}
895903

896904
if f.Buffer.Len() != 0 {

pkg/sql/opt/memo/statistics_builder.go

Lines changed: 139 additions & 42 deletions
Large diffs are not rendered by default.

pkg/sql/opt/memo/testdata/stats/generic

Lines changed: 298 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,15 @@ exec-ddl
22
CREATE TABLE t (
33
k INT PRIMARY KEY,
44
i INT,
5-
s STRING
5+
s STRING,
6+
INDEX (i)
67
)
78
----
89

10+
# ------------------------
11+
# Tests without Histograms
12+
# ------------------------
13+
914
exec-ddl
1015
ALTER TABLE t INJECT STATISTICS '[
1116
{
@@ -166,3 +171,295 @@ select
166171
│ └── fd: (1)-->(2,3)
167172
└── filters
168173
└── (i:2 = $1) OR (s:3 = $2) [type=bool, outer=(2,3)]
174+
175+
# ---------------------
176+
# Tests with Histograms
177+
# ---------------------
178+
179+
exec-ddl
180+
ALTER TABLE t INJECT STATISTICS '[
181+
{
182+
"columns": ["k"],
183+
"created_at": "2018-01-01 1:00:00.00000+00:00",
184+
"row_count": 1000,
185+
"distinct_count": 1000
186+
},
187+
{
188+
"columns": ["i"],
189+
"created_at": "2018-01-01 1:00:00.00000+00:00",
190+
"row_count": 1000,
191+
"distinct_count": 41,
192+
"null_count": 30,
193+
"avg_size": 2,
194+
"histo_col_type": "int",
195+
"histo_buckets": [
196+
{"num_eq": 0, "num_range": 0, "distinct_range": 0, "upper_bound": "0"},
197+
{"num_eq": 10, "num_range": 90, "distinct_range": 9, "upper_bound": "100"},
198+
{"num_eq": 10, "num_range": 180, "distinct_range": 9, "upper_bound": "200"},
199+
{"num_eq": 20, "num_range": 270, "distinct_range": 9, "upper_bound": "300"},
200+
{"num_eq": 30, "num_range": 360, "distinct_range": 9, "upper_bound": "400"}
201+
]
202+
},
203+
{
204+
"columns": ["s"],
205+
"created_at": "2018-01-01 1:00:00.00000+00:00",
206+
"row_count": 1000,
207+
"distinct_count": 20,
208+
"avg_size": 3,
209+
"histo_col_type": "string",
210+
"histo_buckets": [
211+
{"num_eq": 0, "num_range": 0, "distinct_range": 0, "upper_bound": "apple"},
212+
{"num_eq": 300, "num_range": 100, "distinct_range": 9, "upper_bound": "banana"},
213+
{"num_eq": 500, "num_range": 100, "distinct_range": 9, "upper_bound": "cherry"}
214+
]
215+
}
216+
]'
217+
----
218+
219+
norm
220+
SELECT * FROM t WHERE k = $1
221+
----
222+
select
223+
├── columns: k:1(int!null) i:2(int) s:3(string)
224+
├── cardinality: [0 - 1]
225+
├── has-placeholder
226+
├── stats: [rows=1, distinct(1)=1, null(1)=0]
227+
├── key: ()
228+
├── fd: ()-->(1-3)
229+
├── scan t
230+
│ ├── columns: k:1(int!null) i:2(int) s:3(string)
231+
│ ├── stats: [rows=1000, distinct(1)=1000, null(1)=0]
232+
│ ├── key: (1)
233+
│ └── fd: (1)-->(2,3)
234+
└── filters
235+
└── k:1 = $1 [type=bool, outer=(1), constraints=(/1: (/NULL - ]), fd=()-->(1)]
236+
237+
# The row count of the filter is the max frequency of i's histogram.
238+
norm
239+
SELECT * FROM t WHERE i = $1
240+
----
241+
select
242+
├── columns: k:1(int!null) i:2(int!null) s:3(string)
243+
├── has-placeholder
244+
├── stats: [rows=30, distinct(2)=1, null(2)=0]
245+
├── key: (1)
246+
├── fd: ()-->(2), (1)-->(3)
247+
├── scan t
248+
│ ├── columns: k:1(int!null) i:2(int) s:3(string)
249+
│ ├── stats: [rows=1000, distinct(1)=1000, null(1)=0, distinct(2)=41, null(2)=30]
250+
│ │ histogram(2)= 0 30 0 0 90 10 180 10 270 20 360 30
251+
│ │ <--- NULL --- 0 ---- 100 ----- 200 ----- 300 ----- 400
252+
│ ├── key: (1)
253+
│ └── fd: (1)-->(2,3)
254+
└── filters
255+
└── i:2 = $1 [type=bool, outer=(2), constraints=(/2: (/NULL - ]), fd=()-->(2)]
256+
257+
# Similar case as above, but with opt to ensure the correct row counts are used
258+
# for new memo groups.
259+
opt
260+
SELECT k FROM t WHERE i = $1
261+
----
262+
project
263+
├── columns: k:1(int!null)
264+
├── has-placeholder
265+
├── stats: [rows=30]
266+
├── key: (1)
267+
└── placeholder-scan t@t_i_idx
268+
├── columns: k:1(int!null) i:2(int!null)
269+
├── has-placeholder
270+
├── stats: [rows=30, distinct(2)=1, null(2)=0]
271+
├── key: (1)
272+
├── fd: ()-->(2)
273+
└── span
274+
└── $1 [type=int]
275+
276+
# Similar case as above, but with opt to ensure the correct row counts are used
277+
# for new memo groups.
278+
opt
279+
SELECT * FROM t WHERE i = $1
280+
----
281+
project
282+
├── columns: k:1(int!null) i:2(int!null) s:3(string)
283+
├── has-placeholder
284+
├── stats: [rows=30, distinct(2)=1, null(2)=0]
285+
├── key: (1)
286+
├── fd: ()-->(2), (1)-->(3)
287+
└── inner-join (lookup t)
288+
├── columns: k:1(int!null) i:2(int!null) s:3(string) "$1":6(int!null)
289+
├── key columns: [1] = [1]
290+
├── lookup columns are key
291+
├── has-placeholder
292+
├── stats: [rows=30, distinct(2)=1, null(2)=0, distinct(6)=1, null(6)=0]
293+
├── key: (1)
294+
├── fd: ()-->(2,6), (1)-->(3), (2)==(6), (6)==(2)
295+
├── inner-join (lookup t@t_i_idx)
296+
│ ├── columns: k:1(int!null) i:2(int!null) "$1":6(int!null)
297+
│ ├── flags: disallow merge join
298+
│ ├── key columns: [6] = [2]
299+
│ ├── parameterized columns: (6)
300+
│ ├── has-placeholder
301+
│ ├── stats: [rows=30, distinct(2)=1, null(2)=0, distinct(6)=1, null(6)=0]
302+
│ ├── key: (1)
303+
│ ├── fd: ()-->(2,6), (2)==(6), (6)==(2)
304+
│ ├── values
305+
│ │ ├── columns: "$1":6(int)
306+
│ │ ├── cardinality: [1 - 1]
307+
│ │ ├── has-placeholder
308+
│ │ ├── stats: [rows=1, distinct(6)=1, null(6)=0]
309+
│ │ ├── key: ()
310+
│ │ ├── fd: ()-->(6)
311+
│ │ └── ($1,) [type=tuple{int}]
312+
│ └── filters (true)
313+
└── filters (true)
314+
315+
# The row count of the filter is the max frequency of s's histogram.
316+
norm
317+
SELECT * FROM t WHERE $1 = s
318+
----
319+
select
320+
├── columns: k:1(int!null) i:2(int) s:3(string!null)
321+
├── has-placeholder
322+
├── stats: [rows=500, distinct(3)=1, null(3)=0]
323+
├── key: (1)
324+
├── fd: ()-->(3), (1)-->(2)
325+
├── scan t
326+
│ ├── columns: k:1(int!null) i:2(int) s:3(string)
327+
│ ├── stats: [rows=1000, distinct(1)=1000, null(1)=0, distinct(3)=20, null(3)=0]
328+
│ │ histogram(3)= 0 0 100 300 100 500
329+
│ │ <--- 'apple' ----- 'banana' ----- 'cherry'
330+
│ ├── key: (1)
331+
│ └── fd: (1)-->(2,3)
332+
└── filters
333+
└── s:3 = $1 [type=bool, outer=(3), constraints=(/3: (/NULL - ]), fd=()-->(3)]
334+
335+
# Similar case to the previous one, but with a join on a values expression to
336+
# mimic a parameterized join of a generic query plan.
337+
# TODO(mgartner): The row count of the inner-join should be 500, because that is
338+
# the maximum frequency of s. It is currently 50 because v.s is not marked
339+
# as a "parameterized column", which only happens during the
340+
# GenerateParameterizedJoin exploration rule. I think we can address this by
341+
# including parameterized columns in logical properties and propagating them
342+
# upward.
343+
norm
344+
SELECT * FROM (VALUES ($1::STRING)) v(s) JOIN t ON t.s = v.s
345+
----
346+
inner-join (hash)
347+
├── columns: s:1(string!null) k:2(int!null) i:3(int) s:4(string!null)
348+
├── multiplicity: left-rows(zero-or-more), right-rows(zero-or-one)
349+
├── has-placeholder
350+
├── stats: [rows=50, distinct(1)=1, null(1)=0, distinct(4)=1, null(4)=0]
351+
├── key: (2)
352+
├── fd: ()-->(1,4), (2)-->(3), (1)==(4), (4)==(1)
353+
├── values
354+
│ ├── columns: column1:1(string)
355+
│ ├── cardinality: [1 - 1]
356+
│ ├── has-placeholder
357+
│ ├── stats: [rows=1, distinct(1)=1, null(1)=0]
358+
│ ├── key: ()
359+
│ ├── fd: ()-->(1)
360+
│ └── ($1,) [type=tuple{string}]
361+
├── scan t
362+
│ ├── columns: k:2(int!null) i:3(int) s:4(string)
363+
│ ├── stats: [rows=1000, distinct(4)=20, null(4)=0]
364+
│ │ histogram(4)= 0 0 100 300 100 500
365+
│ │ <--- 'apple' ----- 'banana' ----- 'cherry'
366+
│ ├── key: (2)
367+
│ └── fd: (2)-->(3,4)
368+
└── filters
369+
└── s:4 = column1:1 [type=bool, outer=(1,4), constraints=(/1: (/NULL - ]; /4: (/NULL - ]), fd=(1)==(4), (4)==(1)]
370+
371+
# The row count of the filter is based on the product of selectivities from the
372+
# max frequencies of i's and s's histograms.
373+
norm
374+
SELECT * FROM t WHERE i = $1 AND s = $2
375+
----
376+
select
377+
├── columns: k:1(int!null) i:2(int!null) s:3(string!null)
378+
├── has-placeholder
379+
├── stats: [rows=15, distinct(2)=1, null(2)=0, distinct(3)=1, null(3)=0, distinct(2,3)=1, null(2,3)=0]
380+
├── key: (1)
381+
├── fd: ()-->(2,3)
382+
├── scan t
383+
│ ├── columns: k:1(int!null) i:2(int) s:3(string)
384+
│ ├── stats: [rows=1000, distinct(1)=1000, null(1)=0, distinct(2)=41, null(2)=30, distinct(3)=20, null(3)=0, distinct(2,3)=820, null(2,3)=0]
385+
│ │ histogram(2)= 0 30 0 0 90 10 180 10 270 20 360 30
386+
│ │ <--- NULL --- 0 ---- 100 ----- 200 ----- 300 ----- 400
387+
│ │ histogram(3)= 0 0 100 300 100 500
388+
│ │ <--- 'apple' ----- 'banana' ----- 'cherry'
389+
│ ├── key: (1)
390+
│ └── fd: (1)-->(2,3)
391+
└── filters
392+
├── i:2 = $1 [type=bool, outer=(2), constraints=(/2: (/NULL - ]), fd=()-->(2)]
393+
└── s:3 = $2 [type=bool, outer=(3), constraints=(/3: (/NULL - ]), fd=()-->(3)]
394+
395+
norm
396+
SELECT * FROM t WHERE i > $1
397+
----
398+
select
399+
├── columns: k:1(int!null) i:2(int!null) s:3(string)
400+
├── has-placeholder
401+
├── stats: [rows=323.333, distinct(2)=41, null(2)=0]
402+
├── key: (1)
403+
├── fd: (1)-->(2,3)
404+
├── scan t
405+
│ ├── columns: k:1(int!null) i:2(int) s:3(string)
406+
│ ├── stats: [rows=1000, distinct(1)=1000, null(1)=0, distinct(2)=41, null(2)=30]
407+
│ │ histogram(2)= 0 30 0 0 90 10 180 10 270 20 360 30
408+
│ │ <--- NULL --- 0 ---- 100 ----- 200 ----- 300 ----- 400
409+
│ ├── key: (1)
410+
│ └── fd: (1)-->(2,3)
411+
└── filters
412+
└── i:2 > $1 [type=bool, outer=(2), constraints=(/2: (/NULL - ])]
413+
414+
norm
415+
SELECT * FROM t WHERE i = $1 OR i = $2
416+
----
417+
select
418+
├── columns: k:1(int!null) i:2(int!null) s:3(string)
419+
├── has-placeholder
420+
├── stats: [rows=323.333, distinct(2)=41, null(2)=0]
421+
├── key: (1)
422+
├── fd: (1)-->(2,3)
423+
├── scan t
424+
│ ├── columns: k:1(int!null) i:2(int) s:3(string)
425+
│ ├── stats: [rows=1000, distinct(1)=1000, null(1)=0, distinct(2)=41, null(2)=30]
426+
│ │ histogram(2)= 0 30 0 0 90 10 180 10 270 20 360 30
427+
│ │ <--- NULL --- 0 ---- 100 ----- 200 ----- 300 ----- 400
428+
│ ├── key: (1)
429+
│ └── fd: (1)-->(2,3)
430+
└── filters
431+
└── (i:2 = $1) OR (i:2 = $2) [type=bool, outer=(2), constraints=(/2: (/NULL - ])]
432+
433+
norm
434+
SELECT * FROM t WHERE i IN ($1, $2, $3)
435+
----
436+
select
437+
├── columns: k:1(int!null) i:2(int) s:3(string)
438+
├── has-placeholder
439+
├── stats: [rows=333.333]
440+
├── key: (1)
441+
├── fd: (1)-->(2,3)
442+
├── scan t
443+
│ ├── columns: k:1(int!null) i:2(int) s:3(string)
444+
│ ├── stats: [rows=1000, distinct(1)=1000, null(1)=0]
445+
│ ├── key: (1)
446+
│ └── fd: (1)-->(2,3)
447+
└── filters
448+
└── i:2 IN ($1, $2, $3) [type=bool, outer=(2)]
449+
450+
norm
451+
SELECT * FROM t WHERE i = $1 OR s = $2
452+
----
453+
select
454+
├── columns: k:1(int!null) i:2(int) s:3(string)
455+
├── has-placeholder
456+
├── stats: [rows=333.333]
457+
├── key: (1)
458+
├── fd: (1)-->(2,3)
459+
├── scan t
460+
│ ├── columns: k:1(int!null) i:2(int) s:3(string)
461+
│ ├── stats: [rows=1000, distinct(1)=1000, null(1)=0]
462+
│ ├── key: (1)
463+
│ └── fd: (1)-->(2,3)
464+
└── filters
465+
└── (i:2 = $1) OR (s:3 = $2) [type=bool, outer=(2,3)]

pkg/sql/opt/ops/relational.opt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,11 @@ define JoinPrivate {
307307
# SkipReorderJoins indicates whether the ReorderJoins rule should match this
308308
# join.
309309
SkipReorderJoins bool
310+
311+
# ParameterizedCols is the set of columns that are equivalent to placeholder
312+
# values. These columns are typically created when exploring parameterized
313+
# joins for generic query plans.
314+
ParameterizedCols ColSet
310315
}
311316

312317
# IndexJoin represents an inner join between an input expression and a primary

pkg/sql/opt/props/selectivity.go

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -89,10 +89,16 @@ func (s *Selectivity) Divide(other Selectivity) {
8989

9090
// MinSelectivity returns the smaller value of two selectivities.
9191
func MinSelectivity(a, b Selectivity) Selectivity {
92-
if a.selectivity < b.selectivity {
93-
return a
92+
return Selectivity{
93+
selectivity: min(a.selectivity, b.selectivity),
94+
}
95+
}
96+
97+
// MinSelectivity3 returns the smallest value of three selectivities.
98+
func MinSelectivity3(a, b, c Selectivity) Selectivity {
99+
return Selectivity{
100+
selectivity: min(a.selectivity, b.selectivity, c.selectivity),
94101
}
95-
return b
96102
}
97103

98104
// MaxSelectivity returns the larger value of two selectivities.

0 commit comments

Comments
 (0)