cockroachdb
diff --git a/‎pkg/sql/opt/exec/execbuilder/testdata/explain_redact‎
Lines changed: 2 additions & 2 deletions b/‎pkg/sql/opt/exec/execbuilder/testdata/explain_redact‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎pkg/sql/opt/memo/expr_format.go‎
Lines changed: 8 additions & 0 deletions b/‎pkg/sql/opt/memo/expr_format.go‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎pkg/sql/opt/memo/statistics_builder.go‎
Lines changed: 139 additions & 42 deletions b/‎pkg/sql/opt/memo/statistics_builder.go‎
Lines changed: 139 additions & 42 deletions
diff --git a/‎pkg/sql/opt/memo/testdata/stats/generic‎
Lines changed: 298 additions & 1 deletion b/‎pkg/sql/opt/memo/testdata/stats/generic‎
Lines changed: 298 additions & 1 deletion
diff --git a/‎pkg/sql/opt/ops/relational.opt‎
Lines changed: 5 additions & 0 deletions b/‎pkg/sql/opt/ops/relational.opt‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎pkg/sql/opt/props/selectivity.go‎
Lines changed: 9 additions & 3 deletions b/‎pkg/sql/opt/props/selectivity.go‎
Lines changed: 9 additions & 3 deletions
@@ -697,7 +697,7 @@ upsert bc
 query T
 EXPLAIN (OPT, MEMO, REDACT) INSERT INTO bc SELECT a::float + 1 FROM a ON CONFLICT (b) DO UPDATE SET b = bc.b + 100
 ----
-memo (optimized, ~36KB, required=[presentation: info:25] [distribution: test])
+memo (optimized, ~37KB, required=[presentation: info:25] [distribution: test])
  ├── G1: (explain G2 [distribution: test])
  │    └── [presentation: info:25] [distribution: test]
  │         ├── best: (explain G2="[distribution: test]" [distribution: test])
@@ -2497,7 +2497,7 @@ project
 query T
 EXPLAIN (OPT, MEMO, REDACT) SELECT * FROM bc JOIN f ON b = f + 1
 ----
-memo (optimized, ~28KB, required=[presentation: info:14] [distribution: test])
+memo (optimized, ~29KB, required=[presentation: info:14] [distribution: test])
  ├── G1: (explain G2 [presentation: b:1,c:2,f:7] [distribution: test])
  │    └── [presentation: info:14] [distribution: test]
  │         ├── best: (explain G2="[presentation: b:1,c:2,f:7] [distribution: test]" [presentation: b:1,c:2,f:7] [distribution: test])
 
@@ -885,12 +885,20 @@ func (f *ExprFmtCtx) formatRelational(e RelExpr, tp treeprinter.Node) {
 		if relational.HasPlaceholder {
 			writeFlag("has-placeholder")
 		}
+		if p, ok := e.Private().(*JoinPrivate); ok {
+			if !p.ParameterizedCols.Empty() {
+				tp.Childf("parameterized columns: %s", p.ParameterizedCols)
+			}
+		}
 		if lookupJoin, ok := e.(*LookupJoinExpr); ok {
 			// For lookup joins, indicate whether reverse scans are required to
 			// satisfy the ordering.
 			if lookupJoinMustUseReverseScans(md, lookupJoin, &required.Ordering) {
 				writeFlag("reverse-scans")
 			}
+			if !lookupJoin.ParameterizedCols.Empty() {
+				tp.Childf("parameterized columns: %s", lookupJoin.ParameterizedCols)
+			}
 		}
 
 		if f.Buffer.Len() != 0 {
 
@@ -2,10 +2,15 @@ exec-ddl
 CREATE TABLE t (
   k INT PRIMARY KEY,
   i INT,
-  s STRING
+  s STRING,
+  INDEX (i)
 )
 ----
 
+# ------------------------
+# Tests without Histograms
+# ------------------------
+
 exec-ddl
 ALTER TABLE t INJECT STATISTICS '[
   {
@@ -166,3 +171,295 @@ select
  │    └── fd: (1)-->(2,3)
  └── filters
       └── (i:2 = $1) OR (s:3 = $2) [type=bool, outer=(2,3)]
+
+# ---------------------
+# Tests with Histograms
+# ---------------------
+
+exec-ddl
+ALTER TABLE t INJECT STATISTICS '[
+  {
+    "columns": ["k"],
+    "created_at": "2018-01-01 1:00:00.00000+00:00",
+    "row_count": 1000,
+    "distinct_count": 1000
+  },
+  {
+    "columns": ["i"],
+    "created_at": "2018-01-01 1:00:00.00000+00:00",
+    "row_count": 1000,
+    "distinct_count": 41,
+    "null_count": 30,
+    "avg_size": 2,
+    "histo_col_type": "int",
+    "histo_buckets": [
+      {"num_eq": 0, "num_range": 0, "distinct_range": 0, "upper_bound": "0"},
+      {"num_eq": 10, "num_range": 90, "distinct_range": 9, "upper_bound": "100"},
+      {"num_eq": 10, "num_range": 180, "distinct_range": 9, "upper_bound": "200"},
+      {"num_eq": 20, "num_range": 270, "distinct_range": 9, "upper_bound": "300"},
+      {"num_eq": 30, "num_range": 360, "distinct_range": 9, "upper_bound": "400"}
+    ]
+  },
+  {
+    "columns": ["s"],
+    "created_at": "2018-01-01 1:00:00.00000+00:00",
+    "row_count": 1000,
+    "distinct_count": 20,
+    "avg_size": 3,
+    "histo_col_type": "string",
+    "histo_buckets": [
+      {"num_eq": 0, "num_range": 0, "distinct_range": 0, "upper_bound": "apple"},
+      {"num_eq": 300, "num_range": 100, "distinct_range": 9, "upper_bound": "banana"},
+      {"num_eq": 500, "num_range": 100, "distinct_range": 9, "upper_bound": "cherry"}
+    ]
+  }
+]'
+----
+
+norm
+SELECT * FROM t WHERE k = $1
+----
+select
+ ├── columns: k:1(int!null) i:2(int) s:3(string)
+ ├── cardinality: [0 - 1]
+ ├── has-placeholder
+ ├── stats: [rows=1, distinct(1)=1, null(1)=0]
+ ├── key: ()
+ ├── fd: ()-->(1-3)
+ ├── scan t
+ │    ├── columns: k:1(int!null) i:2(int) s:3(string)
+ │    ├── stats: [rows=1000, distinct(1)=1000, null(1)=0]
+ │    ├── key: (1)
+ │    └── fd: (1)-->(2,3)
+ └── filters
+      └── k:1 = $1 [type=bool, outer=(1), constraints=(/1: (/NULL - ]), fd=()-->(1)]
+
+# The row count of the filter is the max frequency of i's histogram.
+norm
+SELECT * FROM t WHERE i = $1
+----
+select
+ ├── columns: k:1(int!null) i:2(int!null) s:3(string)
+ ├── has-placeholder
+ ├── stats: [rows=30, distinct(2)=1, null(2)=0]
+ ├── key: (1)
+ ├── fd: ()-->(2), (1)-->(3)
+ ├── scan t
+ │    ├── columns: k:1(int!null) i:2(int) s:3(string)
+ │    ├── stats: [rows=1000, distinct(1)=1000, null(1)=0, distinct(2)=41, null(2)=30]
+ │    │   histogram(2)=  0   30   0  0  90  10   180  10   270  20   360  30
+ │    │                <--- NULL --- 0 ---- 100 ----- 200 ----- 300 ----- 400
+ │    ├── key: (1)
+ │    └── fd: (1)-->(2,3)
+ └── filters
+      └── i:2 = $1 [type=bool, outer=(2), constraints=(/2: (/NULL - ]), fd=()-->(2)]
+
+# Similar case as above, but with opt to ensure the correct row counts are used
+# for new memo groups.
+opt
+SELECT k FROM t WHERE i = $1
+----
+project
+ ├── columns: k:1(int!null)
+ ├── has-placeholder
+ ├── stats: [rows=30]
+ ├── key: (1)
+ └── placeholder-scan t@t_i_idx
+      ├── columns: k:1(int!null) i:2(int!null)
+      ├── has-placeholder
+      ├── stats: [rows=30, distinct(2)=1, null(2)=0]
+      ├── key: (1)
+      ├── fd: ()-->(2)
+      └── span
+           └── $1 [type=int]
+
+# Similar case as above, but selecting all columns so that the plan uses lookup
+# joins. This ensures the correct row counts are used for the new memo groups.
+opt
+SELECT * FROM t WHERE i = $1
+----
+project
+ ├── columns: k:1(int!null) i:2(int!null) s:3(string)
+ ├── has-placeholder
+ ├── stats: [rows=30, distinct(2)=1, null(2)=0]
+ ├── key: (1)
+ ├── fd: ()-->(2), (1)-->(3)
+ └── inner-join (lookup t)
+      ├── columns: k:1(int!null) i:2(int!null) s:3(string) "$1":6(int!null)
+      ├── key columns: [1] = [1]
+      ├── lookup columns are key
+      ├── has-placeholder
+      ├── stats: [rows=30, distinct(2)=1, null(2)=0, distinct(6)=1, null(6)=0]
+      ├── key: (1)
+      ├── fd: ()-->(2,6), (1)-->(3), (2)==(6), (6)==(2)
+      ├── inner-join (lookup t@t_i_idx)
+      │    ├── columns: k:1(int!null) i:2(int!null) "$1":6(int!null)
+      │    ├── flags: disallow merge join
+      │    ├── key columns: [6] = [2]
+      │    ├── parameterized columns: (6)
+      │    ├── has-placeholder
+      │    ├── stats: [rows=30, distinct(2)=1, null(2)=0, distinct(6)=1, null(6)=0]
+      │    ├── key: (1)
+      │    ├── fd: ()-->(2,6), (2)==(6), (6)==(2)
+      │    ├── values
+      │    │    ├── columns: "$1":6(int)
+      │    │    ├── cardinality: [1 - 1]
+      │    │    ├── has-placeholder
+      │    │    ├── stats: [rows=1, distinct(6)=1, null(6)=0]
+      │    │    ├── key: ()
+      │    │    ├── fd: ()-->(6)
+      │    │    └── ($1,) [type=tuple{int}]
+      │    └── filters (true)
+      └── filters (true)
+
+# The row count of the filter is the max frequency of s's histogram.
+norm
+SELECT * FROM t WHERE $1 = s
+----
+select
+ ├── columns: k:1(int!null) i:2(int) s:3(string!null)
+ ├── has-placeholder
+ ├── stats: [rows=500, distinct(3)=1, null(3)=0]
+ ├── key: (1)
+ ├── fd: ()-->(3), (1)-->(2)
+ ├── scan t
+ │    ├── columns: k:1(int!null) i:2(int) s:3(string)
+ │    ├── stats: [rows=1000, distinct(1)=1000, null(1)=0, distinct(3)=20, null(3)=0]
+ │    │   histogram(3)=  0     0     100    300     100    500
+ │    │                <--- 'apple' ----- 'banana' ----- 'cherry'
+ │    ├── key: (1)
+ │    └── fd: (1)-->(2,3)
+ └── filters
+      └── s:3 = $1 [type=bool, outer=(3), constraints=(/3: (/NULL - ]), fd=()-->(3)]
+
+# Similar case to the previous one, but with a join on a values expression to
+# mimic a parameterized join of a generic query plan.
+# TODO(mgartner): The row count of the inner-join should be 500, because that is
+# the maximum frequency of s. It is currently 50 because v.s is not marked as a
+# "parameterized column", which only happens during the
+# GenerateParameterizedJoin exploration rule. I think we can address this by
+# including parameterized columns in logical properties and propagating them
+# upward.
+norm
+SELECT * FROM (VALUES ($1::STRING)) v(s) JOIN t ON t.s = v.s
+----
+inner-join (hash)
+ ├── columns: s:1(string!null) k:2(int!null) i:3(int) s:4(string!null)
+ ├── multiplicity: left-rows(zero-or-more), right-rows(zero-or-one)
+ ├── has-placeholder
+ ├── stats: [rows=50, distinct(1)=1, null(1)=0, distinct(4)=1, null(4)=0]
+ ├── key: (2)
+ ├── fd: ()-->(1,4), (2)-->(3), (1)==(4), (4)==(1)
+ ├── values
+ │    ├── columns: column1:1(string)
+ │    ├── cardinality: [1 - 1]
+ │    ├── has-placeholder
+ │    ├── stats: [rows=1, distinct(1)=1, null(1)=0]
+ │    ├── key: ()
+ │    ├── fd: ()-->(1)
+ │    └── ($1,) [type=tuple{string}]
+ ├── scan t
+ │    ├── columns: k:2(int!null) i:3(int) s:4(string)
+ │    ├── stats: [rows=1000, distinct(4)=20, null(4)=0]
+ │    │   histogram(4)=  0     0     100    300     100    500
+ │    │                <--- 'apple' ----- 'banana' ----- 'cherry'
+ │    ├── key: (2)
+ │    └── fd: (2)-->(3,4)
+ └── filters
+      └── s:4 = column1:1 [type=bool, outer=(1,4), constraints=(/1: (/NULL - ]; /4: (/NULL - ]), fd=(1)==(4), (4)==(1)]
+
+# The row count of the filter is based on the product of selectivities from the
+# max frequencies of i's and s's histograms.
+norm
+SELECT * FROM t WHERE i = $1 AND s = $2
+----
+select
+ ├── columns: k:1(int!null) i:2(int!null) s:3(string!null)
+ ├── has-placeholder
+ ├── stats: [rows=15, distinct(2)=1, null(2)=0, distinct(3)=1, null(3)=0, distinct(2,3)=1, null(2,3)=0]
+ ├── key: (1)
+ ├── fd: ()-->(2,3)
+ ├── scan t
+ │    ├── columns: k:1(int!null) i:2(int) s:3(string)
+ │    ├── stats: [rows=1000, distinct(1)=1000, null(1)=0, distinct(2)=41, null(2)=30, distinct(3)=20, null(3)=0, distinct(2,3)=820, null(2,3)=0]
+ │    │   histogram(2)=  0   30   0  0  90  10   180  10   270  20   360  30
+ │    │                <--- NULL --- 0 ---- 100 ----- 200 ----- 300 ----- 400
+ │    │   histogram(3)=  0     0     100    300     100    500
+ │    │                <--- 'apple' ----- 'banana' ----- 'cherry'
+ │    ├── key: (1)
+ │    └── fd: (1)-->(2,3)
+ └── filters
+      ├── i:2 = $1 [type=bool, outer=(2), constraints=(/2: (/NULL - ]), fd=()-->(2)]
+      └── s:3 = $2 [type=bool, outer=(3), constraints=(/3: (/NULL - ]), fd=()-->(3)]
+
+norm
+SELECT * FROM t WHERE i > $1
+----
+select
+ ├── columns: k:1(int!null) i:2(int!null) s:3(string)
+ ├── has-placeholder
+ ├── stats: [rows=323.333, distinct(2)=41, null(2)=0]
+ ├── key: (1)
+ ├── fd: (1)-->(2,3)
+ ├── scan t
+ │    ├── columns: k:1(int!null) i:2(int) s:3(string)
+ │    ├── stats: [rows=1000, distinct(1)=1000, null(1)=0, distinct(2)=41, null(2)=30]
+ │    │   histogram(2)=  0   30   0  0  90  10   180  10   270  20   360  30
+ │    │                <--- NULL --- 0 ---- 100 ----- 200 ----- 300 ----- 400
+ │    ├── key: (1)
+ │    └── fd: (1)-->(2,3)
+ └── filters
+      └── i:2 > $1 [type=bool, outer=(2), constraints=(/2: (/NULL - ])]
+
+norm
+SELECT * FROM t WHERE i = $1 OR i = $2
+----
+select
+ ├── columns: k:1(int!null) i:2(int!null) s:3(string)
+ ├── has-placeholder
+ ├── stats: [rows=323.333, distinct(2)=41, null(2)=0]
+ ├── key: (1)
+ ├── fd: (1)-->(2,3)
+ ├── scan t
+ │    ├── columns: k:1(int!null) i:2(int) s:3(string)
+ │    ├── stats: [rows=1000, distinct(1)=1000, null(1)=0, distinct(2)=41, null(2)=30]
+ │    │   histogram(2)=  0   30   0  0  90  10   180  10   270  20   360  30
+ │    │                <--- NULL --- 0 ---- 100 ----- 200 ----- 300 ----- 400
+ │    ├── key: (1)
+ │    └── fd: (1)-->(2,3)
+ └── filters
+      └── (i:2 = $1) OR (i:2 = $2) [type=bool, outer=(2), constraints=(/2: (/NULL - ])]
+
+norm
+SELECT * FROM t WHERE i IN ($1, $2, $3)
+----
+select
+ ├── columns: k:1(int!null) i:2(int) s:3(string)
+ ├── has-placeholder
+ ├── stats: [rows=333.333]
+ ├── key: (1)
+ ├── fd: (1)-->(2,3)
+ ├── scan t
+ │    ├── columns: k:1(int!null) i:2(int) s:3(string)
+ │    ├── stats: [rows=1000, distinct(1)=1000, null(1)=0]
+ │    ├── key: (1)
+ │    └── fd: (1)-->(2,3)
+ └── filters
+      └── i:2 IN ($1, $2, $3) [type=bool, outer=(2)]
+
+norm
+SELECT * FROM t WHERE i = $1 OR s = $2
+----
+select
+ ├── columns: k:1(int!null) i:2(int) s:3(string)
+ ├── has-placeholder
+ ├── stats: [rows=333.333]
+ ├── key: (1)
+ ├── fd: (1)-->(2,3)
+ ├── scan t
+ │    ├── columns: k:1(int!null) i:2(int) s:3(string)
+ │    ├── stats: [rows=1000, distinct(1)=1000, null(1)=0]
+ │    ├── key: (1)
+ │    └── fd: (1)-->(2,3)
+ └── filters
+      └── (i:2 = $1) OR (s:3 = $2) [type=bool, outer=(2,3)]
@@ -307,6 +307,11 @@ define JoinPrivate {
     # SkipReorderJoins indicates whether the ReorderJoins rule should match this
     # join.
     SkipReorderJoins bool
+
+    # ParameterizedCols is the set of columns that are equivalent to placeholder
+    # values. These columns are typically created when exploring parameterized
+    # joins for generic query plans.
+    ParameterizedCols ColSet
 }
 
 # IndexJoin represents an inner join between an input expression and a primary
 
@@ -89,10 +89,16 @@ func (s *Selectivity) Divide(other Selectivity) {
 
 // MinSelectivity returns the smaller value of two selectivities.
 func MinSelectivity(a, b Selectivity) Selectivity {
-	if a.selectivity < b.selectivity {
-		return a
+	return Selectivity{
+		selectivity: min(a.selectivity, b.selectivity),
+	}
+}
+
+// MinSelectivity3 returns the smallest value of three selectivities.
+func MinSelectivity3(a, b, c Selectivity) Selectivity {
+	return Selectivity{
+		selectivity: min(a.selectivity, b.selectivity, c.selectivity),
 	}
-	return b
 }
 
 // MaxSelectivity returns the larger value of two selectivities.