Skip to content

Commit 6fb4dd8

Browse files
committed
WIP.
1 parent 8985d0c commit 6fb4dd8

File tree

24 files changed

+668
-27
lines changed

24 files changed

+668
-27
lines changed

internal/execution/queries/plan/access.go

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"github.com/efritz/gostgres/internal/execution/expressions"
66
"github.com/efritz/gostgres/internal/execution/queries/nodes"
77
"github.com/efritz/gostgres/internal/execution/queries/nodes/access"
8+
"github.com/efritz/gostgres/internal/execution/queries/plan/cost"
89
"github.com/efritz/gostgres/internal/shared/fields"
910
"github.com/efritz/gostgres/internal/shared/impls"
1011
)
@@ -59,8 +60,8 @@ func (n *logicalAccessNode) Optimize(ctx impls.OptimizationContext) {
5960
n.order = nil
6061
}
6162

62-
func (n *logicalAccessNode) EstimateCost() Cost {
63-
return Cost{} // TODO
63+
func (n *logicalAccessNode) EstimateCost() impls.NodeCost {
64+
return cost.ApplyFilterToCost(n.strategy.EstimateCost(), n.filter)
6465
}
6566

6667
func (n *logicalAccessNode) Filter() impls.Expression {
@@ -162,6 +163,17 @@ func (s *logicalTableAccessStrategy) Ordering() impls.OrderExpression {
162163
return nil
163164
}
164165

166+
var tableAccessCostPerRow = impls.ResourceCost{CPU: 0.01, IO: 0.1}
167+
168+
func (s *logicalTableAccessStrategy) EstimateCost() impls.NodeCost {
169+
stats := s.table.Statistics()
170+
171+
return impls.NodeCost{
172+
VariableCost: tableAccessCostPerRow.ScaleUniform(float64(stats.RowCount)),
173+
Statistics: stats,
174+
}
175+
}
176+
165177
func (s *logicalTableAccessStrategy) Build() nodes.AccessStrategy {
166178
return access.NewTableAccessStrategy(s.table)
167179
}
@@ -186,6 +198,28 @@ func (s *logicalIndexAccessStrategy[O]) Filter() impls.Expression {
186198
return expressions.UnionFilters(append(expressions.Conjunctions(filterExpression), expressions.Conjunctions(condition)...)...)
187199
}
188200

201+
var indexAccessCostPerRow = impls.ResourceCost{CPU: 0.01, IO: 0.1}
202+
203+
func (s *logicalIndexAccessStrategy[O]) EstimateCost() impls.NodeCost {
204+
tableStats := s.table.Statistics()
205+
indexStats := s.index.Statistics()
206+
207+
// TODO - remove this use
208+
selectivity := cost.EstimateFilterSelectivity(s.index.Condition(s.opts), impls.RelationStatistics{
209+
RowCount: indexStats.RowCount,
210+
ColumnStatistics: tableStats.ColumnStatistics,
211+
})
212+
estimatedRows := float64(indexStats.RowCount) * selectivity
213+
214+
return impls.NodeCost{
215+
VariableCost: indexAccessCostPerRow.ScaleUniform(estimatedRows),
216+
Statistics: impls.RelationStatistics{
217+
RowCount: int(estimatedRows),
218+
ColumnStatistics: tableStats.ColumnStatistics, // TODO - update based on filter, condition
219+
},
220+
}
221+
}
222+
189223
func (s *logicalIndexAccessStrategy[O]) Build() nodes.AccessStrategy {
190224
return access.NewIndexAccessStrategy(s.table, s.index, s.opts, s.Filter())
191225
}

internal/execution/queries/plan/analyze.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ func (n *logicalAnalyze) Fields() []fields.Field
2121
func (n *logicalAnalyze) AddFilter(ctx impls.OptimizationContext, filter impls.Expression) {} // top-level
2222
func (n *logicalAnalyze) AddOrder(ctx impls.OptimizationContext, order impls.OrderExpression) {} // top-level
2323
func (n *logicalAnalyze) Optimize(ctx impls.OptimizationContext) {}
24-
func (n *logicalAnalyze) EstimateCost() Cost { return Cost{} }
24+
func (n *logicalAnalyze) EstimateCost() impls.NodeCost { return impls.NodeCost{} }
2525
func (n *logicalAnalyze) Filter() impls.Expression { return nil }
2626
func (n *logicalAnalyze) Ordering() impls.OrderExpression { return nil }
2727
func (n *logicalAnalyze) SupportsMarkRestore() bool { return false }
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
package cost
2+
3+
import (
4+
"github.com/efritz/gostgres/internal/execution/expressions"
5+
"github.com/efritz/gostgres/internal/shared/impls"
6+
"github.com/efritz/gostgres/internal/shared/ordering"
7+
"github.com/efritz/gostgres/internal/shared/rows"
8+
)
9+
10+
const defaultSelectivity = 0.7 // TODO
11+
12+
func estimateExpressionSelectivity(cond impls.Expression, rowCount int, columnStatistics []impls.ColumnStatistics) (s float64) {
13+
// fmt.Printf("ESTIMATING %s -> \n", cond)
14+
// defer func() { fmt.Printf("ESTIMATING %s -> %.12f\n", cond, s) }()
15+
16+
if cond == nil {
17+
return 1.0
18+
}
19+
20+
if expressions.IsConjunction(cond) {
21+
selectivity := 1.0
22+
for _, conjunction := range expressions.Conjunctions(cond) {
23+
selectivity *= estimateExpressionSelectivity(conjunction, rowCount, columnStatistics)
24+
}
25+
26+
return selectivity
27+
}
28+
29+
if expressions.IsDisjunction(cond) {
30+
selectivity := 1.0
31+
for _, disjunction := range expressions.Disjunctions(cond) {
32+
selectivity *= 1 - estimateExpressionSelectivity(disjunction, rowCount, columnStatistics)
33+
}
34+
35+
return 1 - selectivity
36+
}
37+
38+
if inner, ok := expressions.IsNegation(cond); ok {
39+
return 1 - estimateExpressionSelectivity(inner, rowCount, columnStatistics)
40+
}
41+
42+
comparisonType, left, right := expressions.IsComparison(cond)
43+
if comparisonType == expressions.ComparisonTypeUnknown {
44+
return defaultSelectivity
45+
}
46+
47+
if _, ok := findStatisticsForColumn(columnStatistics, right); ok {
48+
left, right = right, left
49+
comparisonType = comparisonType.Flip()
50+
}
51+
52+
stats, ok := findStatisticsForColumn(columnStatistics, left)
53+
if !ok {
54+
return defaultSelectivity
55+
}
56+
57+
value, err := right.ValueFrom(impls.EmptyExecutionContext, rows.Row{})
58+
if err != nil {
59+
return defaultSelectivity
60+
}
61+
62+
switch comparisonType {
63+
case expressions.ComparisonTypeEquals:
64+
return estimateEqualitySelectivity(stats, rowCount, value)
65+
66+
case
67+
expressions.ComparisonTypeLessThan,
68+
expressions.ComparisonTypeLessThanEquals,
69+
expressions.ComparisonTypeGreaterThan,
70+
expressions.ComparisonTypeGreaterThanEquals:
71+
return estimateComparisonSelectivity(stats, rowCount, comparisonType, value)
72+
}
73+
74+
return defaultSelectivity
75+
}
76+
77+
func findStatisticsForColumn(columnStatistics []impls.ColumnStatistics, expr impls.Expression) (impls.ColumnStatistics, bool) {
78+
for _, columnStats := range columnStatistics {
79+
// TODO - better way to determine field
80+
if expr.Equal(expressions.NewNamed(columnStats.Field)) {
81+
return columnStats, true
82+
}
83+
}
84+
85+
return impls.ColumnStatistics{}, false
86+
}
87+
88+
func estimateEqualitySelectivity(columnStatistics impls.ColumnStatistics, rowCount int, value any) float64 {
89+
nonNullFraction := 1 - columnStatistics.NullFraction
90+
91+
for _, mcv := range columnStatistics.MostCommonValues {
92+
if ordering.CompareValues(mcv.Value, value) == ordering.OrderTypeEqual {
93+
return mcv.Frequency * nonNullFraction
94+
}
95+
}
96+
97+
remainingFrequency := 1.0
98+
for _, mcv := range columnStatistics.MostCommonValues {
99+
remainingFrequency -= mcv.Frequency
100+
}
101+
102+
distinctCount := columnStatistics.DistinctFraction * float64(rowCount)
103+
remainingDistinctCount := distinctCount - float64(len(columnStatistics.MostCommonValues))
104+
if remainingDistinctCount == 0 {
105+
return 0
106+
}
107+
108+
return (remainingFrequency / remainingDistinctCount) * nonNullFraction
109+
}
110+
111+
func estimateComparisonSelectivity(columnStatistics impls.ColumnStatistics, rowCount int, comparisonType expressions.ComparisonType, value any) float64 {
112+
bucketIndex := len(columnStatistics.HistogramBounds)
113+
for i, bound := range columnStatistics.HistogramBounds {
114+
if cmp := ordering.CompareValues(value, bound); cmp == ordering.OrderTypeBefore || cmp == ordering.OrderTypeEqual {
115+
bucketIndex = i
116+
break
117+
}
118+
}
119+
120+
cumulativeHistogramFrequency := float64(bucketIndex) / float64(len(columnStatistics.HistogramBounds))
121+
if comparisonType == expressions.ComparisonTypeGreaterThan || comparisonType == expressions.ComparisonTypeGreaterThanEquals {
122+
cumulativeHistogramFrequency = 1 - cumulativeHistogramFrequency
123+
}
124+
125+
mcvAdjustment := 0.0
126+
for _, mcv := range columnStatistics.MostCommonValues {
127+
if matchOrderWithComparisonType(mcv.Value, value, comparisonType) { // TODO - is this order correct?
128+
mcvAdjustment += mcv.Frequency
129+
}
130+
}
131+
132+
nonNullFraction := 1 - columnStatistics.NullFraction
133+
return (cumulativeHistogramFrequency + mcvAdjustment) * nonNullFraction
134+
}
135+
136+
func matchOrderWithComparisonType(left, right any, comparisonType expressions.ComparisonType) bool {
137+
switch ordering.CompareValues(left, right) {
138+
case ordering.OrderTypeBefore:
139+
return comparisonType == expressions.ComparisonTypeLessThan || comparisonType == expressions.ComparisonTypeLessThanEquals
140+
141+
case ordering.OrderTypeEqual:
142+
return comparisonType == expressions.ComparisonTypeLessThanEquals || comparisonType == expressions.ComparisonTypeGreaterThanEquals
143+
144+
case ordering.OrderTypeAfter:
145+
return comparisonType == expressions.ComparisonTypeGreaterThan || comparisonType == expressions.ComparisonTypeGreaterThanEquals
146+
}
147+
148+
return false
149+
}
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
package cost
2+
3+
import "github.com/efritz/gostgres/internal/shared/impls"
4+
5+
var (
6+
buildAggregateTableCostPerRow = impls.ResourceCost{CPU: 0.1}
7+
buildAggregateTableCostPerBucket = impls.ResourceCost{Memory: 0.1}
8+
)
9+
10+
func ApplyAggregationToCost(innerCost impls.NodeCost) impls.NodeCost {
11+
// TODO - will need additional information, group expressions, etc
12+
estimatedNumBuckets := EstimateDistinctCount(innerCost.Statistics.RowCount)
13+
14+
// Aggregation reads the entire inner relation on startup
15+
cost := MaterializeCost(innerCost)
16+
17+
// Add the cost of creating aggregation buckets and hashing all rows from the inner relation
18+
n := float64(innerCost.Statistics.RowCount)
19+
cost.FixedCost = cost.FixedCost.Add(buildAggregateTableCostPerRow.ScaleUniform(n))
20+
cost.FixedCost = cost.FixedCost.Add(buildAggregateTableCostPerBucket.ScaleUniform(float64(estimatedNumBuckets)))
21+
22+
// One row is projected for each bucket
23+
cost.Statistics.RowCount = estimatedNumBuckets
24+
cost = ApplyProjectionToCost(cost)
25+
26+
// TODO - update statistics
27+
return cost
28+
}
29+
30+
// EstimateDistinctCount estimates how many distinct groups estimatedRows rows
// fall into. The current placeholder assumes every row is its own group.
func EstimateDistinctCount(estimatedRows int) int {
	// TODO
	distinctCount := estimatedRows
	return distinctCount
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
package cost
2+
3+
import "github.com/efritz/gostgres/internal/shared/impls"
4+
5+
var filterEvaluationCostPerRow = impls.ResourceCost{CPU: 0.01}
6+
7+
func ApplyFilterToCost(innerCost impls.NodeCost, filter impls.Expression) impls.NodeCost {
8+
cost := innerCost
9+
10+
// Evaluate a filter expression for every row
11+
n := float64(innerCost.Statistics.RowCount)
12+
cost.VariableCost = cost.VariableCost.Add(filterEvaluationCostPerRow.ScaleUniform(n))
13+
14+
// Only rows selected by the filter are emitted
15+
selectivity := EstimateFilterSelectivity(filter, innerCost.Statistics)
16+
cost.Statistics.RowCount = int(n * selectivity)
17+
18+
// TODO - update statistics
19+
return cost
20+
}
21+
22+
// TODO - reduce uses outside of ApplyFilterToCost
23+
func EstimateFilterSelectivity(filter impls.Expression, statistics impls.RelationStatistics) float64 {
24+
// TODO - collapse
25+
return estimateExpressionSelectivity(filter, statistics.RowCount, statistics.ColumnStatistics)
26+
}
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
package cost
2+
3+
// coalesce returns the integer value points to, or defaultValue when value is
// nil.
func coalesce(value *int, defaultValue int) int {
	if value == nil {
		return defaultValue
	}

	return *value
}
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
package cost
2+
3+
import "github.com/efritz/gostgres/internal/shared/impls"
4+
5+
var (
6+
joinMergeCostPerRow = impls.ResourceCost{CPU: 0.2}
7+
joinFilterCostPerRow = impls.ResourceCost{CPU: 0.1}
8+
)
9+
10+
func EstimateNestedLoopJoinCost(
11+
leftCost impls.NodeCost,
12+
rightCost impls.NodeCost,
13+
joinSelectivity float64,
14+
hasCondition bool,
15+
) impls.NodeCost {
16+
estimatedLeftRows := float64(leftCost.Statistics.RowCount)
17+
estimatedRightRows := float64(rightCost.Statistics.RowCount)
18+
estimatedCandidateRows := estimatedLeftRows * estimatedRightRows
19+
estimatedResultRows := estimatedCandidateRows * joinSelectivity
20+
21+
costPerCandidateRow := joinMergeCostPerRow
22+
if hasCondition {
23+
costPerCandidateRow = costPerCandidateRow.Add(joinFilterCostPerRow)
24+
}
25+
26+
// On startup, we only initialize the left scanner
27+
fixedCost := leftCost.FixedCost
28+
29+
variableCost := impls.SumCosts(
30+
leftCost.VariableCost.ScaleUniform(estimatedLeftRows), // Cost to scan each row from left relation
31+
rightCost.FixedCost.ScaleUniform(estimatedLeftRows), // Cost to re-initialized right scanner for every row from left relation
32+
rightCost.VariableCost.ScaleUniform(estimatedCandidateRows), // Cost to scan each row from right relation
33+
costPerCandidateRow.ScaleUniform(estimatedCandidateRows), // Cost to merge row pairs and evaluate the join condition
34+
)
35+
36+
// TODO - update statistics
37+
return impls.NodeCost{
38+
FixedCost: fixedCost,
39+
VariableCost: variableCost,
40+
Statistics: impls.RelationStatistics{
41+
RowCount: int(estimatedResultRows),
42+
ColumnStatistics: nil, // TODO - implement
43+
},
44+
}
45+
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
package cost
2+
3+
import "github.com/efritz/gostgres/internal/shared/impls"
4+
5+
func AlterCostByLimitOffset(innerCost impls.NodeCost, limit, offset *int) impls.NodeCost {
6+
if limit == nil && offset == nil {
7+
return innerCost
8+
}
9+
10+
cost := innerCost
11+
12+
// Scale the variable cost of the inner relation by reading only limit + offset rows
13+
o := coalesce(offset, 0)
14+
l := coalesce(limit, innerCost.Statistics.RowCount-o)
15+
cost.VariableCost = cost.VariableCost.ScaleUniform(float64(l+o) / float64(innerCost.Statistics.RowCount))
16+
17+
// Adjust number of output rows; this may be less than limit + offset, so we may end up
18+
// "smearing" the variable cost of the inner relation over fewer rows of the outer relation.
19+
cost.Statistics.RowCount = l
20+
21+
// TODO - update statistics
22+
return cost
23+
}
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
package cost
2+
3+
import "github.com/efritz/gostgres/internal/shared/impls"
4+
5+
func MaterializeCost(innerCost impls.NodeCost) impls.NodeCost {
6+
cost := innerCost
7+
8+
// We pay the variable cost of the inner relation at startup
9+
cost.FixedCost = cost.FixedCost.Add(innerCost.VariableCost)
10+
11+
// Reset the varaible cost so we're not double counting it
12+
cost.VariableCost = impls.ResourceCost{}
13+
14+
return cost
15+
}

0 commit comments

Comments
 (0)