Skip to content

Commit 5cd121f

Browse files
Dousir9everpcpc
andauthored
chore(backport): improve histogram cardinality estimation (#17200) (#17210)
chore(planner): improve histogram cardinality estimation (#17200) * chore(planner): improve histogram cardinality estimation * chore(test): update sqllogictest * chore(test): add sqllogictest * chore(test): update sqllogictest * chore(test): add fuzz test * chore(test): add physical plan info * chore(test): update fuzz test * chore(test): refine test Co-authored-by: everpcpc <[email protected]>
1 parent 8faad84 commit 5cd121f

File tree

19 files changed

+2135
-88
lines changed

19 files changed

+2135
-88
lines changed

src/common/storage/src/statistics.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,14 @@ impl Datum {
5959
matches!(self, Datum::Bytes(_))
6060
}
6161

62+
pub fn to_float(self) -> Self {
63+
match self {
64+
Datum::Int(v) => Datum::Float(F64::from(v as f64)),
65+
Datum::UInt(v) => Datum::Float(F64::from(v as f64)),
66+
_ => self,
67+
}
68+
}
69+
6270
pub fn to_double(&self) -> Result<f64> {
6371
match self {
6472
Datum::Bool(v) => Ok(*v as u8 as f64),

src/query/service/src/interpreters/interpreter_explain.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ impl Interpreter for ExplainInterpreter {
147147
self.ctx.clone(),
148148
*s_expr.clone(),
149149
schema.clone(),
150+
metadata.clone(),
150151
)?;
151152
let plan = interpreter.build_physical_plan(&mutation, None).await?;
152153
self.explain_physical_plan(&plan, metadata, &None).await?
@@ -529,7 +530,12 @@ impl ExplainInterpreter {
529530
schema: DataSchemaRef,
530531
) -> Result<Vec<DataBlock>> {
531532
let mutation: Mutation = s_expr.plan().clone().try_into()?;
532-
let interpreter = MutationInterpreter::try_create(self.ctx.clone(), s_expr, schema)?;
533+
let interpreter = MutationInterpreter::try_create(
534+
self.ctx.clone(),
535+
s_expr,
536+
schema,
537+
mutation.metadata.clone(),
538+
)?;
533539
let plan = interpreter.build_physical_plan(&mutation, None).await?;
534540
let root_fragment = Fragmenter::try_create(self.ctx.clone())?.build_fragment(&plan)?;
535541

src/query/service/src/interpreters/interpreter_factory.rs

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ use databend_common_catalog::lock::LockTableOption;
1919
use databend_common_exception::ErrorCode;
2020
use databend_common_exception::Result;
2121
use databend_common_sql::binder::ExplainConfig;
22+
use databend_common_sql::plans::Mutation;
2223
use log::error;
2324

2425
use super::interpreter_catalog_create::CreateCatalogInterpreter;
@@ -384,9 +385,15 @@ impl InterpreterFactory {
384385
Plan::Insert(insert) => InsertInterpreter::try_create(ctx, *insert.clone()),
385386

386387
Plan::Replace(replace) => ReplaceInterpreter::try_create(ctx, *replace.clone()),
387-
Plan::DataMutation { s_expr, schema, .. } => Ok(Arc::new(
388-
MutationInterpreter::try_create(ctx, *s_expr.clone(), schema.clone())?,
389-
)),
388+
Plan::DataMutation { s_expr, schema, .. } => {
389+
let mutation: Mutation = s_expr.plan().clone().try_into()?;
390+
Ok(Arc::new(MutationInterpreter::try_create(
391+
ctx,
392+
*s_expr.clone(),
393+
schema.clone(),
394+
mutation.metadata.clone(),
395+
)?))
396+
}
390397

391398
// Roles
392399
Plan::CreateRole(create_role) => Ok(Arc::new(CreateRoleInterpreter::try_create(

src/query/service/src/interpreters/interpreter_mutation.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ use databend_common_sql::executor::MutationBuildInfo;
3636
use databend_common_sql::executor::PhysicalPlan;
3737
use databend_common_sql::executor::PhysicalPlanBuilder;
3838
use databend_common_sql::optimizer::SExpr;
39+
use databend_common_sql::planner::MetadataRef;
3940
use databend_common_sql::plans;
4041
use databend_common_sql::plans::Mutation;
4142
use databend_common_storage::MutationStatus;
@@ -44,6 +45,7 @@ use databend_common_storages_fuse::operations::TruncateMode;
4445
use databend_common_storages_fuse::FuseTable;
4546
use databend_common_storages_fuse::TableContext;
4647
use databend_storages_common_table_meta::meta::TableSnapshot;
48+
use log::info;
4749

4850
use crate::interpreters::common::check_deduplicate_label;
4951
use crate::interpreters::common::dml_build_update_stream_req;
@@ -58,18 +60,21 @@ pub struct MutationInterpreter {
5860
ctx: Arc<QueryContext>,
5961
s_expr: SExpr,
6062
schema: DataSchemaRef,
63+
metadata: MetadataRef,
6164
}
6265

6366
impl MutationInterpreter {
6467
pub fn try_create(
6568
ctx: Arc<QueryContext>,
6669
s_expr: SExpr,
6770
schema: DataSchemaRef,
71+
metadata: MetadataRef,
6872
) -> Result<MutationInterpreter> {
6973
Ok(MutationInterpreter {
7074
ctx,
7175
s_expr,
7276
schema,
77+
metadata,
7378
})
7479
}
7580
}
@@ -129,6 +134,12 @@ impl Interpreter for MutationInterpreter {
129134
.build_physical_plan(&mutation, Some(mutation_build_info))
130135
.await?;
131136

137+
let query_plan = physical_plan
138+
.format(self.metadata.clone(), Default::default())?
139+
.format_pretty()?;
140+
141+
info!("Query physical plan: \n{}", query_plan);
142+
132143
// Build pipeline.
133144
let mut build_res =
134145
build_query_pipeline_without_render_result_set(&self.ctx, &physical_plan).await?;

src/query/service/src/interpreters/interpreter_table_add_column.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ use databend_common_meta_types::MatchSeq;
2727
use databend_common_sql::field_default_value;
2828
use databend_common_sql::plans::AddColumnOption;
2929
use databend_common_sql::plans::AddTableColumnPlan;
30+
use databend_common_sql::plans::Mutation;
3031
use databend_common_sql::plans::Plan;
3132
use databend_common_sql::Planner;
3233
use databend_common_storages_fuse::FuseTable;
@@ -135,8 +136,13 @@ impl Interpreter for AddTableColumnInterpreter {
135136
let mut planner = Planner::new(self.ctx.clone());
136137
let (plan, _) = planner.plan_sql(&query).await?;
137138
if let Plan::DataMutation { s_expr, schema, .. } = plan {
138-
let interpreter =
139-
MutationInterpreter::try_create(self.ctx.clone(), *s_expr, schema)?;
139+
let mutation: Mutation = s_expr.plan().clone().try_into()?;
140+
let interpreter = MutationInterpreter::try_create(
141+
self.ctx.clone(),
142+
*s_expr,
143+
schema,
144+
mutation.metadata.clone(),
145+
)?;
140146
let _ = interpreter.execute(self.ctx.clone()).await?;
141147
return Ok(PipelineBuildResult::create());
142148
}

src/query/service/src/test_kits/fuse.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ use databend_common_expression::DataSchemaRef;
2727
use databend_common_expression::ScalarRef;
2828
use databend_common_expression::SendableDataBlockStream;
2929
use databend_common_sql::optimizer::SExpr;
30+
use databend_common_sql::plans::Mutation;
3031
use databend_common_storages_factory::Table;
3132
use databend_common_storages_fuse::io::MetaWriter;
3233
use databend_common_storages_fuse::statistics::gen_columns_statistics;
@@ -286,7 +287,9 @@ pub async fn do_mutation(
286287
s_expr: SExpr,
287288
schema: DataSchemaRef,
288289
) -> Result<()> {
289-
let interpreter = MutationInterpreter::try_create(ctx.clone(), s_expr, schema)?;
290+
let mutation: Mutation = s_expr.plan().clone().try_into()?;
291+
let interpreter =
292+
MutationInterpreter::try_create(ctx.clone(), s_expr, schema, mutation.metadata.clone())?;
290293
let _ = interpreter.execute(ctx).await?;
291294
Ok(())
292295
}

src/query/sql/src/planner/optimizer/property/histogram.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,15 @@
1414

1515
use std::cmp::Ordering;
1616

17+
use databend_common_base::base::OrderedFloat;
1718
use databend_common_exception::Result;
1819
use databend_common_expression::arithmetics_type::ResultTypeOfUnary;
1920
use databend_common_storage::Datum;
2021
use databend_common_storage::Histogram;
2122
use databend_common_storage::HistogramBucket;
2223

24+
pub type F64 = OrderedFloat<f64>;
25+
2326
/// Construct a histogram from NDV and total number of rows.
2427
///
2528
/// # Arguments
@@ -59,7 +62,7 @@ pub fn histogram_from_ndv(
5962
}
6063

6164
let (min, max) = match bound {
62-
Some((min, max)) => (min, max),
65+
Some((min, max)) => (min.to_float(), max.to_float()),
6366
None => {
6467
return Err(format!(
6568
"Must have min and max value when NDV is greater than 0, got NDV: {}",
@@ -182,7 +185,7 @@ impl SampleSet for UniformSampleSet {
182185

183186
(Datum::Float(min), Datum::Float(max)) => {
184187
let min = *min;
185-
let max = *max;
188+
let max = (*max).checked_add(F64::from(1.0)).ok_or("overflowed")?;
186189
// TODO(xudong): better histogram computation.
187190
let bucket_range = max.checked_sub(min).ok_or("overflowed")? / num_buckets as f64;
188191
let upper_bound = min + bucket_range * bucket_index as f64;

tests/sqllogictests/suites/mode/cluster/memo/aggregate_property.test

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -63,22 +63,22 @@ Memo
6363
│ └── #0 Join [#0, #3]
6464
├── Group #5
6565
│ ├── Best properties
66-
│ │ ├── { dist: Any }: expr: #0, cost: 4419.000, children: [{ dist: Any }]
67-
│ │ └── { dist: Serial }: expr: #1, cost: 7569.000, children: [{ dist: Any }]
66+
│ │ ├── { dist: Any }: expr: #0, cost: 4420.000, children: [{ dist: Any }]
67+
│ │ └── { dist: Serial }: expr: #1, cost: 7920.000, children: [{ dist: Any }]
6868
│ ├── #0 EvalScalar [#4]
6969
│ └── #1 Exchange: (Merge) [#5]
7070
├── Group #6
7171
│ ├── Best properties
72-
│ │ └── { dist: Serial }: expr: #0, cost: 7614.000, children: [{ dist: Serial }]
72+
│ │ └── { dist: Serial }: expr: #0, cost: 7970.000, children: [{ dist: Serial }]
7373
│ ├── #0 Aggregate [#5]
7474
│ └── #1 Exchange: (Merge) [#6]
7575
├── Group #7
7676
│ ├── Best properties
77-
│ │ └── { dist: Any }: expr: #0, cost: 7619.000, children: [{ dist: Serial }]
77+
│ │ └── { dist: Any }: expr: #0, cost: 7975.000, children: [{ dist: Serial }]
7878
│ └── #0 Aggregate [#6]
7979
└── Group #8
8080
├── Best properties
81-
│ └── { dist: Serial }: expr: #0, cost: 7620.000, children: [{ dist: Any }]
81+
│ └── { dist: Serial }: expr: #0, cost: 7976.000, children: [{ dist: Any }]
8282
└── #0 EvalScalar [#7]
8383

8484
query T
@@ -126,22 +126,22 @@ Memo
126126
│ └── #0 Join [#0, #3]
127127
├── Group #5
128128
│ ├── Best properties
129-
│ │ ├── { dist: Any }: expr: #0, cost: 4419.000, children: [{ dist: Any }]
130-
│ │ └── { dist: Hash(t_10.a (#0)::Int32 NULL) }: expr: #1, cost: 4878.000, children: [{ dist: Any }]
129+
│ │ ├── { dist: Any }: expr: #0, cost: 4420.000, children: [{ dist: Any }]
130+
│ │ └── { dist: Hash(t_10.a (#0)::Int32 NULL) }: expr: #1, cost: 4930.000, children: [{ dist: Any }]
131131
│ ├── #0 EvalScalar [#4]
132132
│ └── #1 Exchange: (Hash(t_10.a (#0)::Int32 NULL)) [#5]
133133
├── Group #6
134134
│ ├── Best properties
135-
│ │ └── { dist: Any }: expr: #0, cost: 4923.000, children: [{ dist: Hash(t_10.a (#0)::Int32 NULL) }]
135+
│ │ └── { dist: Any }: expr: #0, cost: 4980.000, children: [{ dist: Hash(t_10.a (#0)::Int32 NULL) }]
136136
│ └── #0 Aggregate [#5]
137137
├── Group #7
138138
│ ├── Best properties
139-
│ │ └── { dist: Any }: expr: #0, cost: 4968.000, children: [{ dist: Any }]
139+
│ │ └── { dist: Any }: expr: #0, cost: 5030.000, children: [{ dist: Any }]
140140
│ └── #0 Aggregate [#6]
141141
├── Group #8
142142
│ ├── Best properties
143-
│ │ ├── { dist: Any }: expr: #0, cost: 4977.000, children: [{ dist: Any }]
144-
│ │ └── { dist: Serial }: expr: #4, cost: 8127.000, children: [{ dist: Any }]
143+
│ │ ├── { dist: Any }: expr: #0, cost: 5040.000, children: [{ dist: Any }]
144+
│ │ └── { dist: Serial }: expr: #4, cost: 8540.000, children: [{ dist: Any }]
145145
│ ├── #0 EvalScalar [#7]
146146
│ ├── #1 EvalScalar [#14]
147147
│ ├── #2 EvalScalar [#20]
@@ -166,16 +166,16 @@ Memo
166166
├── Group #12
167167
│ ├── Best properties
168168
│ │ ├── { dist: Any }: expr: #0, cost: 66410.000, children: [{ dist: Any }, { dist: Broadcast }]
169-
│ │ └── { dist: Hash(t_10.a (#0)::Int32 NULL) }: expr: #1, cost: 66869.000, children: [{ dist: Any }]
169+
│ │ └── { dist: Hash(t_10.a (#0)::Int32 NULL) }: expr: #1, cost: 66920.000, children: [{ dist: Any }]
170170
│ ├── #0 Join [#11, #3]
171171
│ └── #1 Exchange: (Hash(t_10.a (#0)::Int32 NULL)) [#12]
172172
├── Group #13
173173
│ ├── Best properties
174-
│ │ └── { dist: Any }: expr: #0, cost: 66914.000, children: [{ dist: Hash(t_10.a (#0)::Int32 NULL) }]
174+
│ │ └── { dist: Any }: expr: #0, cost: 66970.000, children: [{ dist: Hash(t_10.a (#0)::Int32 NULL) }]
175175
│ └── #0 Aggregate [#12]
176176
├── Group #14
177177
│ ├── Best properties
178-
│ │ └── { dist: Any }: expr: #0, cost: 66959.000, children: [{ dist: Any }]
178+
│ │ └── { dist: Any }: expr: #0, cost: 67020.000, children: [{ dist: Any }]
179179
│ └── #0 Aggregate [#13]
180180
├── Group #15
181181
│ ├── Best properties
@@ -197,35 +197,35 @@ Memo
197197
│ └── #0 Join [#0, #16]
198198
├── Group #18
199199
│ ├── Best properties
200-
│ │ ├── { dist: Any }: expr: #0, cost: 5029.000, children: [{ dist: Any }]
201-
│ │ └── { dist: Hash(t_10.a (#0)::Int32 NULL) }: expr: #1, cost: 5488.000, children: [{ dist: Any }]
200+
│ │ ├── { dist: Any }: expr: #0, cost: 5030.000, children: [{ dist: Any }]
201+
│ │ └── { dist: Hash(t_10.a (#0)::Int32 NULL) }: expr: #1, cost: 5540.000, children: [{ dist: Any }]
202202
│ ├── #0 EvalScalar [#17]
203203
│ └── #1 Exchange: (Hash(t_10.a (#0)::Int32 NULL)) [#18]
204204
├── Group #19
205205
│ ├── Best properties
206-
│ │ └── { dist: Any }: expr: #0, cost: 5533.000, children: [{ dist: Hash(t_10.a (#0)::Int32 NULL) }]
206+
│ │ └── { dist: Any }: expr: #0, cost: 5590.000, children: [{ dist: Hash(t_10.a (#0)::Int32 NULL) }]
207207
│ └── #0 Aggregate [#18]
208208
├── Group #20
209209
│ ├── Best properties
210-
│ │ └── { dist: Any }: expr: #0, cost: 5578.000, children: [{ dist: Any }]
210+
│ │ └── { dist: Any }: expr: #0, cost: 5640.000, children: [{ dist: Any }]
211211
│ └── #0 Aggregate [#19]
212212
├── Group #21
213213
│ ├── Best properties
214214
│ │ └── { dist: Any }: expr: #0, cost: 67020.000, children: [{ dist: Any }, { dist: Broadcast }]
215215
│ └── #0 Join [#11, #16]
216216
├── Group #22
217217
│ ├── Best properties
218-
│ │ ├── { dist: Any }: expr: #0, cost: 67029.000, children: [{ dist: Any }]
219-
│ │ └── { dist: Hash(t_10.a (#0)::Int32 NULL) }: expr: #1, cost: 67488.000, children: [{ dist: Any }]
218+
│ │ ├── { dist: Any }: expr: #0, cost: 67030.000, children: [{ dist: Any }]
219+
│ │ └── { dist: Hash(t_10.a (#0)::Int32 NULL) }: expr: #1, cost: 67540.000, children: [{ dist: Any }]
220220
│ ├── #0 EvalScalar [#21]
221221
│ └── #1 Exchange: (Hash(t_10.a (#0)::Int32 NULL)) [#22]
222222
├── Group #23
223223
│ ├── Best properties
224-
│ │ └── { dist: Any }: expr: #0, cost: 67533.000, children: [{ dist: Hash(t_10.a (#0)::Int32 NULL) }]
224+
│ │ └── { dist: Any }: expr: #0, cost: 67590.000, children: [{ dist: Hash(t_10.a (#0)::Int32 NULL) }]
225225
│ └── #0 Aggregate [#22]
226226
└── Group #24
227227
├── Best properties
228-
│ └── { dist: Any }: expr: #0, cost: 67578.000, children: [{ dist: Any }]
228+
│ └── { dist: Any }: expr: #0, cost: 67640.000, children: [{ dist: Any }]
229229
└── #0 Aggregate [#23]
230230

231231

tests/sqllogictests/suites/mode/cluster/memo/join_property.test

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,8 @@ Memo
6262
│ └── #0 Join [#0, #3]
6363
└── Group #5
6464
├── Best properties
65-
│ ├── { dist: Any }: expr: #0, cost: 4419.000, children: [{ dist: Any }]
66-
│ └── { dist: Serial }: expr: #1, cost: 7569.000, children: [{ dist: Any }]
65+
│ ├── { dist: Any }: expr: #0, cost: 4420.000, children: [{ dist: Any }]
66+
│ └── { dist: Serial }: expr: #1, cost: 7920.000, children: [{ dist: Any }]
6767
├── #0 EvalScalar [#4]
6868
└── #1 Exchange: (Merge) [#5]
6969

@@ -192,8 +192,8 @@ Memo
192192
│ └── #0 Join [#2, #3]
193193
└── Group #5
194194
├── Best properties
195-
│ ├── { dist: Any }: expr: #0, cost: 112911.000, children: [{ dist: Any }]
196-
│ └── { dist: Serial }: expr: #1, cost: 494761.000, children: [{ dist: Any }]
195+
│ ├── { dist: Any }: expr: #0, cost: 112910.000, children: [{ dist: Any }]
196+
│ └── { dist: Serial }: expr: #1, cost: 494410.000, children: [{ dist: Any }]
197197
├── #0 EvalScalar [#4]
198198
└── #1 Exchange: (Merge) [#5]
199199

0 commit comments

Comments
 (0)