 use std::collections::HashMap;
 use std::sync::Arc;

-use chrono::Utc;
+use databend_common_catalog::plan::PartitionsShuffleKind;
 use databend_common_catalog::table::TableExt;
 use databend_common_exception::Result;
 use databend_common_pipeline_core::processors::ProcessorPtr;
@@ -26,20 +26,19 @@ use databend_common_sql::plans::Plan;
 use databend_common_sql::BindContext;
 use databend_common_sql::Planner;
 use databend_common_storage::DEFAULT_HISTOGRAM_BUCKETS;
-use databend_common_storages_factory::NavigationPoint;
 use databend_common_storages_factory::Table;
 use databend_common_storages_fuse::operations::AnalyzeLightMutator;
 use databend_common_storages_fuse::operations::HistogramInfoSink;
+use databend_common_storages_fuse::FuseLazyPartInfo;
 use databend_common_storages_fuse::FuseTable;
+use databend_storages_common_cache::Partitions;
 use databend_storages_common_index::Index;
 use databend_storages_common_index::RangeIndex;
-use itertools::Itertools;
 use log::info;

 use crate::interpreters::Interpreter;
 use crate::pipelines::PipelineBuildResult;
 use crate::schedulers::build_query_pipeline;
-use crate::schedulers::build_query_pipeline_without_render_result_set;
 use crate::sessions::QueryContext;
 use crate::sessions::TableContext;

@@ -133,87 +132,23 @@ impl Interpreter for AnalyzeTableInterpreter {
             return Ok(PipelineBuildResult::create());
         }

-        let table_statistics = table
-            .read_table_snapshot_statistics(Some(&snapshot))
-            .await?;
-        if let Some(table_statistics) = &table_statistics {
-            if table_statistics.snapshot_id == snapshot.snapshot_id {
-                return Ok(PipelineBuildResult::create());
-            }
+        let mut parts = Vec::with_capacity(snapshot.segments.len());
+        for (idx, segment_location) in snapshot.segments.iter().enumerate() {
+            parts.push(FuseLazyPartInfo::create(idx, segment_location.clone()));
         }
+        self.ctx
+            .set_partitions(Partitions::create(PartitionsShuffleKind::Mod, parts))?;

-        // plan sql
-        let (is_full, temporal_str) = if let Some(table_statistics) = &table_statistics {
-            let is_full = match table
-                .navigate_to_point(
-                    &NavigationPoint::SnapshotID(table_statistics.snapshot_id.simple().to_string()),
-                    self.ctx.clone().get_abort_checker(),
-                )
-                .await
-            {
-                Ok(t) => !t
-                    .read_table_snapshot()
-                    .await
-                    .is_ok_and(|s| s.is_some_and(|s| s.prev_table_seq.is_some())),
-                Err(_) => true,
-            };
-
-            let temporal_str = if is_full {
-                format!("AT (snapshot => '{}')", snapshot.snapshot_id.simple())
-            } else {
-                // analyze only need to collect the added blocks.
-                let table_alias = format!("_change_insert${:08x}", Utc::now().timestamp());
-                format!(
-                    "CHANGES(INFORMATION => DEFAULT) AT (snapshot => '{}') END (snapshot => '{}') AS {table_alias}",
-                    table_statistics.snapshot_id.simple(),
-                    snapshot.snapshot_id.simple(),
-                )
-            };
-            (is_full, temporal_str)
-        } else {
-            (
-                true,
-                format!("AT (snapshot => '{}')", snapshot.snapshot_id.simple()),
-            )
-        };
-
+        let mut build_res = PipelineBuildResult::create();
+        // After profiling, computing histogram is heavy and the bottleneck is window function(90%).
+        // It's possible to OOM if the table is too large and spilling isn't enabled.
+        // We add a setting `enable_analyze_histogram` to control whether to compute histogram(default is closed).
+        let mut histogram_info_receivers = HashMap::new();
         let quote = self
             .ctx
             .get_settings()
             .get_sql_dialect()?
             .default_ident_quote();
-
-        // 0.01625 --> 12 buckets --> 4K size per column
-        // 1.04 / math.sqrt(1<<12) --> 0.01625
-        const DISTINCT_ERROR_RATE: f64 = 0.01625;
-        let ndv_select_expr = snapshot
-            .schema
-            .fields()
-            .iter()
-            .filter(|f| RangeIndex::supported_type(&f.data_type().into()))
-            .map(|f| {
-                format!(
-                    "approx_count_distinct_state({DISTINCT_ERROR_RATE})({quote}{}{quote}) as ndv_{}",
-                    f.name,
-                    f.column_id()
-                )
-            })
-            .join(", ");
-
-        let sql = format!(
-            "SELECT {ndv_select_expr}, {is_full} as is_full from {}.{} {temporal_str}",
-            plan.database, plan.table,
-        );
-
-        info!("Analyze via sql: {sql}");
-
-        let (physical_plan, bind_context) = self.plan_sql(sql, false).await?;
-        let mut build_res =
-            build_query_pipeline_without_render_result_set(&self.ctx, &physical_plan).await?;
-        // After profiling, computing histogram is heavy and the bottleneck is window function(90%).
-        // It's possible to OOM if the table is too large and spilling isn't enabled.
-        // We add a setting `enable_analyze_histogram` to control whether to compute histogram(default is closed).
-        let mut histogram_info_receivers = HashMap::new();
         if self.ctx.get_settings().get_enable_analyze_histogram()? {
             let histogram_sqls = table
                 .schema()
@@ -269,10 +204,7 @@ impl Interpreter for AnalyzeTableInterpreter {
         }
         FuseTable::do_analyze(
             self.ctx.clone(),
-            bind_context.output_schema(),
-            &self.plan.catalog,
-            &self.plan.database,
-            &self.plan.table,
+            table,
             snapshot.snapshot_id,
             &mut build_res.main_pipeline,
             histogram_info_receivers,
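
The added lines register one `FuseLazyPartInfo` per snapshot segment on the query context instead of planning a full NDV SQL query up front. As a rough illustration only, here is a minimal, self-contained sketch of that lazy-partition pattern; `LazyPart`, `PartSet`, and `Ctx` are stand-in types invented for the example, not the Databend APIs.

```rust
/// Stand-in for FuseLazyPartInfo: just the segment index and its location tuple.
#[derive(Debug, Clone)]
struct LazyPart {
    segment_index: usize,
    location: (String, u64), // (path, format version)
}

/// Stand-in for Partitions: a plain collection of lazy parts.
#[derive(Debug, Default)]
struct PartSet {
    parts: Vec<LazyPart>,
}

/// Stand-in for the query context, which keeps the registered partitions
/// so downstream pipeline stages can resolve the segments on demand.
#[derive(Debug, Default)]
struct Ctx {
    partitions: Option<PartSet>,
}

impl Ctx {
    fn set_partitions(&mut self, parts: PartSet) {
        self.partitions = Some(parts);
    }
}

fn main() {
    // Pretend these came from `snapshot.segments`; the paths are made up.
    let segments = vec![
        ("1/8/_sg/seg_a.mpk".to_string(), 4u64),
        ("1/8/_sg/seg_b.mpk".to_string(), 4u64),
    ];

    // Same shape as the added diff lines: one lazy part per segment, keyed by index.
    let mut parts = Vec::with_capacity(segments.len());
    for (idx, segment_location) in segments.iter().enumerate() {
        parts.push(LazyPart {
            segment_index: idx,
            location: segment_location.clone(),
        });
    }

    let mut ctx = Ctx::default();
    ctx.set_partitions(PartSet { parts });
    println!(
        "registered {} lazy parts",
        ctx.partitions.as_ref().map_or(0, |p| p.parts.len())
    );
}
```

Per the comments carried in the diff, histogram collection stays gated behind the `enable_analyze_histogram` setting, which the interpreter reads via `get_enable_analyze_histogram()` before building the histogram pipelines.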