Skip to content

Commit e0b608d

Browse files
authored
refactor(query): Refactor and optimize virtual column refresh (#17950)
* fix(query): fix refresh virtual column * fix * add table lock
1 parent b015bd9 commit e0b608d

File tree

14 files changed

+197
-94
lines changed

14 files changed

+197
-94
lines changed

Cargo.lock

Lines changed: 0 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/common/exception/src/exception_code.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -271,8 +271,8 @@ build_exceptions! {
271271
LicenceDenied(1112),
272272
/// Unknown datamask
273273
UnknownDatamask(1113),
274-
/// Virtual column not found
275-
VirtualColumnNotFound(1115),
274+
/// Virtual column error
275+
VirtualColumnError(1115),
276276
/// Virtual column already exists
277277
VirtualColumnAlreadyExists(1116),
278278
/// Column referenced by computed column

src/query/ee/src/storages/fuse/operations/virtual_columns.rs

Lines changed: 87 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15+
use std::collections::HashMap;
1516
use std::collections::VecDeque;
1617
use std::sync::Arc;
1718
use std::time::Instant;
@@ -21,8 +22,10 @@ use databend_common_catalog::table::Table;
2122
use databend_common_catalog::table_context::TableContext;
2223
use databend_common_exception::Result;
2324
use databend_common_expression::BlockMetaInfoDowncast;
25+
use databend_common_expression::ComputedExpr;
2426
use databend_common_expression::DataBlock;
2527
use databend_common_expression::TableDataType;
28+
use databend_common_expression::TableSchema;
2629
use databend_common_metrics::storage::metrics_inc_block_virtual_column_write_bytes;
2730
use databend_common_metrics::storage::metrics_inc_block_virtual_column_write_milliseconds;
2831
use databend_common_metrics::storage::metrics_inc_block_virtual_column_write_nums;
@@ -31,18 +34,26 @@ use databend_common_pipeline_sources::AsyncSource;
3134
use databend_common_pipeline_sources::AsyncSourcer;
3235
use databend_common_pipeline_transforms::processors::AsyncTransform;
3336
use databend_common_pipeline_transforms::processors::TransformPipelineHelper;
37+
use databend_common_sql::executor::physical_plans::MutationKind;
3438
use databend_common_storages_fuse::io::write_data;
3539
use databend_common_storages_fuse::io::BlockReader;
3640
use databend_common_storages_fuse::io::MetaReaders;
3741
use databend_common_storages_fuse::io::VirtualColumnBuilder;
3842
use databend_common_storages_fuse::io::WriteSettings;
43+
use databend_common_storages_fuse::operations::BlockMetaIndex;
44+
use databend_common_storages_fuse::operations::CommitSink;
45+
use databend_common_storages_fuse::operations::MutationGenerator;
46+
use databend_common_storages_fuse::operations::MutationLogEntry;
47+
use databend_common_storages_fuse::operations::MutationLogs;
48+
use databend_common_storages_fuse::operations::TableMutationAggregator;
3949
use databend_common_storages_fuse::FuseStorageFormat;
4050
use databend_common_storages_fuse::FuseTable;
4151
use databend_storages_common_cache::LoadParams;
4252
use databend_storages_common_io::ReadSettings;
4353
use databend_storages_common_table_meta::meta::BlockMeta;
4454
use databend_storages_common_table_meta::meta::ExtendedBlockMeta;
4555
use databend_storages_common_table_meta::meta::Location;
56+
use databend_storages_common_table_meta::meta::Statistics;
4657
use opendal::Operator;
4758

4859
// The big picture of refresh virtual column into pipeline:
@@ -63,38 +74,33 @@ use opendal::Operator;
6374
pub async fn do_refresh_virtual_column(
6475
ctx: Arc<dyn TableContext>,
6576
fuse_table: &FuseTable,
66-
segment_locs: Option<Vec<Location>>,
6777
pipeline: &mut Pipeline,
6878
) -> Result<()> {
69-
let snapshot_opt = fuse_table.read_table_snapshot().await?;
70-
let snapshot = if let Some(val) = snapshot_opt {
71-
val
72-
} else {
79+
let Some(snapshot) = fuse_table.read_table_snapshot().await? else {
7380
// no snapshot
7481
return Ok(());
7582
};
76-
7783
let table_schema = &fuse_table.get_table_info().meta.schema;
7884

7985
// Collect source fields used by virtual columns.
86+
let mut fields = Vec::new();
8087
let mut field_indices = Vec::new();
8188
for (i, f) in table_schema.fields().iter().enumerate() {
82-
if f.data_type().remove_nullable() != TableDataType::Variant {
89+
if f.data_type().remove_nullable() != TableDataType::Variant
90+
|| matches!(f.computed_expr(), Some(ComputedExpr::Virtual(_)))
91+
{
8392
continue;
8493
}
94+
fields.push(f.clone());
8595
field_indices.push(i);
8696
}
8797

88-
if field_indices.is_empty() {
89-
// no source variant column
90-
return Ok(());
91-
}
92-
93-
let table_info = &fuse_table.get_table_info();
94-
let Some(virtual_column_builder) = VirtualColumnBuilder::try_create(ctx.clone(), table_info)
95-
else {
96-
return Ok(());
97-
};
98+
let source_schema = Arc::new(TableSchema {
99+
fields,
100+
..fuse_table.schema().as_ref().clone()
101+
});
102+
let virtual_column_builder =
103+
VirtualColumnBuilder::try_create(ctx.clone(), fuse_table, source_schema)?;
98104

99105
let projection = Projection::Columns(field_indices);
100106
let block_reader =
@@ -108,16 +114,11 @@ pub async fn do_refresh_virtual_column(
108114

109115
let operator = fuse_table.get_operator_ref();
110116

111-
// If no segment locations are specified, iterates through all segments
112-
let segment_locs = if let Some(segment_locs) = segment_locs {
113-
segment_locs
114-
} else {
115-
snapshot.segments.clone()
116-
};
117-
118-
// Read source variant columns and extract inner fields as virtual columns.
117+
// Iterate through all segments and collect the blocks that don't have virtual block meta.
118+
let segment_locs = snapshot.segments.clone();
119119
let mut block_metas = VecDeque::new();
120-
for (location, ver) in segment_locs {
120+
let mut block_meta_index_map = HashMap::new();
121+
for (segment_idx, (location, ver)) in segment_locs.into_iter().enumerate() {
121122
let segment_info = segment_reader
122123
.read(&LoadParams {
123124
location: location.to_string(),
@@ -127,10 +128,15 @@ pub async fn do_refresh_virtual_column(
127128
})
128129
.await?;
129130

130-
for block_meta in segment_info.block_metas()? {
131+
for (block_idx, block_meta) in segment_info.block_metas()?.into_iter().enumerate() {
131132
if block_meta.virtual_block_meta.is_some() {
132133
continue;
133134
}
135+
let index = BlockMetaIndex {
136+
segment_idx,
137+
block_idx,
138+
};
139+
block_meta_index_map.insert(block_meta.location.clone(), index);
134140
block_metas.push_back(block_meta);
135141
}
136142
}
@@ -139,6 +145,7 @@ pub async fn do_refresh_virtual_column(
139145
return Ok(());
140146
}
141147

148+
// Read source blocks.
142149
let settings = ReadSettings::from_ctx(&ctx)?;
143150
pipeline.add_source(
144151
|output| {
@@ -153,31 +160,52 @@ pub async fn do_refresh_virtual_column(
153160
1,
154161
)?;
155162

163+
// Extract inner fields as virtual columns and write virtual block data.
156164
let block_nums = block_metas.len();
157165
let max_threads = ctx.get_settings().get_max_threads()? as usize;
158166
let max_threads = std::cmp::min(block_nums, max_threads);
159167
pipeline.try_resize(max_threads)?;
160168
pipeline.add_async_transformer(|| {
161169
VirtualColumnTransform::new(
170+
operator.clone(),
162171
write_settings.clone(),
172+
block_meta_index_map.clone(),
163173
virtual_column_builder.clone(),
164-
operator.clone(),
165174
)
166175
});
167176

168-
let base_snapshot = fuse_table.read_table_snapshot().await?;
169-
let table_meta_timestamps = ctx.get_table_meta_timestamps(fuse_table, base_snapshot.clone())?;
170-
171-
fuse_table.do_commit(
172-
ctx,
173-
pipeline,
174-
None,
175-
vec![],
176-
false,
177-
None,
178-
None,
179-
table_meta_timestamps,
180-
)?;
177+
pipeline.try_resize(1)?;
178+
let table_meta_timestamps =
179+
ctx.get_table_meta_timestamps(fuse_table, Some(snapshot.clone()))?;
180+
pipeline.add_async_accumulating_transformer(|| {
181+
TableMutationAggregator::create(
182+
fuse_table,
183+
ctx.clone(),
184+
vec![],
185+
vec![],
186+
vec![],
187+
Statistics::default(),
188+
MutationKind::Update,
189+
table_meta_timestamps,
190+
)
191+
});
192+
193+
let prev_snapshot_id = snapshot.snapshot_id;
194+
let snapshot_gen = MutationGenerator::new(Some(snapshot), MutationKind::Update);
195+
pipeline.add_sink(|input| {
196+
CommitSink::try_create(
197+
fuse_table,
198+
ctx.clone(),
199+
None,
200+
vec![],
201+
snapshot_gen.clone(),
202+
input,
203+
None,
204+
Some(prev_snapshot_id),
205+
None,
206+
table_meta_timestamps,
207+
)
208+
})?;
181209

182210
Ok(())
183211
}
@@ -238,21 +266,24 @@ impl AsyncSource for VirtualColumnSource {
238266

239267
/// `VirtualColumnTransform` is used to generate virtual columns for each block.
240268
pub struct VirtualColumnTransform {
269+
operator: Operator,
241270
write_settings: WriteSettings,
271+
block_meta_index_map: HashMap<Location, BlockMetaIndex>,
242272
virtual_column_builder: VirtualColumnBuilder,
243-
operator: Operator,
244273
}
245274

246275
impl VirtualColumnTransform {
247276
pub fn new(
277+
operator: Operator,
248278
write_settings: WriteSettings,
279+
block_meta_index_map: HashMap<Location, BlockMetaIndex>,
249280
virtual_column_builder: VirtualColumnBuilder,
250-
operator: Operator,
251281
) -> Self {
252282
Self {
283+
operator,
253284
write_settings,
285+
block_meta_index_map,
254286
virtual_column_builder,
255-
operator,
256287
}
257288
}
258289
}
@@ -301,13 +332,23 @@ impl AsyncTransform for VirtualColumnTransform {
301332
}
302333
}
303334

335+
let block_meta_index = self
336+
.block_meta_index_map
337+
.remove(&block_meta.location)
338+
.unwrap();
304339
let extended_block_meta = ExtendedBlockMeta {
305340
block_meta: block_meta.clone(),
306341
draft_virtual_block_meta: Some(virtual_column_state.draft_virtual_block_meta),
307342
};
308343

309-
let new_block = DataBlock::new(vec![], 0);
310-
let new_block = new_block.add_meta(Some(extended_block_meta.boxed()))?;
344+
let entry = MutationLogEntry::ReplacedBlock {
345+
index: block_meta_index,
346+
block_meta: Arc::new(extended_block_meta),
347+
};
348+
let meta = MutationLogs {
349+
entries: vec![entry],
350+
};
351+
let new_block = DataBlock::empty_with_meta(Box::new(meta));
311352
Ok(new_block)
312353
}
313354
}

src/query/ee/src/virtual_column/virtual_column_handler.rs

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ use databend_common_pipeline_core::Pipeline;
2121
use databend_common_storages_fuse::FuseTable;
2222
use databend_enterprise_virtual_column::VirtualColumnHandler;
2323
use databend_enterprise_virtual_column::VirtualColumnHandlerWrapper;
24-
use databend_storages_common_table_meta::meta::Location;
2524

2625
use crate::storages::fuse::do_refresh_virtual_column;
2726

@@ -33,10 +32,9 @@ impl VirtualColumnHandler for RealVirtualColumnHandler {
3332
&self,
3433
ctx: Arc<dyn TableContext>,
3534
fuse_table: &FuseTable,
36-
segment_locs: Option<Vec<Location>>,
3735
pipeline: &mut Pipeline,
3836
) -> Result<()> {
39-
do_refresh_virtual_column(ctx, fuse_table, segment_locs, pipeline).await
37+
do_refresh_virtual_column(ctx, fuse_table, pipeline).await
4038
}
4139
}
4240

src/query/ee/tests/it/storages/fuse/operations/virtual_columns.rs

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -55,14 +55,7 @@ async fn test_fuse_do_refresh_virtual_column() -> Result<()> {
5555
let snapshot = snapshot_opt.unwrap();
5656

5757
let mut build_res = PipelineBuildResult::create();
58-
let segment_locs = Some(snapshot.segments.clone());
59-
do_refresh_virtual_column(
60-
table_ctx.clone(),
61-
fuse_table,
62-
segment_locs,
63-
&mut build_res.main_pipeline,
64-
)
65-
.await?;
58+
do_refresh_virtual_column(table_ctx.clone(), fuse_table, &mut build_res.main_pipeline).await?;
6659

6760
let settings = table_ctx.get_settings();
6861
build_res.set_max_threads(settings.get_max_threads()? as usize);

src/query/ee/tests/it/storages/fuse/operations/virtual_columns_builder.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ async fn test_virtual_column_builder() -> Result<()> {
4848

4949
let table = fixture.latest_default_table().await?;
5050
let table_info = table.get_table_info();
51+
let schema = table_info.meta.schema.clone();
5152

5253
let fuse_table = FuseTable::try_from_table(table.as_ref())?;
5354

@@ -57,7 +58,7 @@ async fn test_virtual_column_builder() -> Result<()> {
5758
0,
5859
); // Dummy location
5960

60-
let builder = VirtualColumnBuilder::try_create(ctx, table_info).unwrap();
61+
let builder = VirtualColumnBuilder::try_create(ctx, fuse_table, schema).unwrap();
6162

6263
let block = DataBlock::new(
6364
vec![

src/query/ee_features/virtual_column/Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ databend-common-catalog = { workspace = true }
1616
databend-common-exception = { workspace = true }
1717
databend-common-pipeline-core = { workspace = true }
1818
databend-common-storages-fuse = { workspace = true }
19-
databend-storages-common-table-meta = { workspace = true }
2019

2120
[build-dependencies]
2221

src/query/ee_features/virtual_column/src/virtual_column.rs

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,13 @@ use databend_common_catalog::table_context::TableContext;
1919
use databend_common_exception::Result;
2020
use databend_common_pipeline_core::Pipeline;
2121
use databend_common_storages_fuse::FuseTable;
22-
use databend_storages_common_table_meta::meta::Location;
2322

2423
#[async_trait::async_trait]
2524
pub trait VirtualColumnHandler: Sync + Send {
2625
async fn do_refresh_virtual_column(
2726
&self,
2827
ctx: Arc<dyn TableContext>,
2928
fuse_table: &FuseTable,
30-
segment_locs: Option<Vec<Location>>,
3129
pipeline: &mut Pipeline,
3230
) -> Result<()>;
3331
}
@@ -46,11 +44,10 @@ impl VirtualColumnHandlerWrapper {
4644
&self,
4745
ctx: Arc<dyn TableContext>,
4846
fuse_table: &FuseTable,
49-
segment_locs: Option<Vec<Location>>,
5047
pipeline: &mut Pipeline,
5148
) -> Result<()> {
5249
self.handler
53-
.do_refresh_virtual_column(ctx, fuse_table, segment_locs, pipeline)
50+
.do_refresh_virtual_column(ctx, fuse_table, pipeline)
5451
.await
5552
}
5653
}

0 commit comments

Comments
 (0)