Skip to content

Commit d428794

Browse files
authored
[Refactor](block) remove all index_by_name usage (apache#57860)
This pull request refactors how columns are accessed by name in the `Block` class and related code, removing the internal name-to-index map and switching to linear search. It also removes temporary column handling and updates code throughout the backend to use the new column access pattern. Additionally, it improves error handling and updates some operator logic for schema scans. ### Block class refactor (core theme): * Removed the internal `index_by_name` map from the `Block` class, replacing all column name lookups with linear search via the new `get_position_by_name` method. This affects methods like `get_by_name`, `try_get_by_name`, `has`, and related insert/erase logic. [[1]](diffhunk://#diff-76dba768c36c76cce660f7fe39514a98b509030a8976aabc9ac47d58bc923976L85-R87) [[2]](diffhunk://#diff-76dba768c36c76cce660f7fe39514a98b509030a8976aabc9ac47d58bc923976L164-L193) [[3]](diffhunk://#diff-76dba768c36c76cce660f7fe39514a98b509030a8976aabc9ac47d58bc923976L204-L227) [[4]](diffhunk://#diff-76dba768c36c76cce660f7fe39514a98b509030a8976aabc9ac47d58bc923976L241-L247) [[5]](diffhunk://#diff-76dba768c36c76cce660f7fe39514a98b509030a8976aabc9ac47d58bc923976L259-L288) [[6]](diffhunk://#diff-76dba768c36c76cce660f7fe39514a98b509030a8976aabc9ac47d58bc923976L310-R268) [[7]](diffhunk://#diff-76dba768c36c76cce660f7fe39514a98b509030a8976aabc9ac47d58bc923976L774) [[8]](diffhunk://#diff-76dba768c36c76cce660f7fe39514a98b509030a8976aabc9ac47d58bc923976L795-L803) [[9]](diffhunk://#diff-76dba768c36c76cce660f7fe39514a98b509030a8976aabc9ac47d58bc923976L825-L835) * Removed the ability to erase columns by name and the logic for maintaining the name-to-index map, simplifying column management. [[1]](diffhunk://#diff-76dba768c36c76cce660f7fe39514a98b509030a8976aabc9ac47d58bc923976L259-L288) [[2]](diffhunk://#diff-76dba768c36c76cce660f7fe39514a98b509030a8976aabc9ac47d58bc923976L241-L247) [[3]](diffhunk://#diff-76dba768c36c76cce660f7fe39514a98b509030a8976aabc9ac47d58bc923976L795-L803) ### Temporary column handling (cleanup theme): * Removed the definition and usage of temporary block column prefixes and the method `erase_tmp_columns`, as well as related cleanup logic in scan operators. [[1]](diffhunk://#diff-d569a6c78638e9c539783057051e89bcb135cc254c763e3bc5e3a71e334df883L27-L28) [[2]](diffhunk://#diff-ac38473f151ccd99db4ed80dfbfa176f4f957ae2f5335ce8fbb4f5dbb0e932e9L1332-L1338) [[3]](diffhunk://#diff-76dba768c36c76cce660f7fe39514a98b509030a8976aabc9ac47d58bc923976L795-L803) ### Column access updates in backend logic (consistency theme): * Updated all code that previously accessed columns by name using the old map to use the new position-based access, including changes in partial update logic, push broker reader, and schema scan operator. [[1]](diffhunk://#diff-23fa0193d626ba712c4186c66bcd1809c7e55bfc04ea10f5a91c691ed3e04727L944-R947) [[2]](diffhunk://#diff-2ed235dda16244dccd76626375b4512b6ade1724933269c40a2953c29dd95c61L314-R314) [[3]](diffhunk://#diff-2ed235dda16244dccd76626375b4512b6ade1724933269c40a2953c29dd95c61L387-R388) [[4]](diffhunk://#diff-2ed235dda16244dccd76626375b4512b6ade1724933269c40a2953c29dd95c61L423-L443) [[5]](diffhunk://#diff-aa97e616efca2638565e6a58aa9295252714265fcc1ed904d503425ef4930ce7L481-R484) [[6]](diffhunk://#diff-aa97e616efca2638565e6a58aa9295252714265fcc1ed904d503425ef4930ce7L494-R494) [[7]](diffhunk://#diff-991f87a3acb91e58d14e3b30520de5dd07ecade8d4ecf885c02bf436873b4262R182) [[8]](diffhunk://#diff-991f87a3acb91e58d14e3b30520de5dd07ecade8d4ecf885c02bf436873b4262L253-R257) ### Schema scan operator improvements (feature theme): * Added a slot-to-column index mapping (`_slot_offsets`) to `SchemaScanOperatorX` for more efficient column mapping during scans. [[1]](diffhunk://#diff-82ea988d6a5179d6698c2844ad8f06167ebdb362225b07b35e9a5ba0a7bb173aR22-R23) [[2]](diffhunk://#diff-82ea988d6a5179d6698c2844ad8f06167ebdb362225b07b35e9a5ba0a7bb173aR90-R92) [[3]](diffhunk://#diff-991f87a3acb91e58d14e3b30520de5dd07ecade8d4ecf885c02bf436873b4262R182) [[4]](diffhunk://#diff-991f87a3acb91e58d14e3b30520de5dd07ecade8d4ecf885c02bf436873b4262L253-R257) ### Error handling and robustness (quality theme): * Improved error handling for missing columns, notably in partial update logic, by returning internal errors with block structure dumps when columns are not found. --- This refactor simplifies the `Block` class, improves code consistency, and enhances error handling across the backend.
1 parent 758f167 commit d428794

28 files changed

+144
-391
lines changed

be/src/common/consts.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,6 @@ namespace BeConsts {
2424
const std::string CSV = "csv";
2525
const std::string CSV_WITH_NAMES = "csv_with_names";
2626
const std::string CSV_WITH_NAMES_AND_TYPES = "csv_with_names_and_types";
27-
const std::string BLOCK_TEMP_COLUMN_PREFIX = "__TEMP__";
28-
const std::string BLOCK_TEMP_COLUMN_SCANNER_FILTERED = "__TEMP__scanner_filtered";
2927
const std::string ROWID_COL = "__DORIS_ROWID_COL__";
3028
const std::string GLOBAL_ROWID_COL = "__DORIS_GLOBAL_ROWID_COL__";
3129
const std::string ROW_STORE_COL = "__DORIS_ROW_STORE_COL__";

be/src/olap/base_tablet.cpp

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -941,11 +941,10 @@ Status BaseTablet::fetch_value_by_rowids(RowsetSharedPtr input_rowset, uint32_t
941941

942942
const signed char* BaseTablet::get_delete_sign_column_data(const vectorized::Block& block,
943943
size_t rows_at_least) {
944-
if (const vectorized::ColumnWithTypeAndName* delete_sign_column =
945-
block.try_get_by_name(DELETE_SIGN);
946-
delete_sign_column != nullptr) {
944+
if (int pos = block.get_position_by_name(DELETE_SIGN); pos != -1) {
945+
const vectorized::ColumnWithTypeAndName& delete_sign_column = block.get_by_position(pos);
947946
const auto& delete_sign_col =
948-
reinterpret_cast<const vectorized::ColumnInt8&>(*(delete_sign_column->column));
947+
assert_cast<const vectorized::ColumnInt8&>(*(delete_sign_column.column));
949948
if (delete_sign_col.size() >= rows_at_least) {
950949
return delete_sign_col.get_data().data();
951950
}

be/src/olap/partial_update_info.cpp

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -311,9 +311,7 @@ Status FixedReadPlan::read_columns_by_plan(
311311
const signed char* __restrict cur_delete_signs) const {
312312
if (force_read_old_delete_signs) {
313313
// always read delete sign column from historical data
314-
if (const vectorized::ColumnWithTypeAndName* old_delete_sign_column =
315-
block.try_get_by_name(DELETE_SIGN);
316-
old_delete_sign_column == nullptr) {
314+
if (block.get_position_by_name(DELETE_SIGN) == -1) {
317315
auto del_col_cid = tablet_schema.field_index(DELETE_SIGN);
318316
cids_to_read.emplace_back(del_col_cid);
319317
block.swap(tablet_schema.create_block_by_cids(cids_to_read));
@@ -384,7 +382,10 @@ Status FixedReadPlan::fill_missing_columns(
384382
old_value_block, &read_index, true, nullptr));
385383

386384
const auto* old_delete_signs = BaseTablet::get_delete_sign_column_data(old_value_block);
387-
DCHECK(old_delete_signs != nullptr);
385+
if (old_delete_signs == nullptr) {
386+
return Status::InternalError("old delete signs column not found, block: {}",
387+
old_value_block.dump_structure());
388+
}
388389
// build default value columns
389390
auto default_value_block = old_value_block.clone_empty();
390391
RETURN_IF_ERROR(BaseTablet::generate_default_value_block(
@@ -420,27 +421,27 @@ Status FixedReadPlan::fill_missing_columns(
420421
}
421422

422423
if (should_use_default) {
423-
// clang-format off
424424
if (tablet_column.has_default_value()) {
425425
missing_col->insert_from(*mutable_default_value_columns[i], 0);
426426
} else if (tablet_column.is_nullable()) {
427-
auto* nullable_column = assert_cast<vectorized::ColumnNullable*, TypeCheckOnRelease::DISABLE>(missing_col.get());
427+
auto* nullable_column =
428+
assert_cast<vectorized::ColumnNullable*>(missing_col.get());
428429
nullable_column->insert_many_defaults(1);
429430
} else if (tablet_schema.auto_increment_column() == tablet_column.name()) {
430-
const auto& column = *DORIS_TRY(rowset_ctx->tablet_schema->column(tablet_column.name()));
431+
const auto& column =
432+
*DORIS_TRY(rowset_ctx->tablet_schema->column(tablet_column.name()));
431433
DCHECK(column.type() == FieldType::OLAP_FIELD_TYPE_BIGINT);
432434
auto* auto_inc_column =
433-
assert_cast<vectorized::ColumnInt64*, TypeCheckOnRelease::DISABLE>(missing_col.get());
434-
auto_inc_column->insert(vectorized::Field::create_field<TYPE_BIGINT>(
435-
assert_cast<const vectorized::ColumnInt64*, TypeCheckOnRelease::DISABLE>(
436-
block->get_by_name(BeConsts::PARTIAL_UPDATE_AUTO_INC_COL).column.get())->get_element(idx)));
435+
assert_cast<vectorized::ColumnInt64*>(missing_col.get());
436+
auto_inc_column->insert_from(
437+
*block->get_by_name(BeConsts::PARTIAL_UPDATE_AUTO_INC_COL).column.get(),
438+
idx);
437439
} else {
438440
// If the control flow reaches this branch, the column neither has default value
439441
// nor is nullable. It means that the row's delete sign is marked, and the value
440442
// columns are useless and won't be read. So we can just put arbitary values in the cells
441443
missing_col->insert(tablet_column.get_vec_type()->get_default());
442444
}
443-
// clang-format on
444445
} else {
445446
missing_col->insert_from(*old_value_block.get_by_position(i).column,
446447
pos_in_old_block);

be/src/olap/push_handler.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -478,10 +478,10 @@ Status PushBrokerReader::_cast_to_input_block() {
478478
if (slot_desc->type()->get_primitive_type() == PrimitiveType::TYPE_VARIANT) {
479479
continue;
480480
}
481-
auto& arg = _src_block_ptr->get_by_name(slot_desc->col_name());
482481
// remove nullable here, let the get_function decide whether nullable
483482
auto return_type = slot_desc->get_data_type_ptr();
484483
idx = _src_block_name_to_idx[slot_desc->col_name()];
484+
auto& arg = _src_block_ptr->get_by_position(idx);
485485
// bitmap convert:src -> to_base64 -> bitmap_from_base64
486486
if (slot_desc->type()->get_primitive_type() == TYPE_BITMAP) {
487487
auto base64_return_type = vectorized::DataTypeFactory::instance().create_data_type(
@@ -491,7 +491,7 @@ Status PushBrokerReader::_cast_to_input_block() {
491491
RETURN_IF_ERROR(func_to_base64->execute(nullptr, *_src_block_ptr, {idx}, idx,
492492
arg.column->size()));
493493
_src_block_ptr->get_by_position(idx).type = std::move(base64_return_type);
494-
auto& arg_base64 = _src_block_ptr->get_by_name(slot_desc->col_name());
494+
auto& arg_base64 = _src_block_ptr->get_by_position(idx);
495495
auto func_bitmap_from_base64 =
496496
vectorized::SimpleFunctionFactory::instance().get_function(
497497
"bitmap_from_base64", {arg_base64}, return_type);

be/src/olap/tablet_schema.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -463,7 +463,6 @@ class TabletSchema : public MetadataAdder<TabletSchema> {
463463
void set_skip_write_index_on_load(bool skip) { _skip_write_index_on_load = skip; }
464464
bool skip_write_index_on_load() const { return _skip_write_index_on_load; }
465465
int32_t delete_sign_idx() const { return _delete_sign_idx; }
466-
void set_delete_sign_idx(int32_t delete_sign_idx) { _delete_sign_idx = delete_sign_idx; }
467466
bool has_sequence_col() const { return _sequence_col_idx != -1; }
468467
int32_t sequence_col_idx() const { return _sequence_col_idx; }
469468
void set_version_col_idx(int32_t version_col_idx) { _version_col_idx = version_col_idx; }

be/src/pipeline/exec/scan_operator.cpp

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1332,13 +1332,6 @@ Status ScanOperatorX<LocalStateType>::get_block(RuntimeState* state, vectorized:
13321332
bool* eos) {
13331333
auto& local_state = get_local_state(state);
13341334
SCOPED_TIMER(local_state.exec_time_counter());
1335-
// in inverted index apply logic, in order to optimize query performance,
1336-
// we built some temporary columns into block, these columns only used in scan node level,
1337-
// remove them when query leave scan node to avoid other nodes use block->columns() to make a wrong decision
1338-
Defer drop_block_temp_column {[&]() {
1339-
std::unique_lock l(local_state._block_lock);
1340-
block->erase_tmp_columns();
1341-
}};
13421335

13431336
if (state->is_cancelled()) {
13441337
if (local_state._scanner_ctx) {

be/src/pipeline/exec/schema_scan_operator.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,7 @@ Status SchemaScanOperatorX::prepare(RuntimeState* state) {
179179
int j = 0;
180180
for (; j < columns_desc.size(); ++j) {
181181
if (boost::iequals(_dest_tuple_desc->slots()[i]->col_name(), columns_desc[j].name)) {
182+
_slot_offsets[i] = j;
182183
break;
183184
}
184185
}
@@ -250,11 +251,10 @@ Status SchemaScanOperatorX::get_block(RuntimeState* state, vectorized::Block* bl
250251
if (src_block.rows()) {
251252
// block->check_number_of_rows();
252253
for (int i = 0; i < _slot_num; ++i) {
253-
auto* dest_slot_desc = _dest_tuple_desc->slots()[i];
254254
vectorized::MutableColumnPtr column_ptr =
255255
std::move(*block->get_by_position(i).column).mutate();
256256
column_ptr->insert_range_from(
257-
*src_block.get_by_name(dest_slot_desc->col_name()).column, 0,
257+
*src_block.safe_get_by_position(_slot_offsets[i]).column, 0,
258258
src_block.rows());
259259
}
260260
RETURN_IF_ERROR(local_state.filter_block(local_state._conjuncts, block,

be/src/pipeline/exec/schema_scan_operator.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919

2020
#include <stdint.h>
2121

22+
#include <unordered_map>
23+
2224
#include "common/status.h"
2325
#include "exec/schema_scanner.h"
2426
#include "operator.h"
@@ -85,6 +87,9 @@ class SchemaScanOperatorX final : public OperatorX<SchemaScanLocalState> {
8587
int _tuple_idx;
8688
// slot num need to fill in and return
8789
int _slot_num;
90+
91+
// slot index mapping to src column index
92+
std::unordered_map<int, int> _slot_offsets;
8893
};
8994

9095
#include "common/compile_check_end.h"

0 commit comments

Comments
 (0)