From 88edf2bd06ac3f63802f5bf7abd7e2852e624576 Mon Sep 17 00:00:00 2001 From: zhaochangle Date: Mon, 11 May 2026 20:42:09 +0800 Subject: [PATCH 01/11] [fix](be) Restore COW ownership in mutable column paths Issue Number: N/A Related PR: #63001 Problem Summary: Restore ownership-safe COW mutation after assume_mutable became an assertion, and preserve shared immutable subcolumns for return-new wrappers to avoid unnecessary deep copies. None - Test: Unit Test / Manual test - ./run-be-ut.sh --run --filter=ColumnArrayTest.SharedCreateValidatesOffsetsAndDataSize:ColumnNullableTest.SharedCreatePreservesImmutableSubcolumns:ColumnMapTest2.SharedCreatePreservesImmutableSubcolumns:ColumnMapTest2.ConstFilterAndPermuteKeepInputAliasesUntouched:ColumnMapTest2.DeduplicateNestedNullableMapValuesDetachesSharedValueColumn:ComplexTypeTest.DeserializeStructWritesBackSharedChildren:VariantUtilTest.ParseNullableScalarVariantDetachesNestedAlias -j 100
- build-support/check-format.sh (run with the local ldb_toolchain on PATH) - git diff --check - ./build.sh --be -j 100 - Behavior changed: No - Does this need documentation: No --- be/src/core/block/block.cpp | 33 +- be/src/core/block/block.h | 1 + be/src/core/column/column.cpp | 5 +- be/src/core/column/column.h | 12 +- be/src/core/column/column_array.cpp | 69 ++- be/src/core/column/column_array.h | 19 +- be/src/core/column/column_const.cpp | 14 +- be/src/core/column/column_const.h | 9 +- be/src/core/column/column_map.cpp | 159 ++++--- be/src/core/column/column_map.h | 5 +- be/src/core/column/column_nullable.cpp | 67 ++- be/src/core/column/column_nullable.h | 5 +- be/src/core/column/column_varbinary.h | 7 +- be/src/core/column/column_variant.cpp | 97 +++-- be/src/core/column/column_variant.h | 8 +- be/src/core/cow.h | 17 +- be/src/core/data_type/data_type_array.cpp | 3 +- be/src/core/data_type/data_type_map.cpp | 8 +- be/src/core/data_type/data_type_struct.cpp | 3 +- .../common/arrow_column_to_doris_column.cpp | 10 +- .../data_gen_functions/vnumbers_tvf.cpp | 4 +- .../exec/common/hash_table/hash_map_context.h | 2 +- be/src/exec/common/variant_util.cpp | 24 +- be/src/exec/exchange/local_exchanger.cpp | 7 +- .../operator/aggregation_sink_operator.cpp | 17 +- .../operator/aggregation_source_operator.cpp | 302 ++++++------- 
.../bucketed_aggregation_sink_operator.cpp | 5 +- .../bucketed_aggregation_source_operator.cpp | 32 +- .../exec/operator/cache_source_operator.cpp | 8 +- ...istinct_streaming_aggregation_operator.cpp | 20 +- be/src/exec/operator/hashjoin_build_sink.cpp | 4 +- be/src/exec/operator/hashjoin_build_sink.h | 2 +- .../join/process_hash_table_probe_impl.h | 27 +- .../nested_loop_join_probe_operator.cpp | 4 +- be/src/exec/operator/operator.cpp | 4 +- be/src/exec/operator/repeat_operator.cpp | 2 + be/src/exec/operator/schema_scan_operator.cpp | 15 +- be/src/exec/operator/set_source_operator.cpp | 5 +- .../streaming_aggregation_operator.cpp | 42 +- .../exec/operator/table_function_operator.cpp | 1 + be/src/exec/operator/union_sink_operator.h | 3 +- .../exec/operator/union_source_operator.cpp | 3 + be/src/exec/rowid_fetcher.cpp | 31 +- be/src/exec/scan/file_scanner.cpp | 16 +- be/src/exec/scan/meta_scanner.cpp | 19 +- be/src/exec/scan/scanner.cpp | 2 +- be/src/exec/scan/scanner.h | 5 +- be/src/exec/sink/vtablet_block_convertor.cpp | 27 +- be/src/exec/sort/partition_sorter.cpp | 4 + be/src/exec/sort/vsorted_run_merger.cpp | 1 + .../aggregate/aggregate_function_java_udaf.h | 9 +- .../aggregate/aggregate_function_null_v2.h | 3 +- .../exprs/aggregate/aggregate_function_sort.h | 62 ++- .../function/array/function_array_flatten.cpp | 14 +- be/src/exprs/function/cast/cast_to_variant.h | 31 +- be/src/exprs/function/function.cpp | 9 +- be/src/exprs/function/function_bitmap.cpp | 10 +- .../function/function_variant_element.cpp | 21 +- .../table_function/python_udtf_function.cpp | 7 +- .../table_function/udf_table_function.cpp | 4 +- be/src/exprs/table_function/vexplode.cpp | 10 +- be/src/exprs/table_function/vexplode_v2.cpp | 11 +- be/src/exprs/vcase_expr.h | 6 +- be/src/exprs/vcompound_pred.h | 61 +-- be/src/format/arrow/arrow_stream_reader.cpp | 3 +- be/src/format/column_type_convert.cpp | 8 +- be/src/format/column_type_convert.h | 74 ++-- be/src/format/csv/csv_reader.cpp | 37 +- 
be/src/format/csv/csv_reader.h | 6 +- be/src/format/jni/jni_data_bridge.cpp | 41 +- be/src/format/json/new_json_reader.cpp | 64 +-- be/src/format/json/new_json_reader.h | 6 + be/src/format/lance/lance_rust_reader.cpp | 8 +- be/src/format/orc/vorc_reader.cpp | 124 ++++-- .../format/parquet/parquet_column_convert.h | 165 ++++++-- .../format/parquet/vparquet_column_reader.cpp | 22 +- .../format/parquet/vparquet_column_reader.h | 1 + .../format/parquet/vparquet_group_reader.cpp | 22 +- be/src/format/table/equality_delete.cpp | 8 +- be/src/format/table/iceberg_reader_mixin.h | 4 +- be/src/format/table/paimon_cpp_reader.cpp | 8 +- .../format/table/parquet_metadata_reader.cpp | 7 +- be/src/format/table/remote_doris_reader.cpp | 8 +- be/src/format/table/table_format_reader.h | 4 +- .../schema_active_queries_scanner.cpp | 1 + ...ma_authentication_integrations_scanner.cpp | 1 + .../schema_backend_active_tasks.cpp | 3 +- .../schema_backend_kerberos_ticket_cache.cpp | 1 + ...chema_catalog_meta_cache_stats_scanner.cpp | 1 + .../schema_database_properties_scanner.cpp | 1 + .../schema_file_cache_statistics.cpp | 1 + .../schema_partitions_scanner.cpp | 1 + .../schema_role_mappings_scanner.cpp | 1 + be/src/information_schema/schema_scanner.cpp | 35 +- .../schema_scanner_helper.cpp | 36 +- .../schema_sql_block_rule_status_scanner.cpp | 1 + .../schema_table_options_scanner.cpp | 1 + .../schema_table_properties_scanner.cpp | 1 + ...chema_table_stream_consumption_scanner.cpp | 3 +- .../schema_table_streams_scanner.cpp | 3 +- .../schema_view_dependency_scanner.cpp | 1 + .../schema_workload_group_privileges.cpp | 1 + ..._workload_group_resource_usage_scanner.cpp | 3 +- .../schema_workload_groups_scanner.cpp | 1 + .../schema_workload_sched_policy_scanner.cpp | 1 + be/src/load/memtable/memtable.cpp | 32 +- be/src/load/memtable/memtable.h | 6 +- be/src/runtime/query_cache/query_cache.cpp | 7 +- be/src/runtime/result_block_buffer.cpp | 6 +- be/src/service/point_query_executor.cpp | 173 
++++---- be/src/storage/iterator/block_reader.cpp | 5 +- .../iterator/vertical_block_reader.cpp | 18 +- be/src/storage/partial_update_info.cpp | 40 +- .../storage/schema_change/schema_change.cpp | 28 +- be/src/storage/segment/column_reader.cpp | 37 +- be/src/storage/segment/segment_writer.cpp | 10 +- be/src/storage/segment/segment_writer.h | 2 +- .../variant/binary_column_extract_iterator.h | 4 +- .../variant/hierarchical_data_iterator.cpp | 42 +- .../variant/hierarchical_data_iterator.h | 1 + .../segment/variant/variant_column_reader.cpp | 5 +- .../variant/variant_column_writer_impl.cpp | 18 +- .../variant_streaming_compaction_writer.cpp | 13 +- .../segment/vertical_segment_writer.cpp | 25 +- .../storage/segment/vertical_segment_writer.h | 2 +- be/src/storage/tablet/base_tablet.cpp | 16 +- be/src/util/jsonb/serialize.cpp | 51 ++- be/src/util/jsonb/serialize.h | 8 +- be/test/core/block/block_test.cpp | 25 ++ be/test/core/block/column_map_test.cpp | 116 ++++- be/test/core/block/column_nullable_test.cpp | 2 +- be/test/core/block/column_test.cpp | 19 + be/test/core/column/column_array_test.cpp | 27 +- be/test/core/column/column_ip_test.cpp | 98 ++--- be/test/core/column/column_nullable_test.cpp | 21 +- be/test/core/column/column_variant_test.cpp | 11 +- be/test/core/column/common_column_test.h | 12 +- .../data_type/common_data_type_serder_test.h | 2 +- be/test/core/data_type/complex_type_test.cpp | 115 +++++ .../data_type_serde_csv_test.cpp | 13 +- .../data_type_serde_struct_test.cpp | 7 +- be/test/core/jsonb/serialize_test.cpp | 40 ++ be/test/exec/column_type_convert_test.cpp | 126 ++---- .../exec/common/schema_util_rowset_test.cpp | 1 + be/test/exec/common/schema_util_test.cpp | 7 +- .../exec/connector/vjdbc_connector_test.cpp | 69 ++- be/test/exec/operator/agg_operator_test.cpp | 90 ++++ .../exec/operator/datagen_operator_test.cpp | 33 ++ be/test/exec/operator/set_operator_test.cpp | 50 ++- .../exprs/aggregate/agg_array_agg_test.cpp | 99 +++++ 
.../function_variant_element_test.cpp | 4 +- be/test/format/json/json_reader_test.cpp | 81 ++++ .../native/native_reader_writer_test.cpp | 1 + .../format/orc/orc_reader_fill_data_test.cpp | 40 +- .../parquet/parquet_column_convert_test.cpp | 177 ++++++++ .../format/parquet/parquet_thrift_test.cpp | 11 +- .../delta_writer_cluster_key_test.cpp | 3 +- .../load/delta_writer/delta_writer_test.cpp | 8 +- .../memtable/memtable_memory_limiter_test.cpp | 1 + be/test/runtime/snapshot_loader_test.cpp | 1 + be/test/runtime/stream_load_parquet_test.cpp | 32 +- .../adaptive_thread_pool_controller_test.cpp | 9 +- .../ordered_data_compaction_test.cpp | 2 + .../compaction/segcompaction_mow_test.cpp | 34 +- .../storage/compaction/segcompaction_test.cpp | 56 +-- .../compaction/vertical_compaction_test.cpp | 16 +- .../storage/index/date_bloom_filter_test.cpp | 2 + be/test/storage/index/index_builder_test.cpp | 36 ++ .../common/inverted_index_gc_binlogs_test.cpp | 1 + .../util/index_compaction_utils.cpp | 4 +- be/test/storage/rowid_conversion_test.cpp | 1 + .../storage/segment/segment_cache_test.cpp | 7 +- .../segments_key_bounds_truncation_test.cpp | 1 + .../variant_column_writer_reader_test.cpp | 168 ++------ be/test/storage/segment/variant_util_test.cpp | 34 ++ .../storage/tablet/tablet_cooldown_test.cpp | 5 +- be/test/util/bit_util_test.cpp | 32 +- docs/dev/be-cow-assume-mutable-audit.md | 397 ++++++++++++++++++ 178 files changed, 3507 insertions(+), 1354 deletions(-) create mode 100644 docs/dev/be-cow-assume-mutable-audit.md diff --git a/be/src/core/block/block.cpp b/be/src/core/block/block.cpp index 2bb156325443e3..9e3788a2adb040 100644 --- a/be/src/core/block/block.cpp +++ b/be/src/core/block/block.cpp @@ -581,7 +581,7 @@ MutableColumns Block::mutate_columns() { MutableColumns columns(num_columns); for (size_t i = 0; i < num_columns; ++i) { DCHECK(data[i].type); - columns[i] = data[i].column ? (*std::move(data[i].column)).mutate() + columns[i] = data[i].column ? 
IColumn::mutate(std::move(data[i].column)) : data[i].type->create_column(); } return columns; @@ -655,9 +655,26 @@ void Block::clear_column_data(int64_t column_size) noexcept { } for (auto& d : data) { if (d.column) { - // Temporarily disable reference count check because a column might be referenced multiple times within a block. - // Queries like this: `select c, c from t1;` - (*std::move(d.column)).assume_mutable()->clear(); + if (d.column->is_exclusive()) { + d.column->assume_mutable()->clear(); + } else { + d.column = d.column->clone_empty(); + } + } + } +} + +void Block::clear_column_data(const std::vector& columns_to_clear) noexcept { + SCOPED_SKIP_MEMORY_CHECK(); + for (auto col : columns_to_clear) { + DCHECK_LT(col, data.size()); + auto& column = data[col].column; + if (column) { + if (column->is_exclusive()) { + column->assume_mutable()->clear(); + } else { + column = column->clone_empty(); + } } } } @@ -1085,7 +1102,13 @@ void Block::shrink_char_type_column_suffix_zero(const std::vector& char_ for (auto idx : char_type_idx) { if (idx < data.size()) { auto& col_and_name = this->get_by_position(idx); - col_and_name.column->assume_mutable()->shrink_padding_chars(); + if (col_and_name.column->is_exclusive()) { + col_and_name.column->assume_mutable()->shrink_padding_chars(); + } else { + auto mutable_col = std::move(*col_and_name.column).mutate(); + mutable_col->shrink_padding_chars(); + col_and_name.column = std::move(mutable_col); + } } } } diff --git a/be/src/core/block/block.h b/be/src/core/block/block.h index 62186b36cced7e..535dc0ff286309 100644 --- a/be/src/core/block/block.h +++ b/be/src/core/block/block.h @@ -227,6 +227,7 @@ class Block { // Default column size = -1 means clear all column in block // Else clear column [0, column_size) delete column [column_size, data.size) void clear_column_data(int64_t column_size = -1) noexcept; + void clear_column_data(const std::vector& columns_to_clear) noexcept; MOCK_FUNCTION bool mem_reuse() { return 
!data.empty(); } diff --git a/be/src/core/column/column.cpp b/be/src/core/column/column.cpp index b0056e3d4377bd..3fea47f93887ec 100644 --- a/be/src/core/column/column.cpp +++ b/be/src/core/column/column.cpp @@ -232,10 +232,11 @@ bool is_column_const(const IColumn& column) { void IColumn::check_const_only_in_top_level() const { ColumnCallback throw_if_const = [&](WrappedPtr& column) { - if (is_column_const(*column)) { + const ColumnPtr& col = const_cast(column); + if (is_column_const(*col)) { throw doris::Exception(ErrorCode::INTERNAL_ERROR, "const column is not allowed to be nested, but got {}", - column->get_name()); + col->get_name()); } }; const_cast(this)->for_each_subcolumn(throw_if_const); diff --git a/be/src/core/column/column.h b/be/src/core/column/column.h index d20ecc9d820846..c48c7a55da84cd 100644 --- a/be/src/core/column/column.h +++ b/be/src/core/column/column.h @@ -581,16 +581,20 @@ class IColumn : public COW { MutablePtr mutate() const&& { MutablePtr res = shallow_mutate(); - res->for_each_subcolumn( - [](WrappedPtr& subcolumn) { subcolumn = std::move(*subcolumn).mutate(); }); + res->for_each_subcolumn([](WrappedPtr& subcolumn) { + static_cast(subcolumn) = + std::move(*static_cast(subcolumn)).mutate(); + }); return res; } static MutablePtr mutate(Ptr ptr) { MutablePtr res = ptr->shallow_mutate(); /// Now use_count is 2. ptr.reset(); /// Reset use_count to 1. 
- res->for_each_subcolumn( - [](WrappedPtr& subcolumn) { subcolumn = std::move(*subcolumn).mutate(); }); + res->for_each_subcolumn([](WrappedPtr& subcolumn) { + static_cast(subcolumn) = + std::move(*static_cast(subcolumn)).mutate(); + }); return res; } diff --git a/be/src/core/column/column_array.cpp b/be/src/core/column/column_array.cpp index 6de4d96cc326f7..e1d3e42e5451f0 100644 --- a/be/src/core/column/column_array.cpp +++ b/be/src/core/column/column_array.cpp @@ -47,6 +47,41 @@ class SipHash; namespace doris { +namespace { + +const ColumnArray::ColumnOffsets& check_array_offsets_column(const IColumn& offsets_column) { + const auto* offsets_concrete = typeid_cast(&offsets_column); + if (!offsets_concrete) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, "offsets_column must be a ColumnUInt64"); + __builtin_unreachable(); + } + return *offsets_concrete; +} + +void validate_array_offsets(const IColumn& nested_column, const IColumn& offsets_column) { + const auto& offsets_concrete = check_array_offsets_column(offsets_column); + if (!offsets_concrete.empty()) { + auto last_offset = offsets_concrete.get_data().back(); + + /// This will also prevent possible overflow in offset. 
+ if (nested_column.size() != last_offset) { + throw doris::Exception( + ErrorCode::INTERNAL_ERROR, + "nested_column's size {}, is not consistent with offsets_column's {}", + nested_column.size(), last_offset); + } + } +} + +void check_empty_array_data_without_offsets(const IColumn& nested_column) { + if (!nested_column.empty()) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, + "Not empty data passed to ColumnArray, but no offsets passed"); + __builtin_unreachable(); + } +} + +} // namespace ColumnArray::ColumnArray(MutableColumnPtr&& nested_column, MutableColumnPtr&& offsets_column) : data(std::move(nested_column)), offsets(std::move(offsets_column)) { @@ -63,24 +98,8 @@ ColumnArray::ColumnArray(MutableColumnPtr&& nested_column, MutableColumnPtr&& of // } // #endif check_const_only_in_top_level(); - const auto* offsets_concrete = typeid_cast(offsets.get()); - - if (!offsets_concrete) { - throw doris::Exception(ErrorCode::INTERNAL_ERROR, "offsets_column must be a ColumnUInt64"); - __builtin_unreachable(); - } - - if (!offsets_concrete->empty() && data) { - auto last_offset = offsets_concrete->get_data().back(); - - /// This will also prevent possible overflow in offset. - if (data->size() != last_offset) { - throw doris::Exception( - ErrorCode::INTERNAL_ERROR, - "nested_column's size {}, is not consistent with offsets_column's {}", - data->size(), last_offset); - } - } + validate_array_offsets(*static_cast(data), + *static_cast(offsets)); /** NOTE * Arrays with constant value are possible and used in implementation of higher order functions (see FunctionReplicate). 
@@ -90,14 +109,18 @@ ColumnArray::ColumnArray(MutableColumnPtr&& nested_column, MutableColumnPtr&& of ColumnArray::ColumnArray(MutableColumnPtr&& nested_column) : data(std::move(nested_column)) { data = data->convert_to_full_column_if_const(); - if (!data->empty()) { - throw doris::Exception(ErrorCode::INTERNAL_ERROR, - "Not empty data passed to ColumnArray, but no offsets passed"); - __builtin_unreachable(); - } + check_empty_array_data_without_offsets(*data); offsets = ColumnOffsets::create(); } +ColumnArray::ColumnArray(SharedTag, ColumnPtr nested_column, ColumnPtr offsets_column) { + static_cast(data) = std::move(nested_column); + static_cast(offsets) = std::move(offsets_column); + check_const_only_in_top_level(); + validate_array_offsets(*static_cast(data), + *static_cast(offsets)); +} + void ColumnArray::shrink_padding_chars() { data->shrink_padding_chars(); } diff --git a/be/src/core/column/column_array.h b/be/src/core/column/column_array.h index c11547bdbf5e2d..dba4f046ec2350 100644 --- a/be/src/core/column/column_array.h +++ b/be/src/core/column/column_array.h @@ -75,6 +75,10 @@ class ColumnArray final : public COWHelper { /** Create an empty column of arrays with the type of values as in the column `nested_column` */ explicit ColumnArray(MutableColumnPtr&& nested_column); + /** Create an array column with shared (possibly non-exclusive) nested column and offsets. 
*/ + struct SharedTag {}; + ColumnArray(SharedTag, ColumnPtr nested_column, ColumnPtr offsets_column); + ColumnArray(const ColumnArray&) = default; ColumnArray() = default; @@ -98,12 +102,21 @@ class ColumnArray final : public COWHelper { using Base = COWHelper; static MutablePtr create(const ColumnPtr& nested_column, const ColumnPtr& offsets_column) { - return ColumnArray::create(nested_column->assume_mutable(), - offsets_column->assume_mutable()); + // Construct with shared columns preserved (no cloning), as create(ColumnPtr) is designed + // to accept immutable/shared arguments per the COW contract. + return Base::create(SharedTag {}, nested_column, offsets_column); } static MutablePtr create(const ColumnPtr& nested_column) { - return ColumnArray::create(nested_column->assume_mutable()); + // Construct with shared columns preserved (no cloning), as create(ColumnPtr) is designed + // to accept immutable/shared arguments per the COW contract. + if (!nested_column->empty()) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, + "Not empty data passed to ColumnArray, but no offsets passed"); + __builtin_unreachable(); + } + ColumnPtr empty_offsets = ColumnOffsets::create(); + return Base::create(SharedTag {}, nested_column, std::move(empty_offsets)); } template empty() != create_with_empty) { + const IColumn& col = get_data_column(); + if (col.empty() != create_with_empty) { throw doris::Exception(ErrorCode::INTERNAL_ERROR, "Incorrect size of nested column in constructor of ColumnConst: {}, " "create_with_empty: {}.", - data->size(), create_with_empty); + col.size(), create_with_empty); } - if (data->size() != 1 && !create_with_empty) { + if (col.size() != 1 && !create_with_empty) { throw doris::Exception( ErrorCode::INTERNAL_ERROR, "Incorrect size of nested column in constructor of ColumnConst: {}, must be 1.", - data->size()); + col.size()); } } @@ -108,7 +109,10 @@ void ColumnConst::get_permutation(bool /*reverse*/, size_t /*limit*/, int /*nan_ } void 
ColumnConst::replace_float_special_values() { - data->replace_float_special_values(); + // COW: get exclusive ownership of data before mutating + auto mutable_data = IColumn::mutate(std::move(static_cast(data))); + mutable_data->replace_float_special_values(); + data = std::move(mutable_data); } std::pair check_column_const_set_readability(const IColumn& column, diff --git a/be/src/core/column/column_const.h b/be/src/core/column/column_const.h index 92a86628526384..7f648ece468dd1 100644 --- a/be/src/core/column/column_const.h +++ b/be/src/core/column/column_const.h @@ -240,7 +240,8 @@ class ColumnConst final : public COWHelper { bool has_enough_capacity(const IColumn& src) const override { return true; } int compare_at(size_t, size_t, const IColumn& rhs, int nan_direction_hint) const override { - auto rhs_const_column = assert_cast(rhs); + const auto& rhs_const_column = + assert_cast(rhs); const auto* this_nullable = check_and_get_column(data.get()); const auto* rhs_nullable = @@ -321,7 +322,11 @@ class ColumnConst final : public COWHelper { size_t deserialize_impl(const char* pos) override { ++s; - return data->deserialize_impl(pos); + ColumnPtr owned = std::move(static_cast(data)); + auto mutable_data = IColumn::mutate(std::move(owned)); + size_t ret = mutable_data->deserialize_impl(pos); + data = std::move(mutable_data); + return ret; } void replace_float_special_values() override; diff --git a/be/src/core/column/column_map.cpp b/be/src/core/column/column_map.cpp index 48db377d888b75..7ad4cb522ef9d5 100644 --- a/be/src/core/column/column_map.cpp +++ b/be/src/core/column/column_map.cpp @@ -41,40 +41,68 @@ class SipHash; namespace doris { +namespace { -/** A column of map values. 
- */ -std::string ColumnMap::get_name() const { - return "Map(" + keys_column->get_name() + ", " + values_column->get_name() + ")"; +const ColumnMap::COffsets& check_map_offsets_column(const IColumn& offsets_column) { + const auto* offsets_concrete = check_and_get_column(offsets_column); + if (!offsets_concrete) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, "offsets_column must be a ColumnUInt64"); + __builtin_unreachable(); + } + return *offsets_concrete; } -ColumnMap::ColumnMap(MutableColumnPtr&& keys, MutableColumnPtr&& values, MutableColumnPtr&& offsets) - : keys_column(std::move(keys)), - values_column(std::move(values)), - offsets_column(std::move(offsets)) { - check_const_only_in_top_level(); - const auto* offsets_concrete = assert_cast(offsets_column.get()); +void validate_map_columns(const IColumn& keys, const IColumn& values, const IColumn& offsets) { + const auto& offsets_concrete = check_map_offsets_column(offsets); - if (!offsets_concrete->empty() && keys_column && values_column) { - auto last_offset = offsets_concrete->get_data().back(); + if (!offsets_concrete.empty()) { + auto last_offset = offsets_concrete.get_data().back(); /// This will also prevent possible overflow in offset. - if (keys_column->size() != last_offset) { + if (keys.size() != last_offset) { DCHECK(0); throw doris::Exception( doris::ErrorCode::INTERNAL_ERROR, "offsets_column size {} has data inconsistent with key_column {}", last_offset, - keys_column->size()); + keys.size()); } - if (values_column->size() != last_offset) { + if (values.size() != last_offset) { throw doris::Exception( doris::ErrorCode::INTERNAL_ERROR, "offsets_column size {} has data inconsistent with value_column {}", - last_offset, values_column->size()); + last_offset, values.size()); } } } +} // namespace + +/** A column of map values. 
+ */ +std::string ColumnMap::get_name() const { + return "Map(" + keys_column->get_name() + ", " + values_column->get_name() + ")"; +} + +ColumnMap::ColumnMap(MutableColumnPtr&& keys, MutableColumnPtr&& values, MutableColumnPtr&& offsets) + : keys_column(std::move(keys)), + values_column(std::move(values)), + offsets_column(std::move(offsets)) { + check_const_only_in_top_level(); + validate_map_columns(*static_cast(keys_column), + *static_cast(values_column), + *static_cast(offsets_column)); +} + +ColumnMap::ColumnMap(SharedTag, ColumnPtr keys, ColumnPtr values, ColumnPtr offsets) { + static_cast(keys_column) = std::move(keys); + static_cast(values_column) = std::move(values); + static_cast(offsets_column) = std::move(offsets); + check_const_only_in_top_level(); + validate_map_columns(*static_cast(keys_column), + *static_cast(values_column), + *static_cast(offsets_column)); +} + // todo. here to resize every row map MutableColumnPtr ColumnMap::clone_resized(size_t to_size) const { auto res = ColumnMap::create(get_keys().clone_empty(), get_values().clone_empty(), @@ -518,35 +546,45 @@ void ColumnMap::insert_range_from_ignore_overflow(const IColumn& src, size_t sta } ColumnPtr ColumnMap::filter(const Filter& filt, ssize_t result_size_hint) const { - auto k_arr = - ColumnArray::create(keys_column->assume_mutable(), offsets_column->assume_mutable()) - ->filter(filt, result_size_hint); - auto v_arr = - ColumnArray::create(values_column->assume_mutable(), offsets_column->assume_mutable()) - ->filter(filt, result_size_hint); + auto k_arr = ColumnArray::create(static_cast(keys_column), + static_cast(offsets_column)) + ->filter(filt, result_size_hint); + auto v_arr = ColumnArray::create(static_cast(values_column), + static_cast(offsets_column)) + ->filter(filt, result_size_hint); return ColumnMap::create(assert_cast(*k_arr).get_data_ptr(), assert_cast(*v_arr).get_data_ptr(), assert_cast(*k_arr).get_offsets_ptr()); } size_t ColumnMap::filter(const Filter& filter) { - 
MutableColumnPtr copied_off = offsets_column->clone_empty(); - copied_off->insert_range_from(*offsets_column, 0, offsets_column->size()); - ColumnArray::create(keys_column->assume_mutable(), offsets_column->assume_mutable()) - ->filter(filter); - ColumnArray::create(values_column->assume_mutable(), copied_off->assume_mutable()) - ->filter(filter); - return get_offsets().size(); + // Move subcolumns out of this ColumnMap to get exclusive ownership, then write back. + auto keys_mut = IColumn::mutate(std::move(static_cast(keys_column))); + auto offsets_mut = IColumn::mutate(std::move(static_cast(offsets_column))); + auto values_mut = IColumn::mutate(std::move(static_cast(values_column))); + // Clone offsets for values (both keys and values share the same offsets structure) + MutableColumnPtr copied_off = offsets_mut->clone_empty(); + copied_off->insert_range_from(*offsets_mut, 0, offsets_mut->size()); + auto k_arr = ColumnArray::create(std::move(keys_mut), std::move(offsets_mut)); + k_arr->filter(filter); + auto v_arr = ColumnArray::create(std::move(values_mut), std::move(copied_off)); + v_arr->filter(filter); + // Put filtered subcolumns back + static_cast(keys_column) = k_arr->get_data_ptr(); + static_cast(offsets_column) = k_arr->get_offsets_ptr(); + static_cast(values_column) = v_arr->get_data_ptr(); + // Use const access to avoid assume_mutable_ref() on the just-written-back offsets_column + // (k_arr still holds a ref, so use_count > 1 until k_arr goes out of scope) + return static_cast(offsets_column)->size(); } MutableColumnPtr ColumnMap::permute(const Permutation& perm, size_t limit) const { - // Make a temp column array - auto k_arr = - ColumnArray::create(keys_column->assume_mutable(), offsets_column->assume_mutable()) - ->permute(perm, limit); - auto v_arr = - ColumnArray::create(values_column->assume_mutable(), offsets_column->assume_mutable()) - ->permute(perm, limit); + auto k_arr = ColumnArray::create(static_cast(keys_column), + 
static_cast(offsets_column)) + ->permute(perm, limit); + auto v_arr = ColumnArray::create(static_cast(values_column), + static_cast(offsets_column)) + ->permute(perm, limit); return ColumnMap::create(assert_cast(*k_arr).get_data_ptr(), assert_cast(*v_arr).get_data_ptr(), @@ -554,23 +592,38 @@ MutableColumnPtr ColumnMap::permute(const Permutation& perm, size_t limit) const } Status ColumnMap::deduplicate_keys(bool recursive) { - const auto inner_rows = keys_column->size(); - const auto rows = offsets_column->size(); + const IColumn& ck = *static_cast(keys_column); + const IColumn& co = *static_cast(offsets_column); + const auto inner_rows = ck.size(); + const auto rows = co.size(); if (recursive) { - auto values_column_ = values_column; - if (values_column_->is_nullable()) { - values_column_ = (assert_cast(*values_column)).get_nested_column_ptr(); - } - - if (auto* values_map = check_and_get_column(values_column_.get())) { - RETURN_IF_ERROR(values_map->deduplicate_keys(recursive)); + const auto& values_ptr = static_cast(values_column); + if (const auto* nullable_values = check_and_get_column(values_ptr.get())) { + if (check_and_get_column(nullable_values->get_nested_column_ptr().get())) { + auto values_mut = + IColumn::mutate(std::move(static_cast(values_column))); + auto& nullable_values_mut = assert_cast(*values_mut); + auto nested_values_mut = + IColumn::mutate(static_cast(nullable_values_mut) + .get_nested_column_ptr()); + auto& nested_values_map = assert_cast(*nested_values_mut); + RETURN_IF_ERROR(nested_values_map.deduplicate_keys(recursive)); + ColumnPtr nested_values_ptr = std::move(nested_values_mut); + nullable_values_mut.change_nested_column(nested_values_ptr); + static_cast(values_column) = std::move(values_mut); + } + } else if (check_and_get_column(values_ptr.get())) { + auto values_mut = IColumn::mutate(std::move(static_cast(values_column))); + auto& values_map = assert_cast(*values_mut); + RETURN_IF_ERROR(values_map.deduplicate_keys(recursive)); + 
static_cast(values_column) = std::move(values_mut); } } DorisVector serialized_keys(inner_rows); - const size_t max_one_row_byte_size = keys_column->get_max_row_byte_size(); + const size_t max_one_row_byte_size = ck.get_max_row_byte_size(); size_t total_bytes = max_one_row_byte_size * inner_rows; Arena pool; @@ -579,7 +632,7 @@ Status ColumnMap::deduplicate_keys(bool recursive) { // reach mem limit, don't serialize in batch const char* begin = nullptr; for (size_t i = 0; i != inner_rows; ++i) { - serialized_keys[i] = keys_column->serialize_value_into_arena(i, pool, begin); + serialized_keys[i] = ck.serialize_value_into_arena(i, pool, begin); } } else { auto* serialized_key_buffer = reinterpret_cast(pool.alloc(total_bytes)); @@ -590,7 +643,7 @@ Status ColumnMap::deduplicate_keys(bool recursive) { serialized_keys[i].size = 0; } - keys_column->serialize(serialized_keys.data(), inner_rows); + ck.serialize(serialized_keys.data(), inner_rows); } auto new_offsets = COffsets::create(); @@ -598,7 +651,7 @@ Status ColumnMap::deduplicate_keys(bool recursive) { auto& new_offsets_data = new_offsets->get_data(); IColumn::Filter filter(inner_rows, 1); - auto& offsets = get_offsets(); + const auto& offsets = static_cast(this)->get_offsets(); Offset64 offset = 0; bool has_duplicated_key = false; @@ -636,8 +689,12 @@ Status ColumnMap::deduplicate_keys(bool recursive) { if (has_duplicated_key) { offsets_column = std::move(new_offsets); - keys_column->filter(filter); - values_column->filter(filter); + auto keys_mut = IColumn::mutate(std::move(static_cast(keys_column))); + keys_mut->filter(filter); + static_cast(keys_column) = std::move(keys_mut); + auto values_mut = IColumn::mutate(std::move(static_cast(values_column))); + values_mut->filter(filter); + static_cast(values_column) = std::move(values_mut); } return Status::OK(); diff --git a/be/src/core/column/column_map.h b/be/src/core/column/column_map.h index 12f8fe4f8184ab..25ce7cfbbd4c2e 100644 --- a/be/src/core/column/column_map.h 
+++ b/be/src/core/column/column_map.h @@ -58,11 +58,11 @@ class ColumnMap final : public COWHelper { */ using Base = COWHelper; using COffsets = ColumnArray::ColumnOffsets; + struct SharedTag {}; static MutablePtr create(const ColumnPtr& keys, const ColumnPtr& values, const ColumnPtr& offsets) { - return ColumnMap::create(keys->assume_mutable(), values->assume_mutable(), - offsets->assume_mutable()); + return Base::create(SharedTag {}, keys, values, offsets); } template { WrappedPtr offsets_column; // offset ColumnMap(MutableColumnPtr&& keys, MutableColumnPtr&& values, MutableColumnPtr&& offsets); + ColumnMap(SharedTag, ColumnPtr keys, ColumnPtr values, ColumnPtr offsets); ColumnMap(const ColumnMap&) = default; }; diff --git a/be/src/core/column/column_nullable.cpp b/be/src/core/column/column_nullable.cpp index 95b186fe894b69..12ee5aff6d5740 100644 --- a/be/src/core/column/column_nullable.cpp +++ b/be/src/core/column/column_nullable.cpp @@ -28,6 +28,30 @@ #include "exec/sort/sort_block.h" namespace doris { +namespace { + +const ColumnUInt8& check_nullable_null_map_column(const IColumn& null_map) { + const auto* concrete = check_and_get_column(null_map); + if (!concrete) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, + "ColumnNullable null map must be ColumnUInt8, but got {}", + null_map.get_name()); + __builtin_unreachable(); + } + return *concrete; +} + +void check_nullable_sizes(const IColumn& nested_column, const IColumn& null_map) { + const auto& null_map_concrete = check_nullable_null_map_column(null_map); + if (nested_column.size() != null_map_concrete.size()) { + throw doris::Exception( + ErrorCode::INTERNAL_ERROR, + "Size of nested column {} with size {} is not equal to size of null map {}", + nested_column.get_name(), nested_column.size(), null_map_concrete.size()); + } +} + +} // namespace ColumnNullable::ColumnNullable(MutableColumnPtr&& nested_column_, MutableColumnPtr&& null_map_) : _nested_column(std::move(nested_column_)), 
_null_map(std::move(null_map_)) { @@ -45,6 +69,33 @@ ColumnNullable::ColumnNullable(MutableColumnPtr&& nested_column_, MutableColumnP "ColumnNullable cannot have constant null map"); __builtin_unreachable(); } + check_nullable_sizes(*static_cast(_nested_column), + *static_cast(_null_map)); +} + +ColumnNullable::ColumnNullable(SharedTag, ColumnPtr nested_column_, ColumnPtr null_map_) { + check_nullable_sizes(*nested_column_, *null_map_); + + if (const auto* nullable_nested = check_and_get_column(nested_column_.get())) { + auto merged_null_map = null_map_->clone_empty(); + merged_null_map->insert_range_from(*null_map_, 0, null_map_->size()); + auto& merged_null_map_data = assert_cast(*merged_null_map).get_data(); + const auto& nested_null_map_data = nullable_nested->get_null_map_data(); + DCHECK_EQ(merged_null_map_data.size(), nested_null_map_data.size()); + for (size_t i = 0; i != merged_null_map_data.size(); ++i) { + merged_null_map_data[i] |= nested_null_map_data[i]; + } + + static_cast(_nested_column) = nullable_nested->get_nested_column_ptr(); + static_cast(_null_map) = std::move(merged_null_map); + } else { + static_cast(_nested_column) = std::move(nested_column_); + static_cast(_null_map) = std::move(null_map_); + } + + check_const_only_in_top_level(); + check_nullable_sizes(*static_cast(_nested_column), + *static_cast(_null_map)); } void ColumnNullable::shrink_padding_chars() { @@ -113,7 +164,14 @@ void ColumnNullable::update_crc32c_batch(uint32_t* __restrict hashes, const auto* __restrict real_null_data = get_null_map_column().get_data().data(); if (_nested_column->support_replace_column_null_data()) { // nullmap process is slow, replace null data to default value to avoid nullmap process - _nested_column->assume_mutable()->replace_column_null_data(real_null_data); + // This is an intentional in-place mutation inside a logically-const hash computation: + // null positions are overwritten with defaults so the inner hash loop needs no null checks. 
+ // The invariant is that a column instance is not hashed concurrently through the same + // owner while this per-block hash path runs. Shared aliases are detached by mutate() + // before this normalized nested column is written back. + auto nested_mut = std::move(*static_cast(_nested_column)).mutate(); + nested_mut->replace_column_null_data(real_null_data); + static_cast(const_cast(_nested_column)) = std::move(nested_mut); _nested_column->update_crc32c_batch(hashes, nullptr); } else { auto s = size(); @@ -373,12 +431,15 @@ size_t ColumnNullable::filter(const Filter& filter) { Status ColumnNullable::filter_by_selector(const uint16_t* sel, size_t sel_size, IColumn* col_ptr) { auto* nullable_col_ptr = assert_cast(col_ptr); - WrappedPtr nest_col_ptr = nullable_col_ptr->_nested_column; + // Access the nested column via const path to avoid assume_mutable_ref (which requires + // exclusive ownership). The output col_ptr was just created, so its nested column is exclusive. + IColumn* nest_col_raw = const_cast( + static_cast(nullable_col_ptr->_nested_column).get()); /// `get_null_map_data` will set `_need_update_has_null` to true auto& res_nullmap = nullable_col_ptr->get_null_map_data(); - RETURN_IF_ERROR(get_nested_column().filter_by_selector(sel, sel_size, nest_col_ptr.get())); + RETURN_IF_ERROR(get_nested_column().filter_by_selector(sel, sel_size, nest_col_raw)); DCHECK(res_nullmap.empty()); res_nullmap.resize(sel_size); auto& cur_nullmap = get_null_map_column().get_data(); diff --git a/be/src/core/column/column_nullable.h b/be/src/core/column/column_nullable.h index a31df0937d2b61..01cdddf776effe 100644 --- a/be/src/core/column/column_nullable.h +++ b/be/src/core/column/column_nullable.h @@ -55,6 +55,8 @@ class ColumnNullable final : public COWHelper { friend class COWHelper; ColumnNullable(MutableColumnPtr&& nested_column_, MutableColumnPtr&& null_map_); + struct SharedTag {}; + ColumnNullable(SharedTag, ColumnPtr nested_column_, ColumnPtr null_map_); 
ColumnNullable(const ColumnNullable&) = default; public: @@ -63,8 +65,7 @@ class ColumnNullable final : public COWHelper { */ using Base = COWHelper; static MutablePtr create(const ColumnPtr& nested_column_, const ColumnPtr& null_map_) { - return ColumnNullable::create(nested_column_->assume_mutable(), - null_map_->assume_mutable()); + return Base::create(SharedTag {}, nested_column_, null_map_); } template diff --git a/be/src/core/column/column_varbinary.h b/be/src/core/column/column_varbinary.h index 673059194face5..caad77e28ad44f 100644 --- a/be/src/core/column/column_varbinary.h +++ b/be/src/core/column/column_varbinary.h @@ -44,7 +44,12 @@ class ColumnVarbinary final : public COWHelper { private: ColumnVarbinary() = default; ColumnVarbinary(const size_t n) : _data(n) {} - ColumnVarbinary(const ColumnVarbinary& src) : _data(src._data.begin(), src._data.end()) {} + ColumnVarbinary(const ColumnVarbinary& src) { + _data.reserve(src._data.size()); + for (const auto& value : src._data) { + insert_data(value.data(), value.size()); + } + } public: std::string get_name() const override { return "ColumnVarbinary"; } diff --git a/be/src/core/column/column_variant.cpp b/be/src/core/column/column_variant.cpp index 2ab04c80b861a8..37921e26989f86 100644 --- a/be/src/core/column/column_variant.cpp +++ b/be/src/core/column/column_variant.cpp @@ -484,7 +484,7 @@ MutableColumnPtr ColumnVariant::apply_for_columns(Func&& func) const { auto& finalized_object = assert_cast(*finalized); return finalized_object.apply_for_columns(std::forward(func)); } - auto new_root = func(get_root())->assume_mutable(); + auto new_root = std::move(*func(get_root())).mutate(); auto res = ColumnVariant::create(_max_subcolumns_count, _enable_doc_mode, get_root_type(), std::move(new_root)); for (const auto& subcolumn : subcolumns) { @@ -492,16 +492,16 @@ MutableColumnPtr ColumnVariant::apply_for_columns(Func&& func) const { continue; } auto new_subcolumn = 
func(subcolumn->data.get_finalized_column_ptr()); - if (!res->add_sub_column(subcolumn->path, new_subcolumn->assume_mutable(), + if (!res->add_sub_column(subcolumn->path, std::move(*new_subcolumn).mutate(), subcolumn->data.get_least_common_type())) { throw doris::Exception(ErrorCode::INTERNAL_ERROR, "add path {} is error", subcolumn->path.get_path()); } } auto sparse_column = func(serialized_sparse_column); - res->serialized_sparse_column = sparse_column->assume_mutable(); + res->serialized_sparse_column = IColumn::mutate(std::move(sparse_column)); auto doc_value_column = func(serialized_doc_value_column); - res->serialized_doc_value_column = doc_value_column->assume_mutable(); + res->serialized_doc_value_column = IColumn::mutate(std::move(doc_value_column)); res->num_rows = res->serialized_sparse_column->size(); ENABLE_CHECK_CONSISTENCY(res.get()); return res; @@ -942,6 +942,10 @@ bool ColumnVariant::Subcolumn::is_null_at(size_t n) const { } ind -= part->size(); } + // Remaining rows are pending lazy defaults (current_num_of_defaults suffix). + if (ind < current_num_of_defaults) { + return true; + } throw doris::Exception(ErrorCode::OUT_OF_BOUND, "Index ({}) for getting field is out of range", n); } @@ -972,6 +976,11 @@ void ColumnVariant::Subcolumn::get(size_t n, FieldWithDataType& res) const { ind -= part->size(); } + // Remaining rows are pending lazy defaults (current_num_of_defaults suffix). + if (ind < current_num_of_defaults) { + res = FieldWithDataType(Field()); + return; + } throw doris::Exception(ErrorCode::OUT_OF_BOUND, "Index ({}) for getting field is out of range", n); } @@ -2059,14 +2068,13 @@ Status ColumnVariant::serialize_sparse_columns( /// directly as NestedGroup data by the writer (VariantColumnWriterImpl). 
void ColumnVariant::unnest(Subcolumns::NodePtr& entry, Subcolumns& res_subcolumns) const { entry->data.finalize(); - auto nested_column = entry->data.get_finalized_column_ptr()->assume_mutable(); + auto nested_column = std::move(*entry->data.get_finalized_column_ptr()).mutate(); auto* nested_column_nullable = assert_cast(nested_column.get()); auto* nested_column_array = - assert_cast(nested_column_nullable->get_nested_column_ptr().get()); + assert_cast(&nested_column_nullable->get_nested_column()); auto& offset = nested_column_array->get_offsets_ptr(); - auto* nested_object_nullable = assert_cast( - nested_column_array->get_data_ptr()->assume_mutable().get()); + auto* nested_object_nullable = assert_cast(&nested_column_array->get_data()); auto& nested_object_column = assert_cast(nested_object_nullable->get_nested_column()); PathInData nested_path = entry->path; @@ -2082,13 +2090,18 @@ void ColumnVariant::unnest(Subcolumns::NodePtr& entry, Subcolumns& res_subcolumn path_builder.append(nested_entry->path.get_parts(), true); auto subnested_column = ColumnArray::create( ColumnNullable::create(nested_entry->data.get_finalized_column_ptr(), - nested_object_nullable->get_null_map_column_ptr()), + static_cast(nested_object_nullable) + ->get_null_map_column() + .get_ptr()), offset); - auto nullable_subnested_column = ColumnNullable::create( - std::move(subnested_column), nested_column_nullable->get_null_map_column_ptr()); + auto nullable_subnested_column = + ColumnNullable::create(std::move(subnested_column), + static_cast(nested_column_nullable) + ->get_null_map_column() + .get_ptr()); auto type = make_nullable( std::make_shared(nested_entry->data.least_common_type.get())); - Subcolumn subcolumn(nullable_subnested_column->assume_mutable(), type, is_nullable); + Subcolumn subcolumn(std::move(nullable_subnested_column), type, is_nullable); res_subcolumns.add(path_builder.build(), subcolumn); } } @@ -2101,7 +2114,24 @@ void ColumnVariant::clear_sparse_column() { } #endif - 
serialized_sparse_column->clear(); + serialized_sparse_column = ColumnPtr(create_binary_column_fn()); +} + +void ColumnVariant::ensure_binary_columns_rows() { + auto resize_if_empty = [this](WrappedPtr& column) { + const auto& const_column = static_cast(column); + if (const_column->size() == num_rows) { + return; + } + CHECK(const_column->empty()) + << "ColumnVariant binary column size mismatch, rows: " << num_rows + << ", column rows: " << const_column->size(); + auto mutable_column = IColumn::mutate(std::move(static_cast(column))); + mutable_column->resize(num_rows); + column = std::move(mutable_column); + }; + resize_if_empty(serialized_sparse_column); + resize_if_empty(serialized_doc_value_column); } Status ColumnVariant::convert_typed_path_to_storage_type( @@ -2216,6 +2246,7 @@ Status ColumnVariant::pick_subcolumns_to_sparse_column( } void ColumnVariant::finalize(FinalizeMode mode) { + ensure_binary_columns_rows(); if (is_finalized() && mode == FinalizeMode::READ_MODE) { _prev_positions.clear(); ENABLE_CHECK_CONSISTENCY(this); @@ -2263,6 +2294,7 @@ void ColumnVariant::finalize(FinalizeMode mode) { std::swap(subcolumns, new_subcolumns); _prev_positions.clear(); + ensure_binary_columns_rows(); ENABLE_CHECK_CONSISTENCY(this); } @@ -2313,7 +2345,7 @@ ColumnPtr ColumnVariant::filter(const Filter& filter, ssize_t count) const { ENABLE_CHECK_CONSISTENCY(res.get()); return res; } - auto new_root = get_root()->filter(filter, count)->assume_mutable(); + auto new_root = std::move(*get_root()->filter(filter, count)).mutate(); auto new_column = ColumnVariant::create(_max_subcolumns_count, _enable_doc_mode, get_root_type(), std::move(new_root)); for (const auto& entry : subcolumns) { @@ -2321,7 +2353,7 @@ ColumnPtr ColumnVariant::filter(const Filter& filter, ssize_t count) const { continue; } auto subcolumn = entry->data.get_finalized_column().filter(filter, -1); - new_column->add_sub_column(entry->path, subcolumn->assume_mutable(), + new_column->add_sub_column(entry->path, 
std::move(*subcolumn).mutate(), entry->data.get_least_common_type()); } new_column->serialized_sparse_column = serialized_sparse_column->filter(filter, count); @@ -2368,8 +2400,10 @@ void ColumnVariant::clear() { // we must keep root column exist empty.create_root(Subcolumn(0, is_nullable, true)); std::swap(empty, subcolumns); - serialized_sparse_column->clear(); - serialized_doc_value_column->clear(); + // Reassign to fresh empty columns to avoid requiring exclusive ownership. + // The existing columns may be shared (use_count > 1) so we cannot clear them in-place. + serialized_sparse_column = ColumnPtr(create_binary_column_fn()); + serialized_doc_value_column = ColumnPtr(create_binary_column_fn()); num_rows = 0; _prev_positions.clear(); ENABLE_CHECK_CONSISTENCY(this); @@ -2769,10 +2803,26 @@ void ColumnVariant::fill_path_column_from_sparse_data(Subcolumn& subcolumn, Null MutableColumnPtr ColumnVariant::clone() const { auto res = ColumnVariant::create(_max_subcolumns_count, _enable_doc_mode); + // Copy typed_path_count and nested_path_count so the subcolumn limit logic is consistent. + res->typed_path_count = typed_path_count; + res->nested_path_count = nested_path_count; Subcolumns new_subcolumns; for (const auto& subcolumn : subcolumns) { - auto new_subcolumn = subcolumn->data; - if (subcolumn->data.is_root) { + // Struct-copy all metadata (num_rows, num_of_defaults_in_prefix, + // current_num_of_defaults, data_types, etc.), then deep-clone data WrappedPtrs. + Subcolumn new_subcolumn = subcolumn->data; + for (auto& wp : new_subcolumn.data) { + static_cast(wp) = + std::move(*static_cast(wp)).mutate(); + } + // Flush pending lazy defaults into actual data so that the cloned subcolumn + // is self-consistent (current_num_of_defaults == 0 after clone). 
+ if (new_subcolumn.current_num_of_defaults > 0) { + size_t pending = new_subcolumn.current_num_of_defaults; + new_subcolumn.current_num_of_defaults = 0; + new_subcolumn.insert_many_defaults(pending); + } + if (subcolumn->data.is_root || subcolumn->path.empty()) { new_subcolumns.create_root(std::move(new_subcolumn)); } else if (!new_subcolumns.add(subcolumn->path, std::move(new_subcolumn))) { throw doris::Exception(ErrorCode::INTERNAL_ERROR, "add path {} is error in clone()", @@ -2783,13 +2833,8 @@ MutableColumnPtr ColumnVariant::clone() const { throw doris::Exception(ErrorCode::INTERNAL_ERROR, "root is nullptr in clone()"); } res->subcolumns = std::move(new_subcolumns); - auto&& column = serialized_sparse_column->get_ptr(); - auto sparse_column = std::move(*column).mutate(); - res->serialized_sparse_column = sparse_column->assume_mutable(); - - auto&& new_doc_value_column = serialized_doc_value_column->get_ptr(); - auto doc_value_column = std::move(*new_doc_value_column).mutate(); - res->serialized_doc_value_column = doc_value_column->assume_mutable(); + res->serialized_sparse_column = IColumn::mutate(serialized_sparse_column->get_ptr()); + res->serialized_doc_value_column = IColumn::mutate(serialized_doc_value_column->get_ptr()); res->set_num_rows(num_rows); ENABLE_CHECK_CONSISTENCY(res.get()); diff --git a/be/src/core/column/column_variant.h b/be/src/core/column/column_variant.h index 16ced2f529118f..1ae92afd54cccc 100644 --- a/be/src/core/column/column_variant.h +++ b/be/src/core/column/column_variant.h @@ -325,7 +325,7 @@ class ColumnVariant final : public COWHelper { if (subcolumns.empty()) { return nullptr; } - return subcolumns.get_mutable_root()->data.get_finalized_column_ptr()->assume_mutable(); + return std::move(*subcolumns.get_mutable_root()->data.get_finalized_column_ptr()).mutate(); } void serialize_one_row_to_string(int64_t row, std::string* output, @@ -354,6 +354,8 @@ class ColumnVariant final : public COWHelper { void clear_sparse_column(); + void 
ensure_binary_columns_rows(); + // root is null or type nothing bool is_null_root() const; @@ -409,8 +411,12 @@ class ColumnVariant final : public COWHelper { ColumnPtr get_sparse_column() const { return serialized_sparse_column; } + IColumn& get_sparse_column_mutable() { return *serialized_sparse_column; } + ColumnPtr get_doc_value_column() const { return serialized_doc_value_column; } + IColumn& get_doc_value_column_mutable() { return *serialized_doc_value_column; } + // use sparse_subcolumns_schema to record sparse column's path info and type static MutableColumnPtr create_binary_column_fn() { return ColumnMap::create(ColumnString::create(), ColumnString::create(), diff --git a/be/src/core/cow.h b/be/src/core/cow.h index fcac631aa83ce1..a0dd93bf545d20 100644 --- a/be/src/core/cow.h +++ b/be/src/core/cow.h @@ -25,6 +25,9 @@ #include #include +#include "common/exception.h" +#include "common/status.h" + namespace doris { /** Copy-on-write shared ptr. @@ -313,9 +316,19 @@ class COW { public: MutablePtr mutate() const&& { return shallow_mutate(); } - MutablePtr assume_mutable() const { return const_cast(this)->get_ptr(); } + MutablePtr assume_mutable() const { + if (this->use_count() > 1) { + throw Exception(ErrorCode::INTERNAL_ERROR, "COW::assume_mutable: use_count() > 1"); + } + return const_cast(this)->get_ptr(); + } - Derived& assume_mutable_ref() const { return const_cast(*derived()); } + Derived& assume_mutable_ref() const { + if (this->use_count() > 1) { + throw Exception(ErrorCode::INTERNAL_ERROR, "COW::assume_mutable: use_count() > 1"); + } + return const_cast(*derived()); + } protected: /// It works as immutable_ptr if it is const and as mutable_ptr if it is non const. 
diff --git a/be/src/core/data_type/data_type_array.cpp b/be/src/core/data_type/data_type_array.cpp index 1c0ef786e77ea0..b0be7d2a51c6fe 100644 --- a/be/src/core/data_type/data_type_array.cpp +++ b/be/src/core/data_type/data_type_array.cpp @@ -127,8 +127,9 @@ const char* DataTypeArray::deserialize(const char* buf, MutableColumnPtr* column memcpy(offsets.data(), buf, sizeof(ColumnArray::Offset64) * real_have_saved_num); buf += sizeof(ColumnArray::Offset64) * real_have_saved_num; // children - auto nested_column = data_column->get_data_ptr()->assume_mutable(); + auto nested_column = std::move(*data_column->get_data_ptr()).mutate(); buf = get_nested_type()->deserialize(buf, &nested_column, be_exec_version); + data_column->get_data_ptr() = std::move(nested_column); return buf; } diff --git a/be/src/core/data_type/data_type_map.cpp b/be/src/core/data_type/data_type_map.cpp index 0932bf47c218bd..043fd7a70248f3 100644 --- a/be/src/core/data_type/data_type_map.cpp +++ b/be/src/core/data_type/data_type_map.cpp @@ -135,10 +135,12 @@ const char* DataTypeMap::deserialize(const char* buf, MutableColumnPtr* column, memcpy(map_offsets.data(), buf, sizeof(ColumnArray::Offset64) * real_have_saved_num); buf += sizeof(ColumnArray::Offset64) * real_have_saved_num; // key value - auto nested_keys_column = map_column->get_keys_ptr()->assume_mutable(); - auto nested_values_column = map_column->get_values_ptr()->assume_mutable(); + auto nested_keys_column = std::move(*map_column->get_keys_ptr()).mutate(); + auto nested_values_column = std::move(*map_column->get_values_ptr()).mutate(); buf = get_key_type()->deserialize(buf, &nested_keys_column, be_exec_version); buf = get_value_type()->deserialize(buf, &nested_values_column, be_exec_version); + map_column->get_keys_ptr() = std::move(nested_keys_column); + map_column->get_values_ptr() = std::move(nested_values_column); return buf; } -} // namespace doris \ No newline at end of file +} // namespace doris diff --git 
a/be/src/core/data_type/data_type_struct.cpp b/be/src/core/data_type/data_type_struct.cpp index 44cdadd3e98387..ed1e56c51c5365 100644 --- a/be/src/core/data_type/data_type_struct.cpp +++ b/be/src/core/data_type/data_type_struct.cpp @@ -214,8 +214,9 @@ const char* DataTypeStruct::deserialize(const char* buf, MutableColumnPtr* colum auto* struct_column = assert_cast(origin_column); DCHECK(elems.size() == struct_column->tuple_size()); for (size_t i = 0; i < elems.size(); ++i) { - auto child_column = struct_column->get_column_ptr(i)->assume_mutable(); + auto child_column = std::move(*struct_column->get_column_ptr(i)).mutate(); buf = elems[i]->deserialize(buf, &child_column, be_exec_version); + struct_column->get_column_ptr(i) = std::move(child_column); } return buf; } diff --git a/be/src/exec/common/arrow_column_to_doris_column.cpp b/be/src/exec/common/arrow_column_to_doris_column.cpp index cd6e959596791b..645376ee12d7a7 100644 --- a/be/src/exec/common/arrow_column_to_doris_column.cpp +++ b/be/src/exec/common/arrow_column_to_doris_column.cpp @@ -100,10 +100,12 @@ Status arrow_column_to_doris_column(const arrow::Array* arrow_column, size_t arr Status arrow_column_to_doris_column(const arrow::Array* arrow_column, size_t arrow_batch_cur_idx, ColumnPtr& doris_column, const DataTypePtr& type, size_t num_elements, const cctz::time_zone& ctz) { - RETURN_IF_ERROR(type->get_serde()->read_column_from_arrow( - doris_column->assume_mutable_ref(), arrow_column, arrow_batch_cur_idx, - arrow_batch_cur_idx + num_elements, ctz)); - return Status::OK(); + auto mutable_column = IColumn::mutate(std::move(doris_column)); + auto status = type->get_serde()->read_column_from_arrow( + *mutable_column, arrow_column, arrow_batch_cur_idx, arrow_batch_cur_idx + num_elements, + ctz); + doris_column = std::move(mutable_column); + return status; } } // namespace doris diff --git a/be/src/exec/common/data_gen_functions/vnumbers_tvf.cpp b/be/src/exec/common/data_gen_functions/vnumbers_tvf.cpp index 
d1d4f91270f409..f36afc9f611b25 100644 --- a/be/src/exec/common/data_gen_functions/vnumbers_tvf.cpp +++ b/be/src/exec/common/data_gen_functions/vnumbers_tvf.cpp @@ -49,7 +49,7 @@ Status VNumbersTVF::get_next(RuntimeState* state, Block* block, bool* eos) { // now only support one column for tvf numbers for (int i = 0; i < _slot_num; ++i) { if (mem_reuse) { - columns[i] = std::move(*(block->get_by_position(i).column)).mutate(); + columns[i] = IColumn::mutate(std::move(block->get_by_position(i).column)); } else { columns[i] = _tuple_desc->slots()[i]->get_empty_mutable_column(); } @@ -73,7 +73,7 @@ Status VNumbersTVF::get_next(RuntimeState* state, Block* block, bool* eos) { } if (mem_reuse) { - columns.clear(); + block->set_columns(std::move(columns)); } else { size_t n_columns = 0; for (const auto* slot_desc : _tuple_desc->slots()) { diff --git a/be/src/exec/common/hash_table/hash_map_context.h b/be/src/exec/common/hash_table/hash_map_context.h index ebd303c66c2b16..5e590ac7789109 100644 --- a/be/src/exec/common/hash_table/hash_map_context.h +++ b/be/src/exec/common/hash_table/hash_map_context.h @@ -955,7 +955,7 @@ struct MethodKeysFixed : public MethodBase { const auto* nullmap = assert_cast(*nullmap_columns[j]).get_data().data(); // make sure null cell is filled by 0x0 - key_columns[j]->assume_mutable()->replace_column_null_data(nullmap); + const_cast(key_columns[j])->replace_column_null_data(nullmap); } auto* __restrict current = result_data + offset; for (size_t i = 0; i < row_numbers; ++i) { diff --git a/be/src/exec/common/variant_util.cpp b/be/src/exec/common/variant_util.cpp index 39e8f236ecd16e..6008e3ac2bff51 100644 --- a/be/src/exec/common/variant_util.cpp +++ b/be/src/exec/common/variant_util.cpp @@ -355,7 +355,7 @@ Status cast_column(const ColumnWithTypeAndName& arg, const DataTypePtr& type, Co auto variant = ColumnVariant::create(data_type_object.variant_max_subcolumns_count(), data_type_object.enable_doc_mode()); - variant->create_root(arg.type, 
arg.column->assume_mutable()); + variant->create_root(arg.type, std::move(*arg.column).mutate()); ColumnPtr nullable = ColumnNullable::create( variant->get_ptr(), assert_cast(arg.column.get())->get_null_map_column_ptr()); @@ -2048,9 +2048,8 @@ void parse_json_to_variant_impl(IColumn& column, const char* src, size_t length, } } column_variant.incr_num_rows(); - auto sparse_column = column_variant.get_sparse_column(); - if (sparse_column->size() == old_num_rows) { - sparse_column->assume_mutable()->insert_default(); + if (column_variant.get_sparse_column()->size() == old_num_rows) { + column_variant.get_sparse_column_mutable().insert_default(); } #ifndef NDEBUG column_variant.check_consistency(); @@ -2147,10 +2146,15 @@ Status _parse_and_materialize_variant_columns(Block& block, for (size_t i = 0; i < variant_pos.size(); ++i) { auto column_ref = block.get_by_position(variant_pos[i]).column; bool is_nullable = column_ref->is_nullable(); - MutableColumnPtr var_column = column_ref->assume_mutable(); + MutableColumnPtr owner_column = std::move(*column_ref).mutate(); + ColumnPtr nullable_null_map; + MutableColumnPtr var_column; if (is_nullable) { - const auto& nullable = assert_cast(*column_ref); - var_column = nullable.get_nested_column_ptr()->assume_mutable(); + const auto& nullable = assert_cast(*owner_column); + nullable_null_map = nullable.get_null_map_column_ptr(); + var_column = std::move(*nullable.get_nested_column_ptr()).mutate(); + } else { + var_column = std::move(owner_column); } auto& var = assert_cast(*var_column); var_column->finalize(); @@ -2194,15 +2198,13 @@ Status _parse_and_materialize_variant_columns(Block& block, auto expected_root_type = make_nullable(std::make_shared()); var.ensure_root_node_type(expected_root_type); - variant_column = var.assume_mutable(); + variant_column = std::move(var_column); } // Wrap variant with nullmap if it is nullable ColumnPtr result = variant_column->get_ptr(); if (is_nullable) { - const auto& null_map = - 
assert_cast(*column_ref).get_null_map_column_ptr(); - result = ColumnNullable::create(result, null_map); + result = ColumnNullable::create(result, nullable_null_map); } block.get_by_position(variant_pos[i]).column = result; } diff --git a/be/src/exec/exchange/local_exchanger.cpp b/be/src/exec/exchange/local_exchanger.cpp index 620aae737050d6..a248940dc63c81 100644 --- a/be/src/exec/exchange/local_exchanger.cpp +++ b/be/src/exec/exchange/local_exchanger.cpp @@ -167,6 +167,7 @@ Status ShuffleExchanger::get_block(RuntimeState* state, Block* block, bool* eos, mutable_block = VectorizedUtils::build_mutable_mem_reuse_block( block, partitioned_block.first->_data_block); RETURN_IF_ERROR(get_data()); + block->set_columns(std::move(mutable_block.mutable_columns())); } return Status::OK(); } @@ -212,7 +213,7 @@ Status ShuffleExchanger::_split_rows(RuntimeState* state, const std::vectorsize() > 0); + DCHECK(shuffle_idx_to_instance_idx && !shuffle_idx_to_instance_idx->empty()); const auto& map = *shuffle_idx_to_instance_idx; int32_t enqueue_rows = 0; for (const auto& it : map) { @@ -425,6 +426,7 @@ Status BroadcastExchanger::get_block(RuntimeState* state, Block* block, bool* eo RETURN_IF_ERROR(mutable_block.add_rows(&block_wrapper->_data_block, partitioned_block.second.offset_start, partitioned_block.second.length)); + block->set_columns(std::move(mutable_block.mutable_columns())); } return Status::OK(); @@ -573,6 +575,9 @@ Status AdaptivePassthroughExchanger::get_block(RuntimeState* state, Block* block mutable_block = VectorizedUtils::build_mutable_mem_reuse_block( block, partitioned_block.first->_data_block); RETURN_IF_ERROR(get_data()); + if (mutable_block.rows() > 0) { + block->set_columns(std::move(mutable_block.mutable_columns())); + } } return Status::OK(); } diff --git a/be/src/exec/operator/aggregation_sink_operator.cpp b/be/src/exec/operator/aggregation_sink_operator.cpp index f6a9c2cdc4211d..0808361ad74f86 100644 --- 
a/be/src/exec/operator/aggregation_sink_operator.cpp +++ b/be/src/exec/operator/aggregation_sink_operator.cpp @@ -299,16 +299,20 @@ Status AggSinkLocalState::_merge_with_serialized_key_helper(Block* block) { for (int i = 0; i < key_size; ++i) { if constexpr (for_spill) { - key_columns[i] = block->get_by_position(i).column.get(); key_locs[i] = i; } else { int& result_column_id = key_locs[i]; RETURN_IF_ERROR( Base::_shared_state->probe_expr_ctxs[i]->execute(block, &result_column_id)); block->replace_by_position_if_const(result_column_id); - key_columns[i] = block->get_by_position(result_column_id).column.get(); } - key_columns[i]->assume_mutable()->replace_float_special_values(); + { + auto mutable_col = + IColumn::mutate(std::move(block->get_by_position(key_locs[i]).column)); + mutable_col->replace_float_special_values(); + block->get_by_position(key_locs[i]).column = std::move(mutable_col); + key_columns[i] = block->get_by_position(key_locs[i]).column.get(); + } } size_t rows = block->rows(); @@ -491,8 +495,13 @@ Status AggSinkLocalState::_execute_with_serialized_key_helper(Block* block) { block->get_by_position(result_column_id).column = block->get_by_position(result_column_id) .column->convert_to_full_column_if_const(); + { + auto mutable_col = + IColumn::mutate(std::move(block->get_by_position(result_column_id).column)); + mutable_col->replace_float_special_values(); + block->get_by_position(result_column_id).column = std::move(mutable_col); + } key_columns[i] = block->get_by_position(result_column_id).column.get(); - key_columns[i]->assume_mutable()->replace_float_special_values(); } } diff --git a/be/src/exec/operator/aggregation_source_operator.cpp b/be/src/exec/operator/aggregation_source_operator.cpp index d5385efdd06fe0..05e6a30c612249 100644 --- a/be/src/exec/operator/aggregation_source_operator.cpp +++ b/be/src/exec/operator/aggregation_source_operator.cpp @@ -113,7 +113,7 @@ Status AggLocalState::_get_results_with_serialized_key(RuntimeState* state, 
Bloc MutableColumns key_columns; for (int i = 0; i < key_size; ++i) { if (mem_reuse) { - key_columns.emplace_back(std::move(*block->get_by_position(i).column).mutate()); + key_columns.emplace_back(IColumn::mutate(std::move(block->get_by_position(i).column))); } else { key_columns.emplace_back( shared_state.probe_expr_ctxs[i]->root()->data_type()->create_column()); @@ -121,149 +121,156 @@ Status AggLocalState::_get_results_with_serialized_key(RuntimeState* state, Bloc } std::visit( - Overload { - [&](std::monostate& arg) -> void { - throw doris::Exception(ErrorCode::INTERNAL_ERROR, "uninited hash table"); - }, - [&](auto& agg_method) -> void { - agg_method.init_iterator(); - auto& data = *agg_method.hash_table; - const auto size = std::min(data.size(), size_t(state->batch_size())); - using KeyType = std::decay_t::Key; - std::vector keys(size); - - if (shared_state.use_simple_count) { - DCHECK_EQ(shared_state.aggregate_evaluators.size(), 1); - - value_data_types[0] = shared_state.aggregate_evaluators[0] - ->function() - ->get_serialized_type(); - if (mem_reuse) { - value_columns[0] = - std::move(*block->get_by_position(key_size).column) - .mutate(); - } else { - value_columns[0] = shared_state.aggregate_evaluators[0] - ->function() - ->create_serialize_column(); - } - - auto& count_col = - assert_cast(*value_columns[0]); - uint32_t num_rows = 0; - { - SCOPED_TIMER(_hash_table_iterate_timer); - auto& it = agg_method.begin; - while (it != agg_method.end && num_rows < state->batch_size()) { - keys[num_rows] = it.get_first(); - auto inline_count = - reinterpret_cast(it.get_second()); - count_col.insert_data( - reinterpret_cast(&inline_count), - sizeof(UInt64)); - ++it; - ++num_rows; - } - } - - { - SCOPED_TIMER(_insert_keys_to_column_timer); - agg_method.insert_keys_into_columns(keys, key_columns, num_rows); - } - - // Handle null key if present - if (agg_method.begin == agg_method.end) { - if (agg_method.hash_table->has_null_key_data()) { - DCHECK(key_columns.size() == 
1); - DCHECK(key_columns[0]->is_nullable()); - if (num_rows < state->batch_size()) { - key_columns[0]->insert_data(nullptr, 0); - auto mapped = - agg_method.hash_table->template get_null_key_data< - AggregateDataPtr>(); - count_col.resize(num_rows + 1); - *reinterpret_cast(count_col.get_data().data() + - num_rows * sizeof(UInt64)) = - std::bit_cast(mapped); - *eos = true; - } - } else { - *eos = true; - } - } - return; - } - - if (shared_state.values.size() < size + 1) { - shared_state.values.resize(size + 1); - } - - uint32_t num_rows = 0; - shared_state.aggregate_data_container->init_once(); - auto& iter = shared_state.aggregate_data_container->iterator; - - { - SCOPED_TIMER(_hash_table_iterate_timer); - while (iter != shared_state.aggregate_data_container->end() && - num_rows < state->batch_size()) { - keys[num_rows] = iter.template get_key(); - shared_state.values[num_rows] = iter.get_aggregate_data(); - ++iter; - ++num_rows; - } - } + Overload {[&](std::monostate& arg) -> void { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, "uninited hash table"); + }, + [&](auto& agg_method) -> void { + agg_method.init_iterator(); + auto& data = *agg_method.hash_table; + const auto size = std::min(data.size(), size_t(state->batch_size())); + using KeyType = std::decay_t::Key; + std::vector keys(size); + + if (shared_state.use_simple_count) { + DCHECK_EQ(shared_state.aggregate_evaluators.size(), 1); + + value_data_types[0] = shared_state.aggregate_evaluators[0] + ->function() + ->get_serialized_type(); + if (mem_reuse) { + value_columns[0] = IColumn::mutate( + std::move(block->get_by_position(key_size).column)); + } else { + value_columns[0] = shared_state.aggregate_evaluators[0] + ->function() + ->create_serialize_column(); + } - { - SCOPED_TIMER(_insert_keys_to_column_timer); - agg_method.insert_keys_into_columns(keys, key_columns, num_rows); - } + auto& count_col = + assert_cast(*value_columns[0]); + uint32_t num_rows = 0; + { + 
SCOPED_TIMER(_hash_table_iterate_timer); + auto& it = agg_method.begin; + while (it != agg_method.end && num_rows < state->batch_size()) { + keys[num_rows] = it.get_first(); + auto inline_count = + reinterpret_cast(it.get_second()); + count_col.insert_data( + reinterpret_cast(&inline_count), + sizeof(UInt64)); + ++it; + ++num_rows; + } + } - if (iter == shared_state.aggregate_data_container->end()) { - if (agg_method.hash_table->has_null_key_data()) { - // only one key of group by support wrap null key - // here need additional processing logic on the null key / value - DCHECK(key_columns.size() == 1); - DCHECK(key_columns[0]->is_nullable()); - if (agg_method.hash_table->has_null_key_data()) { - key_columns[0]->insert_data(nullptr, 0); - shared_state.values[num_rows] = - agg_method.hash_table->template get_null_key_data< - AggregateDataPtr>(); - ++num_rows; - *eos = true; - } - } else { - *eos = true; - } - } + { + SCOPED_TIMER(_insert_keys_to_column_timer); + agg_method.insert_keys_into_columns(keys, key_columns, num_rows); + } - { - SCOPED_TIMER(_insert_values_to_column_timer); - for (size_t i = 0; i < shared_state.aggregate_evaluators.size(); ++i) { - value_data_types[i] = shared_state.aggregate_evaluators[i] - ->function() - ->get_serialized_type(); - if (mem_reuse) { - value_columns[i] = - std::move(*block->get_by_position(i + key_size).column) - .mutate(); - } else { - value_columns[i] = shared_state.aggregate_evaluators[i] - ->function() - ->create_serialize_column(); - } - shared_state.aggregate_evaluators[i] - ->function() - ->serialize_to_column( - shared_state.values, - shared_state.offsets_of_aggregate_states[i], - value_columns[i], num_rows); - } - } - }}, + // Handle null key if present + if (agg_method.begin == agg_method.end) { + if (agg_method.hash_table->has_null_key_data()) { + DCHECK(key_columns.size() == 1); + DCHECK(key_columns[0]->is_nullable()); + if (num_rows < state->batch_size()) { + key_columns[0]->insert_data(nullptr, 0); + auto mapped 
= + agg_method.hash_table->template get_null_key_data< + AggregateDataPtr>(); + count_col.resize(num_rows + 1); + *reinterpret_cast(count_col.get_data().data() + + num_rows * sizeof(UInt64)) = + std::bit_cast(mapped); + *eos = true; + } + } else { + *eos = true; + } + } + return; + } + + if (shared_state.values.size() < size + 1) { + shared_state.values.resize(size + 1); + } + + uint32_t num_rows = 0; + shared_state.aggregate_data_container->init_once(); + auto& iter = shared_state.aggregate_data_container->iterator; + + { + SCOPED_TIMER(_hash_table_iterate_timer); + while (iter != shared_state.aggregate_data_container->end() && + num_rows < state->batch_size()) { + keys[num_rows] = iter.template get_key(); + shared_state.values[num_rows] = iter.get_aggregate_data(); + ++iter; + ++num_rows; + } + } + + { + SCOPED_TIMER(_insert_keys_to_column_timer); + agg_method.insert_keys_into_columns(keys, key_columns, num_rows); + } + + if (iter == shared_state.aggregate_data_container->end()) { + if (agg_method.hash_table->has_null_key_data()) { + // only one key of group by support wrap null key + // here need additional processing logic on the null key / value + DCHECK(key_columns.size() == 1); + DCHECK(key_columns[0]->is_nullable()); + if (agg_method.hash_table->has_null_key_data()) { + key_columns[0]->insert_data(nullptr, 0); + shared_state.values[num_rows] = + agg_method.hash_table->template get_null_key_data< + AggregateDataPtr>(); + ++num_rows; + *eos = true; + } + } else { + *eos = true; + } + } + + { + SCOPED_TIMER(_insert_values_to_column_timer); + for (size_t i = 0; i < shared_state.aggregate_evaluators.size(); + ++i) { + value_data_types[i] = shared_state.aggregate_evaluators[i] + ->function() + ->get_serialized_type(); + if (mem_reuse) { + value_columns[i] = IColumn::mutate(std::move( + block->get_by_position(i + key_size).column)); + } else { + value_columns[i] = shared_state.aggregate_evaluators[i] + ->function() + ->create_serialize_column(); + } + 
shared_state.aggregate_evaluators[i] + ->function() + ->serialize_to_column( + shared_state.values, + shared_state.offsets_of_aggregate_states[i], + value_columns[i], num_rows); + } + } + }}, shared_state.agg_data->method_variant); - if (!mem_reuse) { + if (mem_reuse) { + MutableColumns columns(block->columns()); + for (int i = 0; i < key_size; ++i) { + columns[i] = std::move(key_columns[i]); + } + for (int i = 0; i < agg_size; ++i) { + columns[key_size + i] = std::move(value_columns[i]); + } + block->set_columns(std::move(columns)); + } else { ColumnsWithTypeAndName columns_with_schema; for (int i = 0; i < key_size; ++i) { columns_with_schema.emplace_back(std::move(key_columns[i]), @@ -294,7 +301,7 @@ Status AggLocalState::_get_with_serialized_key_result(RuntimeState* state, Block if (!mem_reuse) { key_columns.emplace_back(columns_with_schema[i].type->create_column()); } else { - key_columns.emplace_back(std::move(*block->get_by_position(i).column).mutate()); + key_columns.emplace_back(IColumn::mutate(std::move(block->get_by_position(i).column))); } } MutableColumns value_columns; @@ -302,7 +309,8 @@ Status AggLocalState::_get_with_serialized_key_result(RuntimeState* state, Block if (!mem_reuse) { value_columns.emplace_back(columns_with_schema[i].type->create_column()); } else { - value_columns.emplace_back(std::move(*block->get_by_position(i).column).mutate()); + value_columns.emplace_back( + IColumn::mutate(std::move(block->get_by_position(i).column))); } } @@ -420,7 +428,17 @@ Status AggLocalState::_get_with_serialized_key_result(RuntimeState* state, Block }}, shared_state.agg_data->method_variant); - if (!mem_reuse) { + if (mem_reuse) { + MutableColumns columns(block->columns()); + for (int i = 0; i < block->columns(); ++i) { + if (i < key_size) { + columns[i] = std::move(key_columns[i]); + } else { + columns[i] = std::move(value_columns[i - key_size]); + } + } + block->set_columns(std::move(columns)); + } else { *block = columns_with_schema; MutableColumns 
columns(block->columns()); for (int i = 0; i < block->columns(); ++i) { diff --git a/be/src/exec/operator/bucketed_aggregation_sink_operator.cpp b/be/src/exec/operator/bucketed_aggregation_sink_operator.cpp index 58f47001185983..8cb58b2d532b95 100644 --- a/be/src/exec/operator/bucketed_aggregation_sink_operator.cpp +++ b/be/src/exec/operator/bucketed_aggregation_sink_operator.cpp @@ -175,8 +175,11 @@ Status BucketedAggSinkLocalState::_execute_with_serialized_key(Block* block) { block->get_by_position(result_column_id).column = block->get_by_position(result_column_id) .column->convert_to_full_column_if_const(); + auto mutable_column = + IColumn::mutate(std::move(block->get_by_position(result_column_id).column)); + mutable_column->replace_float_special_values(); + block->get_by_position(result_column_id).column = std::move(mutable_column); key_columns[i] = block->get_by_position(result_column_id).column.get(); - key_columns[i]->assume_mutable()->replace_float_special_values(); } } diff --git a/be/src/exec/operator/bucketed_aggregation_source_operator.cpp b/be/src/exec/operator/bucketed_aggregation_source_operator.cpp index 966b0acbc90a08..e1bd71089557fb 100644 --- a/be/src/exec/operator/bucketed_aggregation_source_operator.cpp +++ b/be/src/exec/operator/bucketed_aggregation_source_operator.cpp @@ -328,7 +328,8 @@ void BucketedAggLocalState::_build_output_block(Block* block, MutableColumns& ke MutableColumns value_columns; for (size_t i = key_size; i < columns_with_schema.size(); ++i) { if (mem_reuse) { - value_columns.emplace_back(std::move(*block->get_by_position(i).column).mutate()); + value_columns.emplace_back( + IColumn::mutate(std::move(block->get_by_position(i).column))); } else { value_columns.emplace_back(columns_with_schema[i].type->create_column()); } @@ -362,6 +363,15 @@ void BucketedAggLocalState::_build_output_block(Block* block, MutableColumns& ke columns_with_schema[key_size + i].type, ""); } *block = Block(result_columns); + } else { + MutableColumns 
columns(block->columns()); + for (size_t i = 0; i < key_size; ++i) { + columns[i] = std::move(key_columns[i]); + } + for (size_t i = 0; i < agg_size; ++i) { + columns[key_size + i] = std::move(value_columns[i]); + } + block->set_columns(std::move(columns)); } } else { // Serialize path. simple_count should always finalize. @@ -373,7 +383,8 @@ void BucketedAggLocalState::_build_output_block(Block* block, MutableColumns& ke value_data_types[i] = shared_state.aggregate_evaluators[i]->function()->get_serialized_type(); if (mem_reuse) { - value_columns[i] = std::move(*block->get_by_position(key_size + i).column).mutate(); + value_columns[i] = + IColumn::mutate(std::move(block->get_by_position(key_size + i).column)); } else { value_columns[i] = shared_state.aggregate_evaluators[i]->function()->create_serialize_column(); @@ -394,6 +405,15 @@ void BucketedAggLocalState::_build_output_block(Block* block, MutableColumns& ke result_columns.emplace_back(std::move(value_columns[i]), value_data_types[i], ""); } *block = Block(result_columns); + } else { + MutableColumns columns(block->columns()); + for (size_t i = 0; i < key_size; ++i) { + columns[i] = std::move(key_columns[i]); + } + for (size_t i = 0; i < agg_size; ++i) { + columns[key_size + i] = std::move(value_columns[i]); + } + block->set_columns(std::move(columns)); } } } @@ -452,8 +472,8 @@ Status BucketedAggLocalState::_output_bucket(RuntimeState* state, Block* block, MutableColumns key_columns; for (size_t i = 0; i < key_size; ++i) { if (mem_reuse) { - key_columns.emplace_back( - std::move(*block->get_by_position(i).column).mutate()); + key_columns.emplace_back(IColumn::mutate( + std::move(block->get_by_position(i).column))); } else { key_columns.emplace_back(shared_state.probe_expr_ctxs[i] ->root() @@ -535,8 +555,8 @@ Status BucketedAggLocalState::_merge_and_output_null_keys(RuntimeState* state, B MutableColumns key_columns; for (size_t i = 0; i < key_size; ++i) { if (mem_reuse) { - key_columns.emplace_back( - 
std::move(*block->get_by_position(i).column).mutate()); + key_columns.emplace_back(IColumn::mutate( + std::move(block->get_by_position(i).column))); } else { key_columns.emplace_back(shared_state.probe_expr_ctxs[i] ->root() diff --git a/be/src/exec/operator/cache_source_operator.cpp b/be/src/exec/operator/cache_source_operator.cpp index aec8206f54b682..06731ff8ed54c0 100644 --- a/be/src/exec/operator/cache_source_operator.cpp +++ b/be/src/exec/operator/cache_source_operator.cpp @@ -156,7 +156,9 @@ Status CacheSourceOperatorX::get_block(RuntimeState* state, Block* block, bool* if (need_clone_empty) { *block = output_block->clone_empty(); } - RETURN_IF_ERROR(MutableBlock::build_mutable_block(block).merge(*output_block)); + auto mutable_block = MutableBlock::build_mutable_block(block); + RETURN_IF_ERROR(mutable_block.merge(*output_block)); + block->set_columns(std::move(mutable_block.mutable_columns())); local_state._current_query_cache_rows += output_block->rows(); auto mem_consume = output_block->allocated_bytes(); local_state._current_query_cache_bytes += mem_consume; @@ -179,7 +181,9 @@ Status CacheSourceOperatorX::get_block(RuntimeState* state, Block* block, bool* if (need_clone_empty) { *block = hit_cache_block->clone_empty(); } - RETURN_IF_ERROR(MutableBlock::build_mutable_block(block).merge(*hit_cache_block)); + auto mutable_block = MutableBlock::build_mutable_block(block); + RETURN_IF_ERROR(mutable_block.merge(*hit_cache_block)); + block->set_columns(std::move(mutable_block.mutable_columns())); if (!local_state._hit_cache_column_orders.empty()) { auto datas = block->get_columns_with_type_and_name(); block->clear(); diff --git a/be/src/exec/operator/distinct_streaming_aggregation_operator.cpp b/be/src/exec/operator/distinct_streaming_aggregation_operator.cpp index 298896401d6f3e..92c11cf2896154 100644 --- a/be/src/exec/operator/distinct_streaming_aggregation_operator.cpp +++ b/be/src/exec/operator/distinct_streaming_aggregation_operator.cpp @@ -162,7 +162,13 
@@ Status DistinctStreamingAggLocalState::_distinct_pre_agg_with_serialized_key( in_block->get_by_position(result_column_id) .column->convert_to_full_column_if_const(); key_columns[i] = in_block->get_by_position(result_column_id).column.get(); - key_columns[i]->assume_mutable()->replace_float_special_values(); + { + auto mutable_col = IColumn::mutate( + std::move(in_block->get_by_position(result_column_id).column)); + mutable_col->replace_float_special_values(); + in_block->get_by_position(result_column_id).column = std::move(mutable_col); + key_columns[i] = in_block->get_by_position(result_column_id).column.get(); + } result_idxs[i] = result_column_id; } } @@ -210,18 +216,22 @@ Status DistinctStreamingAggLocalState::_distinct_pre_agg_with_serialized_key( if (out_block->rows() + _distinct_row.size() > batch_size) { size_t split_size = batch_size - out_block->rows(); for (int i = 0; i < key_size; ++i) { - auto output_dst = out_block->get_by_position(i).column->assume_mutable(); + auto output_dst = + IColumn::mutate(std::move(out_block->get_by_position(i).column)); key_columns[i]->append_data_by_selector(output_dst, _distinct_row, 0, split_size); - auto cache_dst = _cache_block.get_by_position(i).column->assume_mutable(); + out_block->get_by_position(i).column = std::move(output_dst); + auto cache_dst = + IColumn::mutate(std::move(_cache_block.get_by_position(i).column)); key_columns[i]->append_data_by_selector(cache_dst, _distinct_row, split_size, _distinct_row.size()); + _cache_block.get_by_position(i).column = std::move(cache_dst); } } else { for (int i = 0; i < key_size; ++i) { - auto output_column = out_block->get_by_position(i).column; - auto dst = output_column->assume_mutable(); + auto dst = IColumn::mutate(std::move(out_block->get_by_position(i).column)); key_columns[i]->append_data_by_selector(dst, _distinct_row); + out_block->get_by_position(i).column = std::move(dst); } } } diff --git a/be/src/exec/operator/hashjoin_build_sink.cpp 
b/be/src/exec/operator/hashjoin_build_sink.cpp index 4c5815c71ab691..4a35b07b8e7222 100644 --- a/be/src/exec/operator/hashjoin_build_sink.cpp +++ b/be/src/exec/operator/hashjoin_build_sink.cpp @@ -576,7 +576,9 @@ Status HashJoinBuildSinkLocalState::process_build_block(RuntimeState* state, Blo for (auto& data : block) { data.column = std::move(*data.column).mutate()->convert_column_if_overflow(); if (p._need_finalize_variant_column) { - std::move(*data.column).mutate()->finalize(); + auto mutable_column = IColumn::mutate(std::move(data.column)); + mutable_column->finalize(); + data.column = std::move(mutable_column); } } diff --git a/be/src/exec/operator/hashjoin_build_sink.h b/be/src/exec/operator/hashjoin_build_sink.h index be77ef6cc690bc..3c3faabcdb534e 100644 --- a/be/src/exec/operator/hashjoin_build_sink.h +++ b/be/src/exec/operator/hashjoin_build_sink.h @@ -231,7 +231,7 @@ struct ProcessHashTableBuild { // In order to make the null keys equal when using single null eq, all null keys need to be set to default value. 
if (_build_raw_ptrs.size() == 1 && null_map && *has_null_key) { - _build_raw_ptrs[0]->assume_mutable()->replace_column_null_data(null_map->data()); + const_cast(_build_raw_ptrs[0])->replace_column_null_data(null_map->data()); } hash_table_ctx.init_serialized_keys(_build_raw_ptrs, _rows, diff --git a/be/src/exec/operator/join/process_hash_table_probe_impl.h b/be/src/exec/operator/join/process_hash_table_probe_impl.h index 5bfd2ff4e0cbfc..bcc4408906bf54 100644 --- a/be/src/exec/operator/join/process_hash_table_probe_impl.h +++ b/be/src/exec/operator/join/process_hash_table_probe_impl.h @@ -164,7 +164,10 @@ void ProcessHashTableProbe::probe_side_output_column(MutableColumns& for (int i = 0; i < _left_output_slot_flags.size(); ++i) { if (_left_output_slot_flags[i]) { if (_parent_operator->need_finalize_variant_column()) { - std::move(*probe_block.get_by_position(i).column).mutate()->finalize(); + auto mutable_column = + IColumn::mutate(std::move(probe_block.get_by_position(i).column)); + mutable_column->finalize(); + probe_block.get_by_position(i).column = std::move(mutable_column); } } @@ -200,7 +203,8 @@ typename HashTableType::State ProcessHashTableProbe::_init_probe_sid // In order to make the null keys equal when using single null eq, all null keys need to be set to default value. 
if (_parent->_probe_columns.size() == 1 && null_map) { if (simd::contain_one(null_map, probe_rows)) { - _parent->_probe_columns[0]->assume_mutable()->replace_column_null_data(null_map); + const_cast(_parent->_probe_columns[0]) + ->replace_column_null_data(null_map); } } @@ -650,9 +654,11 @@ Status ProcessHashTableProbe::finalize_block_with_filter(Block* outp ->get_data_column_ptr(); auto& src = source_block->get_by_position(column_id).column; - auto dst = output_block->get_by_position(output_column_id).column->assume_mutable(); + auto dst = IColumn::mutate( + std::move(output_block->get_by_position(output_column_id).column)); dst->clear(); insert_with_indexs(dst, src, container, all_match_one); + output_block->get_by_position(output_column_id).column = std::move(dst); } }; do_lazy_materialize(_right_output_slot_flags, _build_indexs, (int)_right_col_idx, @@ -717,14 +723,17 @@ Status ProcessHashTableProbe::do_mark_join_conjuncts(Block* output_b return Status::OK(); } - auto mark_column_mutable = - output_block->get_by_position(_parent->_mark_column_id).column->assume_mutable(); - auto& mark_column = assert_cast(*mark_column_mutable); - IColumn::Filter& filter = assert_cast(mark_column.get_nested_column()).get_data(); + auto mark_column_mutable = IColumn::mutate( + std::move(output_block->get_by_position(_parent->_mark_column_id).column)); + auto* mark_column = assert_cast(mark_column_mutable.get()); + IColumn::Filter& filter = + assert_cast(mark_column->get_nested_column()).get_data(); + auto& null_map_column = mark_column->get_null_map_column(); + output_block->replace_by_position(_parent->_mark_column_id, std::move(mark_column_mutable)); RETURN_IF_ERROR(VExprContext::execute_conjuncts(_parent->_mark_join_conjuncts, output_block, - mark_column.get_null_map_column(), filter)); + null_map_column, filter)); uint8_t* mark_filter_data = filter.data(); - uint8_t* mark_null_map = mark_column.get_null_map_data().data(); + uint8_t* mark_null_map = 
mark_column->get_null_map_data().data(); if (is_null_aware_join) { // For null aware anti/semi join, if the equal conjuncts was not matched and the build side has null value, diff --git a/be/src/exec/operator/nested_loop_join_probe_operator.cpp b/be/src/exec/operator/nested_loop_join_probe_operator.cpp index ccc7140c726d07..7fdb3f61bd63c7 100644 --- a/be/src/exec/operator/nested_loop_join_probe_operator.cpp +++ b/be/src/exec/operator/nested_loop_join_probe_operator.cpp @@ -139,10 +139,12 @@ Status NestedLoopJoinProbeLocalState::close(RuntimeState* state) { void NestedLoopJoinProbeLocalState::_update_additional_flags(Block* block) { auto& p = _parent->cast(); if (p._is_mark_join) { - auto mark_column = block->get_by_position(block->columns() - 1).column->assume_mutable(); + auto mark_column = + IColumn::mutate(std::move(block->get_by_position(block->columns() - 1).column)); if (mark_column->size() < block->rows()) { ColumnFilterHelper(*mark_column).resize_fill(block->rows(), 1); } + block->replace_by_position(block->columns() - 1, std::move(mark_column)); } } diff --git a/be/src/exec/operator/operator.cpp b/be/src/exec/operator/operator.cpp index 96f7933d5d5274..f1a4734bcedb4d 100644 --- a/be/src/exec/operator/operator.cpp +++ b/be/src/exec/operator/operator.cpp @@ -362,8 +362,8 @@ Status OperatorXBase::do_projections(RuntimeState* state, Block* origin_block, MutableBlock mutable_block = VectorizedUtils::build_mutable_mem_reuse_block(output_block, *_output_row_descriptor); + auto& mutable_columns = mutable_block.mutable_columns(); if (rows != 0) { - auto& mutable_columns = mutable_block.mutable_columns(); DCHECK_EQ(mutable_columns.size(), local_state->_projections.size()) << debug_string(); for (int i = 0; i < mutable_columns.size(); ++i) { ColumnPtr column_ptr; @@ -379,8 +379,8 @@ Status OperatorXBase::do_projections(RuntimeState* state, Block* origin_block, insert_column_datas(mutable_columns[i], column_ptr, rows); } DCHECK(mutable_block.rows() == rows); - 
output_block->set_columns(std::move(mutable_columns)); } + output_block->set_columns(std::move(mutable_columns)); local_state->_estimate_memory_usage += bytes_usage; diff --git a/be/src/exec/operator/repeat_operator.cpp b/be/src/exec/operator/repeat_operator.cpp index 82ffa633056a41..b0aa6989a35f34 100644 --- a/be/src/exec/operator/repeat_operator.cpp +++ b/be/src/exec/operator/repeat_operator.cpp @@ -154,6 +154,7 @@ Status RepeatLocalState::get_repeated_block(Block* input_block, int repeat_id_id RETURN_IF_ERROR(add_grouping_id_column(rows, cur_col, output_columns, repeat_id_idx)); DCHECK_EQ(cur_col, output_column_size); + output_block->set_columns(std::move(m_block.mutable_columns())); return Status::OK(); } @@ -237,6 +238,7 @@ Status RepeatOperatorX::pull(doris::RuntimeState* state, Block* output_block, bo std::size_t cur_col = 0; RETURN_IF_ERROR( local_state.add_grouping_id_column(rows, cur_col, columns, _repeat_id_idx)); + output_block->set_columns(std::move(m_block.mutable_columns())); _repeat_id_idx++; if (_repeat_id_idx >= _repeat_id_list_size) { diff --git a/be/src/exec/operator/schema_scan_operator.cpp b/be/src/exec/operator/schema_scan_operator.cpp index 030e49b54d48c0..3d5922573b90e4 100644 --- a/be/src/exec/operator/schema_scan_operator.cpp +++ b/be/src/exec/operator/schema_scan_operator.cpp @@ -21,6 +21,7 @@ #include +#include "core/column/column_nullable.h" #include "core/data_type/data_type_factory.hpp" #include "exec/operator/operator.h" #include "runtime/runtime_profile.h" @@ -256,10 +257,16 @@ Status SchemaScanOperatorX::get_block(RuntimeState* state, Block* block, bool* e if (src_block.rows()) { // block->check_number_of_rows(); for (int i = 0; i < _slot_num; ++i) { - MutableColumnPtr column_ptr = std::move(*block->get_by_position(i).column).mutate(); - column_ptr->insert_range_from( - *src_block.safe_get_by_position(_slot_offsets[i]).column, 0, - src_block.rows()); + MutableColumnPtr column_ptr = + 
IColumn::mutate(std::move(block->get_by_position(i).column)); + ColumnPtr src_column = src_block.safe_get_by_position(_slot_offsets[i]) + .column->convert_to_full_column_if_const(); + if (column_ptr->is_nullable() && !src_column->is_nullable()) { + src_column = make_nullable(src_column); + } + DORIS_CHECK(column_ptr->is_nullable() == src_column->is_nullable()); + column_ptr->insert_range_from(*src_column, 0, src_block.rows()); + block->replace_by_position(i, std::move(column_ptr)); } DCHECK_EQ(block->columns(), _dest_tuple_desc->slots().size()); RETURN_IF_ERROR(local_state.filter_block(local_state._conjuncts, block)); diff --git a/be/src/exec/operator/set_source_operator.cpp b/be/src/exec/operator/set_source_operator.cpp index a314f411311069..5cc299e7dbb485 100644 --- a/be/src/exec/operator/set_source_operator.cpp +++ b/be/src/exec/operator/set_source_operator.cpp @@ -114,7 +114,7 @@ void SetSourceOperatorX::_create_mutable_cols( for (int i = 0; i < local_state._left_table_data_types.size(); ++i) { if (mem_reuse) { local_state._mutable_cols[i] = - std::move(*output_block->get_by_position(i).column).mutate(); + IColumn::mutate(std::move(output_block->get_by_position(i).column)); } else { local_state._mutable_cols[i] = (local_state._left_table_data_types[i]->create_column()); } @@ -173,6 +173,9 @@ Status SetSourceOperatorX::_get_data_in_hashtable( local_state._left_table_data_types[i], "")); } } else { + for (int i = 0; i < left_col_len; ++i) { + output_block->replace_by_position(i, std::move(local_state._mutable_cols[i])); + } local_state._mutable_cols.clear(); } diff --git a/be/src/exec/operator/streaming_aggregation_operator.cpp b/be/src/exec/operator/streaming_aggregation_operator.cpp index 5744b288a4487e..49eba560e4bb19 100644 --- a/be/src/exec/operator/streaming_aggregation_operator.cpp +++ b/be/src/exec/operator/streaming_aggregation_operator.cpp @@ -330,8 +330,11 @@ Status StreamingAggLocalState::_pre_agg_with_serialized_key(doris::Block* in_blo 
in_block->get_by_position(result_column_id).column = in_block->get_by_position(result_column_id) .column->convert_to_full_column_if_const(); + auto mutable_column = + IColumn::mutate(std::move(in_block->get_by_position(result_column_id).column)); + mutable_column->replace_float_special_values(); + in_block->get_by_position(result_column_id).column = std::move(mutable_column); key_columns[i] = in_block->get_by_position(result_column_id).column.get(); - key_columns[i]->assume_mutable()->replace_float_special_values(); } } @@ -370,8 +373,8 @@ Status StreamingAggLocalState::_pre_agg_with_serialized_key(doris::Block* in_blo for (int i = 0; i < _aggregate_evaluators.size(); ++i) { auto data_type = _aggregate_evaluators[i]->function()->get_serialized_type(); if (mem_reuse) { - value_columns.emplace_back( - std::move(*out_block->get_by_position(i + key_size).column).mutate()); + value_columns.emplace_back(IColumn::mutate( + std::move(out_block->get_by_position(i + key_size).column))); } else { value_columns.emplace_back( _aggregate_evaluators[i]->function()->create_serialize_column()); @@ -397,11 +400,15 @@ Status StreamingAggLocalState::_pre_agg_with_serialized_key(doris::Block* in_blo } out_block->swap(Block(columns_with_schema)); } else { + MutableColumns columns(out_block->columns()); for (int i = 0; i < key_size; ++i) { - std::move(*out_block->get_by_position(i).column) - .mutate() - ->insert_range_from(*key_columns[i], 0, rows); + columns[i] = IColumn::mutate(std::move(out_block->get_by_position(i).column)); + columns[i]->insert_range_from(*key_columns[i], 0, rows); + } + for (int i = 0; i < value_columns.size(); ++i) { + columns[key_size + i] = std::move(value_columns[i]); } + out_block->set_columns(std::move(columns)); } } else { bool need_agg = true; @@ -462,7 +469,7 @@ Status StreamingAggLocalState::_get_results_with_serialized_key(RuntimeState* st MutableColumns key_columns; for (int i = 0; i < key_size; ++i) { if (mem_reuse) { - 
key_columns.emplace_back(std::move(*block->get_by_position(i).column).mutate()); + key_columns.emplace_back(IColumn::mutate(std::move(block->get_by_position(i).column))); } else { key_columns.emplace_back(_probe_expr_ctxs[i]->root()->data_type()->create_column()); } @@ -486,9 +493,8 @@ Status StreamingAggLocalState::_get_results_with_serialized_key(RuntimeState* st value_data_types[0] = _aggregate_evaluators[0]->function()->get_serialized_type(); if (mem_reuse) { - value_columns[0] = - std::move(*block->get_by_position(key_size).column) - .mutate(); + value_columns[0] = IColumn::mutate( + std::move(block->get_by_position(key_size).column)); } else { value_columns[0] = _aggregate_evaluators[0] ->function() @@ -590,9 +596,8 @@ Status StreamingAggLocalState::_get_results_with_serialized_key(RuntimeState* st value_data_types[i] = _aggregate_evaluators[i]->function()->get_serialized_type(); if (mem_reuse) { - value_columns[i] = - std::move(*block->get_by_position(i + key_size).column) - .mutate(); + value_columns[i] = IColumn::mutate( + std::move(block->get_by_position(i + key_size).column)); } else { value_columns[i] = _aggregate_evaluators[i] ->function() @@ -606,7 +611,16 @@ Status StreamingAggLocalState::_get_results_with_serialized_key(RuntimeState* st }}, _agg_data->method_variant); - if (!mem_reuse) { + if (mem_reuse) { + MutableColumns columns(block->columns()); + for (int i = 0; i < key_size; ++i) { + columns[i] = std::move(key_columns[i]); + } + for (int i = 0; i < agg_size; ++i) { + columns[key_size + i] = std::move(value_columns[i]); + } + block->set_columns(std::move(columns)); + } else { ColumnsWithTypeAndName columns_with_schema; for (int i = 0; i < key_size; ++i) { columns_with_schema.emplace_back(std::move(key_columns[i]), diff --git a/be/src/exec/operator/table_function_operator.cpp b/be/src/exec/operator/table_function_operator.cpp index 09e74f580dd1c2..fd97e8d69c68a8 100644 --- a/be/src/exec/operator/table_function_operator.cpp +++ 
b/be/src/exec/operator/table_function_operator.cpp @@ -560,6 +560,7 @@ Status TableFunctionLocalState::get_expanded_block(RuntimeState* state, Block* o for (auto index : p._useless_slot_indexs) { columns[index]->insert_many_defaults(row_size - columns[index]->size()); } + output_block->set_columns(std::move(columns)); { SCOPED_TIMER(_filter_timer); // 3. eval conjuncts diff --git a/be/src/exec/operator/union_sink_operator.h b/be/src/exec/operator/union_sink_operator.h index 4842ab6b243903..bdfb4a7303126e 100644 --- a/be/src/exec/operator/union_sink_operator.h +++ b/be/src/exec/operator/union_sink_operator.h @@ -168,9 +168,10 @@ class UnionSinkOperatorX MOCK_REMOVE(final) : public DataSinkOperatorXset_columns(std::move(mblock.mutable_columns())); } return Status::OK(); } }; -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/exec/operator/union_source_operator.cpp b/be/src/exec/operator/union_source_operator.cpp index a484f1e4a324ba..9547c9a8184bcf 100644 --- a/be/src/exec/operator/union_source_operator.cpp +++ b/be/src/exec/operator/union_source_operator.cpp @@ -177,6 +177,9 @@ Status UnionSourceOperatorX::get_next_const(RuntimeState* state, Block* block) { tmp_block.clear(); } } + if (mblock.rows() > 0) { + block->set_columns(std::move(mblock.mutable_columns())); + } // some insert query like "insert into string_test select 1, repeat('a', 1024 * 1024);" // the const expr will be in output expr cause the union node return a empty block. 
so here we diff --git a/be/src/exec/rowid_fetcher.cpp b/be/src/exec/rowid_fetcher.cpp index 27c66197541f5e..7241520013c3a7 100644 --- a/be/src/exec/rowid_fetcher.cpp +++ b/be/src/exec/rowid_fetcher.cpp @@ -164,10 +164,12 @@ Status RowIDFetcher::_merge_rpc_results(const PMultiGetRequest& request, default_values[i] = _fetch_option.desc->slots()[i]->col_default_value(); } } + MutableColumns output_columns = output_block->mutate_columns(); + Defer restore_columns([&]() { output_block->set_columns(std::move(output_columns)); }); for (int i = 0; i < resp.binary_row_data_size(); ++i) { - RETURN_IF_ERROR(JsonbSerializeUtil::jsonb_to_block( + RETURN_IF_ERROR(JsonbSerializeUtil::jsonb_to_columns( serdes, resp.binary_row_data(i).data(), resp.binary_row_data(i).size(), - col_uid_to_idx, *output_block, default_values, {})); + col_uid_to_idx, output_columns, default_values, {})); } return Status::OK(); } @@ -190,11 +192,11 @@ Status RowIDFetcher::_merge_rpc_results(const PMultiGetRequest& request, partial_block.dump_types()); } else { for (int i = 0; i < output_block->columns(); ++i) { - output_block->get_by_position(i).column->assume_mutable()->insert_range_from( - *partial_block.get_by_position(i) - .column->convert_to_full_column_if_const() - .get(), + auto column = IColumn::mutate(std::move(output_block->get_by_position(i).column)); + column->insert_range_from( + *partial_block.get_by_position(i).column->convert_to_full_column_if_const(), 0, partial_block.rows()); + output_block->replace_by_position(i, std::move(column)); } } return Status::OK(); @@ -1122,6 +1124,8 @@ Status RowIdStorageReader::read_doris_format_row( return Status::InternalError("Tablet {} does not have row store for all columns", tablet->tablet_id()); } + MutableColumns result_columns = result_block.mutate_columns(); + Defer restore_columns([&]() { result_block.set_columns(std::move(result_columns)); }); for (auto row_id : row_ids) { RowLocation loc(rowset_id, segment->id(), cast_set(row_id)); 
row_store_read_struct.row_store_buffer.clear(); @@ -1132,15 +1136,16 @@ Status RowIdStorageReader::read_doris_format_row( }, lookup_row_data_ms)); - RETURN_IF_ERROR(JsonbSerializeUtil::jsonb_to_block( + RETURN_IF_ERROR(JsonbSerializeUtil::jsonb_to_columns( row_store_read_struct.serdes, row_store_read_struct.row_store_buffer.data(), row_store_read_struct.row_store_buffer.size(), - row_store_read_struct.col_uid_to_idx, result_block, + row_store_read_struct.col_uid_to_idx, result_columns, row_store_read_struct.default_values, {})); } } else { for (int x = 0; x < slots.size(); ++x) { - MutableColumnPtr column = result_block.get_by_position(x).column->assume_mutable(); + auto column_guard = result_block.mutate_column_scoped(x); + MutableColumnPtr& column = column_guard.mutable_column(); IteratorKey iterator_key {.tablet_id = tablet_id, .rowset_id = rowset_id, .segment_id = segment_id, @@ -1151,9 +1156,11 @@ Status RowIdStorageReader::read_doris_format_row( iterator_item.storage_read_options.stats = &stats; iterator_item.storage_read_options.io_ctx.reader_type = ReaderType::READER_QUERY; } - RETURN_IF_ERROR(segment->seek_and_read_by_rowid( - full_read_schema, &slots[x], row_ids, column, - iterator_item.storage_read_options, iterator_item.iterator)); + for (auto row_id : row_ids) { + RETURN_IF_ERROR(segment->seek_and_read_by_rowid( + full_read_schema, &slots[x], row_id, column, + iterator_item.storage_read_options, iterator_item.iterator)); + } } } return Status::OK(); diff --git a/be/src/exec/scan/file_scanner.cpp b/be/src/exec/scan/file_scanner.cpp index 5f1d248c1e1f4d..f621050e337d8f 100644 --- a/be/src/exec/scan/file_scanner.cpp +++ b/be/src/exec/scan/file_scanner.cpp @@ -438,8 +438,10 @@ Status FileScanner::_process_runtime_filters_partition_prune(bool& can_filter_al if (!first_column_filled) { // VExprContext.execute has an optimization, the filtering is executed when block->rows() > 0 // The following process may be tricky and time-consuming, but we have no other 
way. - _runtime_filter_partition_prune_block.get_by_position(0).column->assume_mutable()->resize( - partition_value_column_size); + auto column = IColumn::mutate( + std::move(_runtime_filter_partition_prune_block.get_by_position(0).column)); + column->resize(partition_value_column_size); + _runtime_filter_partition_prune_block.replace_by_position(0, std::move(column)); } IColumn::Filter result_filter(_runtime_filter_partition_prune_block.rows(), 1); RETURN_IF_ERROR(VExprContext::execute_conjuncts(_runtime_filter_partition_prune_ctxs, nullptr, @@ -778,11 +780,11 @@ Status FileScanner::_convert_to_output_block(Block* block) { auto& mutable_output_columns = mutable_output_block.mutable_columns(); std::vector* skip_bitmaps {nullptr}; + MutableColumnPtr skip_bitmap_column; if (_should_process_skip_bitmap_col()) { - auto* skip_bitmap_nullable_col_ptr = - assert_cast(_src_block_ptr->get_by_position(_skip_bitmap_col_idx) - .column->assume_mutable() - .get()); + skip_bitmap_column = IColumn::mutate( + std::move(_src_block_ptr->get_by_position(_skip_bitmap_col_idx).column)); + auto* skip_bitmap_nullable_col_ptr = assert_cast(skip_bitmap_column.get()); skip_bitmaps = &(assert_cast( skip_bitmap_nullable_col_ptr->get_nested_column_ptr().get()) ->get_data()); @@ -799,6 +801,7 @@ Status FileScanner::_convert_to_output_block(Block* block) { } } } + _src_block_ptr->replace_by_position(_skip_bitmap_col_idx, std::move(skip_bitmap_column)); } // for (auto slot_desc : _output_tuple_desc->slots()) { @@ -865,6 +868,7 @@ Status FileScanner::_convert_to_output_block(Block* block) { mutable_output_columns[j]->insert_range_from(*column_ptr, 0, rows); ctx_idx++; } + block->set_columns(std::move(mutable_output_columns)); // after do the dest block insert operation, clear _src_block to remove the reference of origin column _src_block_ptr->clear(); diff --git a/be/src/exec/scan/meta_scanner.cpp b/be/src/exec/scan/meta_scanner.cpp index adf1aabe4b8903..52892882f7bcbb 100644 --- 
a/be/src/exec/scan/meta_scanner.cpp +++ b/be/src/exec/scan/meta_scanner.cpp @@ -112,21 +112,14 @@ Status MetaScanner::_get_block_impl(RuntimeState* state, Block* block, bool* eof columns.resize(column_size); for (auto i = 0; i < column_size; i++) { if (mem_reuse) { - columns[i] = block->get_by_position(i).column->assume_mutable(); + columns[i] = IColumn::mutate(std::move(block->get_by_position(i).column)); } else { columns[i] = _tuple_desc->slots()[i]->get_empty_mutable_column(); } } // fill block RETURN_IF_ERROR(_fill_block_with_remote_data(columns)); - if (_meta_eos == true) { - if (block->rows() == 0) { - *eof = true; - } - break; - } - // Before really use the Block, must clear other ptr of column in block - // So here need do std::move and clear in `columns` + const bool empty_result = columns.empty() || columns.front()->empty(); if (!mem_reuse) { int column_index = 0; for (const auto slot_desc : _tuple_desc->slots()) { @@ -135,7 +128,13 @@ Status MetaScanner::_get_block_impl(RuntimeState* state, Block* block, bool* eof slot_desc->col_name())); } } else { - columns.clear(); + block->set_columns(std::move(columns)); + } + if (_meta_eos == true) { + if (empty_result) { + *eof = true; + } + break; } VLOG_ROW << "VMetaScanNode output rows: " << block->rows(); } while (block->rows() == 0 && !(*eof)); diff --git a/be/src/exec/scan/scanner.cpp b/be/src/exec/scan/scanner.cpp index ae1ed96e000768..5c9edc4f23dcab 100644 --- a/be/src/exec/scan/scanner.cpp +++ b/be/src/exec/scan/scanner.cpp @@ -239,7 +239,7 @@ Status Scanner::_do_projections(Block* origin_block, Block* output_block) { if (mutable_columns[i]->is_nullable() != column_ptr->is_nullable()) { throw Exception(ErrorCode::INTERNAL_ERROR, "Nullable mismatch"); } - mutable_columns[i] = column_ptr->assume_mutable(); + mutable_columns[i] = IColumn::mutate(std::move(column_ptr)); } output_block->set_columns(std::move(mutable_columns)); diff --git a/be/src/exec/scan/scanner.h b/be/src/exec/scan/scanner.h index 
c14f6ee2048a7b..b7ff196a265097 100644 --- a/be/src/exec/scan/scanner.h +++ b/be/src/exec/scan/scanner.h @@ -115,8 +115,9 @@ class Scanner { if (_padding_block.empty()) { _padding_block.swap(_origin_block); } else if (_origin_block.rows()) { - RETURN_IF_ERROR( - MutableBlock::build_mutable_block(&_padding_block).merge(_origin_block)); + auto mutable_block = MutableBlock::build_mutable_block(&_padding_block); + RETURN_IF_ERROR(mutable_block.merge(_origin_block)); + _padding_block.set_columns(std::move(mutable_block.mutable_columns())); } return Status::OK(); } diff --git a/be/src/exec/sink/vtablet_block_convertor.cpp b/be/src/exec/sink/vtablet_block_convertor.cpp index b567b599cfa3bf..e59fa923375998 100644 --- a/be/src/exec/sink/vtablet_block_convertor.cpp +++ b/be/src/exec/sink/vtablet_block_convertor.cpp @@ -238,8 +238,8 @@ Status OlapTableBlockConvertor::_internal_validate_column(RuntimeState* state, B } } - const auto* tmp_column_ptr = check_and_get_column(*orig_column); - const auto& tmp_real_column_ptr = + auto tmp_column_ptr = check_and_get_column(*orig_column); + auto tmp_real_column_ptr = tmp_column_ptr == nullptr ? orig_column : (tmp_column_ptr->get_nested_column_ptr()); const auto* column_string = assert_cast(tmp_real_column_ptr.get()); const auto* null_map = @@ -281,13 +281,22 @@ Status OlapTableBlockConvertor::_internal_validate_column(RuntimeState* state, B {len_column, len_type, "len"}, {nullptr, input_type, "result"}}); RETURN_IF_ERROR(func->execute(nullptr, tmp_block, {0, 1, 2}, 3, row_count)); - column_string = - assert_cast(tmp_block.get_by_position(3).column.get()); - orig_column = - orig_column->is_nullable() - ? 
ColumnNullable::create(tmp_block.get_by_position(3).column, - tmp_column_ptr->get_null_map_column_ptr()) - : std::move(tmp_block.get_by_position(3).column); + auto result_column = + IColumn::mutate(std::move(tmp_block.get_by_position(3).column)); + if (orig_column->is_nullable()) { + orig_column = ColumnNullable::create( + std::move(result_column), + IColumn::mutate(tmp_column_ptr->get_null_map_column_ptr())); + } else { + orig_column = std::move(result_column); + } + tmp_column_ptr = check_and_get_column(*orig_column); + tmp_real_column_ptr = tmp_column_ptr == nullptr + ? orig_column + : tmp_column_ptr->get_nested_column_ptr(); + column_string = assert_cast(tmp_real_column_ptr.get()); + null_map = tmp_column_ptr == nullptr ? nullptr + : tmp_column_ptr->get_null_map_data().data(); } for (size_t j = 0; j < row_count; ++j) { auto row = rows ? (*rows)[j] : j; diff --git a/be/src/exec/sort/partition_sorter.cpp b/be/src/exec/sort/partition_sorter.cpp index 64422a202c236f..89be3b90dc6fb1 100644 --- a/be/src/exec/sort/partition_sorter.cpp +++ b/be/src/exec/sort/partition_sorter.cpp @@ -121,6 +121,7 @@ Status PartitionSorter::_read_row_num(Block* output_block, bool* eos, int batch_ if (current->impl->is_last(step) && current->impl->pos == 0) { if (merged_rows != 0) { // return directly for next time's read swap whole block + output_block->set_columns(std::move(merged_columns)); return Status::OK(); } // swap and return block directly when we should get all data from cursor @@ -147,6 +148,7 @@ Status PartitionSorter::_read_row_num(Block* output_block, bool* eos, int batch_ } } + output_block->set_columns(std::move(merged_columns)); return Status::OK(); } @@ -178,6 +180,7 @@ Status PartitionSorter::_read_row_rank(Block* output_block, bool* eos, int batch // rank() maybe need check when have get a distinct row // so when the cmp_res is get a distinct row, need check have output all rows num if (_get_enough_data()) { + output_block->set_columns(std::move(merged_columns)); 
return Status::OK(); } *_previous_row = *current; @@ -196,6 +199,7 @@ Status PartitionSorter::_read_row_rank(Block* output_block, bool* eos, int batch } } + output_block->set_columns(std::move(merged_columns)); return Status::OK(); } diff --git a/be/src/exec/sort/vsorted_run_merger.cpp b/be/src/exec/sort/vsorted_run_merger.cpp index ce4440c3178343..b4c142cd4f1287 100644 --- a/be/src/exec/sort/vsorted_run_merger.cpp +++ b/be/src/exec/sort/vsorted_run_merger.cpp @@ -194,6 +194,7 @@ Status VSortedRunMerger::get_next(Block* output_block, bool* eos) { current->next(); if (_need_more_data(current)) { do_insert(); + output_block->set_columns(std::move(merged_columns)); return Status::OK(); } } diff --git a/be/src/exprs/aggregate/aggregate_function_java_udaf.h b/be/src/exprs/aggregate/aggregate_function_java_udaf.h index 42b3bc87af6d9d..cbd929824d21d6 100644 --- a/be/src/exprs/aggregate/aggregate_function_java_udaf.h +++ b/be/src/exprs/aggregate/aggregate_function_java_udaf.h @@ -187,7 +187,8 @@ struct AggregateJavaUdafData { RETURN_NOT_OK_STATUS_WITH_WARN(Jni::Env::Get(&env), "Java-Udaf get value function"); Block output_block; - output_block.insert(ColumnWithTypeAndName(to.get_ptr(), result_type, "_result_")); + output_block.insert( + ColumnWithTypeAndName(result_type->create_column(), result_type, "_result_")); auto output_table_schema = JniDataBridge::parse_table_schema(&output_block); std::string output_nullable = result_type->is_nullable() ? 
"true" : "false"; std::map output_params = {{"is_nullable", output_nullable}, @@ -203,7 +204,11 @@ struct AggregateJavaUdafData { .with_arg(output_map) .call(&output_address)); - return JniDataBridge::fill_block(&output_block, {0}, output_address); + RETURN_IF_ERROR(JniDataBridge::fill_block(&output_block, {0}, output_address)); + const auto& result_column = output_block.get_by_position(0).column; + DORIS_CHECK(result_column->size() == 1); + to.insert_from(*result_column, 0); + return Status::OK(); } private: diff --git a/be/src/exprs/aggregate/aggregate_function_null_v2.h b/be/src/exprs/aggregate/aggregate_function_null_v2.h index aa2c9f3bb39792..a3b513d6014116 100644 --- a/be/src/exprs/aggregate/aggregate_function_null_v2.h +++ b/be/src/exprs/aggregate/aggregate_function_null_v2.h @@ -259,8 +259,7 @@ class AggregateFunctionNullBaseInlineV2 : public IAggregateFunctionHelperget_nested_column().assume_mutable().get(); + const IColumn* src_nested_column = &src_nullable_col->get_nested_column(); if (src_nullable_col->has_null()) { for (size_t i = 0; i < num_rows; ++i) { if (!src_null_map_data[i]) { diff --git a/be/src/exprs/aggregate/aggregate_function_sort.h b/be/src/exprs/aggregate/aggregate_function_sort.h index e001cb0c4c419d..2a7530e817fd3b 100644 --- a/be/src/exprs/aggregate/aggregate_function_sort.h +++ b/be/src/exprs/aggregate/aggregate_function_sort.h @@ -46,33 +46,27 @@ namespace doris { struct AggregateFunctionSortData { const SortDescription sort_desc; - Block block; + // The aggregate state is the sole owner of these columns and appends rows in add(), which is + // a hot path. Keep the long-lived state as MutableBlock and only materialize temporary Block + // views for APIs that require immutable Block input. 
+ MutableBlock block; // The construct only support the template compiler, useless AggregateFunctionSortData() : sort_desc() {}; AggregateFunctionSortData(SortDescription sort_desc, const Block& block) : sort_desc(std::move(sort_desc)), block(block.clone_empty()) {} - void merge(const AggregateFunctionSortData& rhs) { - if (block.rows() == 0) { - block = rhs.block; - } else { - for (size_t i = 0; i < block.columns(); i++) { - auto column = block.get_by_position(i).column->assume_mutable(); - auto column_rhs = rhs.block.get_by_position(i).column; - column->insert_range_from(*column_rhs, 0, rhs.block.rows()); - } - } - } + void merge(const AggregateFunctionSortData& rhs) { append_block(rhs, 0, rhs.block.rows()); } void serialize(const RuntimeState* state, BufferWritable& buf) const { PBlock pblock; size_t uncompressed_bytes = 0; size_t compressed_bytes = 0; int64_t compressed_time = 0; - auto st = block.serialize(state->be_exec_version(), &pblock, &uncompressed_bytes, - &compressed_bytes, &compressed_time, - segment_v2::CompressionTypePB::NO_COMPRESSION); + auto block_view = to_block_view(); + auto st = block_view.serialize(state->be_exec_version(), &pblock, &uncompressed_bytes, + &compressed_bytes, &compressed_time, + segment_v2::CompressionTypePB::NO_COMPRESSION); if (!st.ok()) { throw doris::Exception(st); } @@ -88,12 +82,14 @@ struct AggregateFunctionSortData { pblock.ParseFromString(data); [[maybe_unused]] size_t uncompressed_size = 0; [[maybe_unused]] int64_t uncompressed_time = 0; - auto st = block.deserialize(pblock, &uncompressed_size, &uncompressed_time); + Block deserialized_block; + auto st = deserialized_block.deserialize(pblock, &uncompressed_size, &uncompressed_time); // If memory allocate failed during deserialize, st is not ok, throw exception here to // stop the query. 
if (!st.ok()) { throw doris::Exception(st); } + block = MutableBlock(std::move(deserialized_block)); } void add(const IColumn** columns, size_t columns_num, size_t row_num) { @@ -102,14 +98,40 @@ struct AggregateFunctionSortData { block.columns(), columns_num); for (size_t i = 0; i < columns_num; ++i) { - auto column = block.get_by_position(i).column->assume_mutable(); - column->insert_from(*columns[i], row_num); + block.get_column_by_position(i)->insert_from(*columns[i], row_num); } } void sort() { + auto block_view = to_block_view(); + auto sorted_block = block_view.clone_empty(); HybridSorter hybrid_sorter; - sort_block(block, block, sort_desc, hybrid_sorter, block.rows()); + sort_block(block_view, sorted_block, sort_desc, hybrid_sorter, block_view.rows()); + block = MutableBlock(std::move(sorted_block)); + } + +private: + void append_block(const AggregateFunctionSortData& rhs, size_t start, size_t length) { + DCHECK_EQ(block.columns(), rhs.block.columns()); + for (size_t i = 0; i < block.columns(); ++i) { + DCHECK(block.get_datatype_by_position(i)->equals( + *rhs.block.get_datatype_by_position(i))) + << "lhs type: " << block.get_datatype_by_position(i)->get_name() + << ", rhs type: " << rhs.block.get_datatype_by_position(i)->get_name(); + block.get_column_by_position(i)->insert_range_from(*rhs.block.get_column_by_position(i), + start, length); + } + } + + Block to_block_view() const { + ColumnsWithTypeAndName columns_with_schema; + columns_with_schema.reserve(block.columns()); + for (size_t i = 0; i < block.columns(); ++i) { + columns_with_schema.emplace_back( + static_cast(*block.get_column_by_position(i)).get_ptr(), + block.get_datatype_by_position(i), ""); + } + return {std::move(columns_with_schema)}; } }; @@ -177,7 +199,7 @@ class AggregateFunctionSort final ColumnRawPtrs arguments_nested; for (int i = 0; i < _arguments.size() - _sort_desc.size(); i++) { arguments_nested.emplace_back( - this->data(place).block.get_by_position(i).column.get()); + 
this->data(place).block.get_column_by_position(i).get()); } _nested_func->add_batch_single_place(arguments_nested[0]->size(), diff --git a/be/src/exprs/function/array/function_array_flatten.cpp b/be/src/exprs/function/array/function_array_flatten.cpp index 03086f37008788..3f76bcfb015e4a 100644 --- a/be/src/exprs/function/array/function_array_flatten.cpp +++ b/be/src/exprs/function/array/function_array_flatten.cpp @@ -55,23 +55,23 @@ class FunctionArrayFlatten : public IFunction { auto src_column = block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); auto* src_column_array_ptr = - assert_cast(remove_nullable(src_column)->assume_mutable().get()); - ColumnArray* nested_src_column_array_ptr = src_column_array_ptr; + assert_cast(remove_nullable(src_column).get()); + const ColumnArray* nested_src_column_array_ptr = src_column_array_ptr; DataTypePtr src_data_type = block.get_by_position(arguments[0]).type; auto* src_data_type_array = assert_cast(remove_nullable(src_data_type).get()); - auto result_column_offsets = - assert_cast(src_column_array_ptr->get_offsets_column()) - .clone(); + auto result_column_offsets = assert_cast( + src_column_array_ptr->get_offsets_column()) + .clone(); auto* offsets = assert_cast(result_column_offsets.get()) ->get_data() .data(); while (src_data_type_array->get_nested_type()->get_primitive_type() == TYPE_ARRAY) { - nested_src_column_array_ptr = assert_cast( - remove_nullable(src_column_array_ptr->get_data_ptr())->assume_mutable().get()); + nested_src_column_array_ptr = assert_cast( + remove_nullable(src_column_array_ptr->get_data_ptr()).get()); for (size_t i = 0; i < input_rows_count; ++i) { offsets[i] = nested_src_column_array_ptr->get_offsets()[offsets[i] - 1]; diff --git a/be/src/exprs/function/cast/cast_to_variant.h b/be/src/exprs/function/cast/cast_to_variant.h index acc8ed9e7f6492..3aebb66212a44a 100644 --- a/be/src/exprs/function/cast/cast_to_variant.h +++ b/be/src/exprs/function/cast/cast_to_variant.h @@ 
-29,19 +29,34 @@ inline Status cast_from_variant_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, uint32_t result, size_t input_rows_count, const NullMap::value_type* null_map, const DataTypePtr& data_type_to) { - const auto& col_with_type_and_name = block.get_by_position(arguments[0]); - const auto& col_from = col_with_type_and_name.column; + auto& col_with_type_and_name = block.get_by_position(arguments[0]); + auto& col_from = col_with_type_and_name.column; const IColumn* variant_column = col_from.get(); if (const auto* nullable = check_and_get_column(*variant_column)) { variant_column = &nullable->get_nested_column(); } - const auto& variant = assert_cast(*variant_column); - ColumnPtr col_to = data_type_to->create_column(); - if (!variant.is_finalized()) { + if (!assert_cast(*variant_column).is_finalized()) { // ColumnVariant should be finalized before parsing, finalize maybe modify original column structure - variant.assume_mutable()->finalize(); + auto mutable_column = IColumn::mutate(std::move(col_with_type_and_name.column)); + if (auto* nullable = check_and_get_column(*mutable_column)) { + const auto& const_nullable = *nullable; + auto nested_column = IColumn::mutate(const_nullable.get_nested_column_ptr()); + assert_cast(*nested_column).finalize(); + ColumnPtr nested_column_ptr = std::move(nested_column); + nullable->change_nested_column(nested_column_ptr); + } else { + assert_cast(*mutable_column).finalize(); + } + col_with_type_and_name.column = std::move(mutable_column); + } + + variant_column = col_with_type_and_name.column.get(); + if (const auto* nullable = check_and_get_column(*variant_column)) { + variant_column = &nullable->get_nested_column(); } + const auto& variant = assert_cast(*variant_column); + ColumnPtr col_to = data_type_to->create_column(); // It's important to convert as many elements as possible in this context. 
For instance, // if the root of this variant column is a number column, converting it to a number column @@ -152,7 +167,7 @@ struct CastToVariant { auto variant = ColumnVariant::create( variant_type ? variant_type->variant_max_subcolumns_count() : 0, variant_type ? variant_type->enable_doc_mode() : false); - variant->create_root(from_type, col_from->assume_mutable()); + variant->create_root(from_type, IColumn::mutate(col_from)); block.replace_by_position(result, std::move(variant)); return Status::OK(); } @@ -186,4 +201,4 @@ WrapperType create_cast_from_variant_wrapper(const DataTypeVariant& from_type, }; } -} // namespace doris::CastWrapper \ No newline at end of file +} // namespace doris::CastWrapper diff --git a/be/src/exprs/function/function.cpp b/be/src/exprs/function/function.cpp index f1e44bb4c2ac40..590eb63829200d 100644 --- a/be/src/exprs/function/function.cpp +++ b/be/src/exprs/function/function.cpp @@ -67,8 +67,7 @@ ColumnPtr wrap_in_nullable(const ColumnPtr& src, const Block& block, const Colum } if (!mutable_result_null_map_column) { - mutable_result_null_map_column = - std::move(result_null_map_column)->assume_mutable(); + mutable_result_null_map_column = (*std::move(result_null_map_column)).mutate(); } NullMap& result_null_map = @@ -80,6 +79,12 @@ ColumnPtr wrap_in_nullable(const ColumnPtr& src, const Block& block, const Colum } } + // Commit merged null map back: result_null_map_column was moved into + // mutable_result_null_map_column when merging 2+ nullable args with nulls. 
+ if (mutable_result_null_map_column) { + result_null_map_column = std::move(mutable_result_null_map_column); + } + if (!result_null_map_column) { if (is_column_const(*src)) { return ColumnConst::create( diff --git a/be/src/exprs/function/function_bitmap.cpp b/be/src/exprs/function/function_bitmap.cpp index 3f2c388efb85bf..35341f297640b0 100644 --- a/be/src/exprs/function/function_bitmap.cpp +++ b/be/src/exprs/function/function_bitmap.cpp @@ -681,11 +681,11 @@ void update_bitmap_op_count(int64_t* __restrict count, const NullMap& null_map) ColumnPtr handle_bitmap_op_count_null_value(ColumnPtr& src, const Block& block, const ColumnNumbers& args, uint32_t result, size_t input_rows_count) { - auto* nullable = assert_cast(src.get()); - ColumnPtr src_not_nullable = nullable->get_nested_column_ptr(); - MutableColumnPtr src_not_nullable_mutable = (*std::move(src_not_nullable)).assume_mutable(); + MutableColumnPtr mutable_src = IColumn::mutate(std::move(src)); + auto* nullable = assert_cast(mutable_src.get()); + auto* src_not_nullable_mutable = &nullable->get_nested_column(); auto* __restrict count_data = - assert_cast(src_not_nullable_mutable.get())->get_data().data(); + assert_cast(src_not_nullable_mutable)->get_data().data(); for (const auto& arg : args) { const ColumnWithTypeAndName& elem = block.get_by_position(arg); @@ -712,7 +712,7 @@ ColumnPtr handle_bitmap_op_count_null_value(ColumnPtr& src, const Block& block, } } - return src; + return mutable_src; } Status execute_bitmap_op_count_null_to_zero( diff --git a/be/src/exprs/function/function_variant_element.cpp b/be/src/exprs/function/function_variant_element.cpp index e407e595ffd461..cef269c613519f 100644 --- a/be/src/exprs/function/function_variant_element.cpp +++ b/be/src/exprs/function/function_variant_element.cpp @@ -148,8 +148,7 @@ class FunctionVariantElement : public IFunction { const auto& src_sparse_data_values = assert_cast(sparse_data_map.get_values()); auto& sparse_data_offsets = - 
assert_cast(*target_ptr->get_sparse_column()->assume_mutable()) - .get_offsets(); + assert_cast(target_ptr->get_sparse_column_mutable()).get_offsets(); auto [sparse_data_paths, sparse_data_values] = target_ptr->get_sparse_data_paths_and_values(); StringRef prefix_ref(path.get_path()); @@ -190,7 +189,7 @@ class FunctionVariantElement : public IFunction { sparse_data_offsets.push_back(sparse_data_paths->size()); } target_ptr->get_subcolumns().create_root(root); - target_ptr->get_doc_value_column()->assume_mutable()->resize(src_ptr->size()); + target_ptr->get_doc_value_column_mutable().resize(src_ptr->size()); target_ptr->set_num_rows(src_ptr->size()); } @@ -211,9 +210,9 @@ class FunctionVariantElement : public IFunction { // Ordinary Variant extraction keeps the selected prefix in sparse data, matching the // source branch behavior. Only doc-mode columns keep extracted data in doc_value. auto& extracted_offsets = - assert_cast(*(write_to_doc_value ? target_ptr->get_doc_value_column() - : target_ptr->get_sparse_column()) - ->assume_mutable()) + assert_cast(write_to_doc_value + ? target_ptr->get_doc_value_column_mutable() + : target_ptr->get_sparse_column_mutable()) .get_offsets(); auto [extracted_paths, extracted_values] = write_to_doc_value ? 
target_ptr->get_doc_value_data_paths_and_values() @@ -251,9 +250,9 @@ class FunctionVariantElement : public IFunction { } target_ptr->get_subcolumns().create_root(root); if (write_to_doc_value) { - target_ptr->get_sparse_column()->assume_mutable()->resize(src_ptr->size()); + target_ptr->get_sparse_column_mutable().resize(src_ptr->size()); } else { - target_ptr->get_doc_value_column()->assume_mutable()->resize(src_ptr->size()); + target_ptr->get_doc_value_column_mutable().resize(src_ptr->size()); } target_ptr->set_num_rows(src_ptr->size()); } @@ -323,7 +322,7 @@ class FunctionVariantElement : public IFunction { if (new_subcolumns.empty() && !nodes.empty()) { CHECK_EQ(nodes.size(), 1); new_subcolumns.create_root(ColumnVariant::Subcolumn { - nodes[0]->data.get_finalized_column_ptr()->assume_mutable(), + IColumn::mutate(nodes[0]->data.get_finalized_column_ptr()), nodes[0]->data.get_least_common_type(), true, true}); auto container = ColumnVariant::create(src.max_subcolumns_count(), src.enable_doc_mode(), @@ -349,12 +348,12 @@ class FunctionVariantElement : public IFunction { } result_col->insert_range_from(*container, 0, container->size()); } - *result = result_col->get_ptr(); // ColumnVariant should be finalized before parsing, finalize maybe modify original column structure - (*result)->assume_mutable()->finalize(); + result_col->finalize(); VLOG_DEBUG << "dump new object " << static_cast(result_col.get())->debug_string() << ", path " << path.get_path(); + *result = std::move(result_col); return Status::OK(); } } diff --git a/be/src/exprs/table_function/python_udtf_function.cpp b/be/src/exprs/table_function/python_udtf_function.cpp index f39ceafd98208c..4bcd8ae46364c5 100644 --- a/be/src/exprs/table_function/python_udtf_function.cpp +++ b/be/src/exprs/table_function/python_udtf_function.cpp @@ -260,8 +260,7 @@ Status PythonUDTFFunction::_convert_list_array_to_array_column( if (_return_type->is_nullable()) { nullable_col = assert_cast(array_col_ptr.get()); - array_col 
= assert_cast( - nullable_col->get_nested_column_ptr()->assume_mutable().get()); + array_col = assert_cast(&nullable_col->get_nested_column()); } else { array_col = assert_cast(array_col_ptr.get()); } @@ -274,8 +273,8 @@ Status PythonUDTFFunction::_convert_list_array_to_array_column( // Use read_column_from_arrow for optimized conversion // This directly converts Arrow ListArray to Doris ColumnArray // No struct unwrapping needed - Python server sends the correct format! - RETURN_IF_ERROR(array_serde->read_column_from_arrow( - array_col->assume_mutable_ref(), list_array.get(), 0, num_input_rows, _timezone_obj)); + RETURN_IF_ERROR(array_serde->read_column_from_arrow(*array_col, list_array.get(), 0, + num_input_rows, _timezone_obj)); // Handle nullable wrapper: all array elements are non-null // (empty arrays [] are non-null, different from NULL) diff --git a/be/src/exprs/table_function/udf_table_function.cpp b/be/src/exprs/table_function/udf_table_function.cpp index 4b6037f7ab1771..414766ef9157c3 100644 --- a/be/src/exprs/table_function/udf_table_function.cpp +++ b/be/src/exprs/table_function/udf_table_function.cpp @@ -123,10 +123,12 @@ Status UDFTableFunction::process_init(Block* block, RuntimeState* state) { .with_arg(output_map) .call(&output_address)); RETURN_IF_ERROR(JniDataBridge::fill_block(block, {_result_column_idx}, output_address)); + _array_result_column = + IColumn::mutate(std::move(block->get_by_position(_result_column_idx).column)); block->erase(_result_column_idx); if (!extract_column_array_info(*_array_result_column, _array_column_detail)) { return Status::NotSupported("column type {} not supported now", - block->get_by_position(_result_column_idx).column->get_name()); + _array_result_column->get_name()); } return Status::OK(); } diff --git a/be/src/exprs/table_function/vexplode.cpp b/be/src/exprs/table_function/vexplode.cpp index 680e5ccff66ed1..0b8556229a4ee3 100644 --- a/be/src/exprs/table_function/vexplode.cpp +++ 
b/be/src/exprs/table_function/vexplode.cpp @@ -45,7 +45,8 @@ Status VExplodeTableFunction::_process_init_variant(Block* block, int value_colu // explode variant array auto column_without_nullable = remove_nullable(block->get_by_position(value_column_idx).column); auto column = column_without_nullable->convert_to_full_column_if_const(); - auto& variant_column = assert_cast(*(column->assume_mutable())); + auto variant_column_ptr = IColumn::mutate(std::move(column)); + auto& variant_column = assert_cast(*variant_column_ptr); variant_column.finalize(); _detail.output_as_variant = true; _detail.variant_enable_doc_mode = variant_column.enable_doc_mode(); @@ -62,9 +63,10 @@ Status VExplodeTableFunction::_process_init_variant(Block* block, int value_colu _detail.nested_type = array_type->get_nested_type(); } else { // null root, use nothing type - _array_column = ColumnNullable::create(ColumnArray::create(ColumnNothing::create(0)), - ColumnUInt8::create(0)); - _array_column->assume_mutable()->insert_many_defaults(variant_column.size()); + auto array_column = ColumnNullable::create(ColumnArray::create(ColumnNothing::create(0)), + ColumnUInt8::create(0)); + array_column->insert_many_defaults(variant_column.size()); + _array_column = std::move(array_column); _detail.nested_type = std::make_shared(); } return Status::OK(); diff --git a/be/src/exprs/table_function/vexplode_v2.cpp b/be/src/exprs/table_function/vexplode_v2.cpp index b21802690a84b8..62a4ab1d66ae92 100644 --- a/be/src/exprs/table_function/vexplode_v2.cpp +++ b/be/src/exprs/table_function/vexplode_v2.cpp @@ -51,7 +51,8 @@ Status VExplodeV2TableFunction::_process_init_variant(Block* block, int value_co // explode variant array auto column_without_nullable = remove_nullable(block->get_by_position(value_column_idx).column); auto column = column_without_nullable->convert_to_full_column_if_const(); - auto& variant_column = assert_cast(*(column->assume_mutable())); + auto variant_column_ptr = 
IColumn::mutate(std::move(column)); + auto& variant_column = assert_cast(*variant_column_ptr); variant_column.finalize(); _multi_detail[children_column_idx].output_as_variant = true; _multi_detail[children_column_idx].variant_enable_doc_mode = variant_column.enable_doc_mode(); @@ -68,10 +69,10 @@ Status VExplodeV2TableFunction::_process_init_variant(Block* block, int value_co _multi_detail[children_column_idx].nested_type = array_type->get_nested_type(); } else { // null root, use nothing type - _array_columns[children_column_idx] = ColumnNullable::create( - ColumnArray::create(ColumnNothing::create(0)), ColumnUInt8::create(0)); - _array_columns[children_column_idx]->assume_mutable()->insert_many_defaults( - variant_column.size()); + auto array_column = ColumnNullable::create(ColumnArray::create(ColumnNothing::create(0)), + ColumnUInt8::create(0)); + array_column->insert_many_defaults(variant_column.size()); + _array_columns[children_column_idx] = std::move(array_column); _multi_detail[children_column_idx].nested_type = std::make_shared(); } return Status::OK(); diff --git a/be/src/exprs/vcase_expr.h b/be/src/exprs/vcase_expr.h index 382193276cad29..b8e274be82a7bb 100644 --- a/be/src/exprs/vcase_expr.h +++ b/be/src/exprs/vcase_expr.h @@ -217,9 +217,9 @@ class VCaseExpr final : public VExpr { if (!then_columns[i]) { continue; } - auto* __restrict column_raw_data = - assert_cast( - then_columns[i]->assume_mutable().get()) + const auto* __restrict column_raw_data = + assert_cast( + then_columns[i].get()) ->get_data() .data(); if constexpr (std::is_same_v || diff --git a/be/src/exprs/vcompound_pred.h b/be/src/exprs/vcompound_pred.h index 9f65060eba9b0b..e82fa04ba8e5a3 100644 --- a/be/src/exprs/vcompound_pred.h +++ b/be/src/exprs/vcompound_pred.h @@ -180,8 +180,8 @@ class VCompoundPred : public VectorizedFnCall { } ColumnPtr rhs_column = nullptr; - uint8_t* __restrict rhs_data_column = nullptr; - uint8_t* __restrict rhs_null_map = nullptr; + const uint8_t* __restrict 
rhs_data_column = nullptr; + const uint8_t* __restrict rhs_null_map = nullptr; bool rhs_is_nullable = false; bool rhs_all_true = false; bool rhs_all_false = false; @@ -216,31 +216,36 @@ class VCompoundPred : public VectorizedFnCall { }; auto create_null_map_column = [&](ColumnPtr& null_map_column, - uint8_t* __restrict null_map_data) { + const uint8_t* __restrict null_map_data) { if (null_map_data == nullptr) { null_map_column = ColumnUInt8::create(size, 0); - null_map_data = assert_cast(null_map_column->assume_mutable().get()) - ->get_data() - .data(); + null_map_data = + assert_cast(null_map_column.get())->get_data().data(); } return null_map_data; }; auto vector_vector = [&]() { + MutableColumnPtr mutable_result_column; + uint8_t* __restrict result_data_column = nullptr; + const uint8_t* __restrict other_data_column = rhs_data_column; if (lhs_column->use_count() == 1) { - result_column = lhs_column; + mutable_result_column = IColumn::mutate(std::move(lhs_column)); + result_data_column = + assert_cast(mutable_result_column.get())->get_data().data(); } else if (rhs_column->use_count() == 1) { - result_column = rhs_column; - auto tmp_column = rhs_data_column; - rhs_data_column = lhs_data_column; - lhs_data_column = tmp_column; + mutable_result_column = IColumn::mutate(std::move(rhs_column)); + result_data_column = + assert_cast(mutable_result_column.get())->get_data().data(); + other_data_column = lhs_data_column; } else { - auto col_res = lhs_column->clone_resized(size); - lhs_data_column = assert_cast(col_res.get())->get_data().data(); - result_column = std::move(col_res); + mutable_result_column = lhs_column->clone_resized(size); + result_data_column = + assert_cast(mutable_result_column.get())->get_data().data(); } - do_not_null_pred(lhs_data_column, rhs_data_column, size); + do_not_null_pred(result_data_column, other_data_column, size); + result_column = std::move(mutable_result_column); }; auto vector_vector_null = [&]() { auto col_res = 
ColumnUInt8::create(size); @@ -347,7 +352,8 @@ class VCompoundPred : public VectorizedFnCall { } template - void static do_not_null_pred(uint8_t* __restrict lhs, uint8_t* __restrict rhs, size_t size) { + void static do_not_null_pred(uint8_t* __restrict lhs, const uint8_t* __restrict rhs, + size_t size) { #ifdef NDEBUG #if defined(__clang__) #pragma clang loop vectorize(enable) @@ -365,8 +371,8 @@ class VCompoundPred : public VectorizedFnCall { } template - void static do_null_pred(uint8_t* __restrict lhs_data, uint8_t* __restrict lhs_null, - uint8_t* __restrict rhs_data, uint8_t* __restrict rhs_null, + void static do_null_pred(const uint8_t* __restrict lhs_data, const uint8_t* __restrict lhs_null, + const uint8_t* __restrict rhs_data, const uint8_t* __restrict rhs_null, uint8_t* __restrict res_data, uint8_t* __restrict res_null, size_t size) { #ifdef NDEBUG @@ -392,22 +398,21 @@ class VCompoundPred : public VectorizedFnCall { [](const VExprSPtr& arg) -> bool { return arg->is_constant(); }); } - std::pair _get_raw_data_and_null_map(ColumnPtr column, - bool has_nullable_column) const { + std::pair _get_raw_data_and_null_map( + const ColumnPtr& column, bool has_nullable_column) const { if (has_nullable_column) { - auto* nullable_column = assert_cast(column->assume_mutable().get()); + const auto* nullable_column = assert_cast(column.get()); auto* data_column = - assert_cast(nullable_column->get_nested_column_ptr().get()) - ->get_data() - .data(); - auto* null_map = - assert_cast(nullable_column->get_null_map_column_ptr().get()) + assert_cast(nullable_column->get_nested_column_ptr().get()) ->get_data() .data(); + auto* null_map = assert_cast( + nullable_column->get_null_map_column_ptr().get()) + ->get_data() + .data(); return std::make_pair(data_column, null_map); } else { - auto* data_column = - assert_cast(column->assume_mutable().get())->get_data().data(); + auto* data_column = assert_cast(column.get())->get_data().data(); return std::make_pair(data_column, nullptr); 
} } diff --git a/be/src/format/arrow/arrow_stream_reader.cpp b/be/src/format/arrow/arrow_stream_reader.cpp index b91608ee3fafa1..d5b53dff3306e5 100644 --- a/be/src/format/arrow/arrow_stream_reader.cpp +++ b/be/src/format/arrow/arrow_stream_reader.cpp @@ -113,7 +113,7 @@ Status ArrowStreamReader::_do_get_next_block(Block* block, size_t* read_rows, bo } RETURN_IF_ERROR(column_with_name.type->get_serde()->read_column_from_arrow( - column_with_name.column->assume_mutable_ref(), column, 0, num_rows, _ctzz)); + *columns[c], column, 0, num_rows, _ctzz)); } catch (Exception& e) { return Status::InternalError("Failed to convert from arrow to block: {}", e.what()); } @@ -121,6 +121,7 @@ Status ArrowStreamReader::_do_get_next_block(Block* block, size_t* read_rows, bo *read_rows += batch.num_rows(); } + block->set_columns(std::move(columns)); *eof = (*read_rows == 0); return Status::OK(); } diff --git a/be/src/format/column_type_convert.cpp b/be/src/format/column_type_convert.cpp index cd71ffb5babb33..b7a8388b5be771 100644 --- a/be/src/format/column_type_convert.cpp +++ b/be/src/format/column_type_convert.cpp @@ -117,10 +117,10 @@ ColumnPtr ColumnTypeConverter::get_column(const DataTypePtr& src_type, ColumnPtr _cached_src_column->assume_mutable()->clear(); if (dst_type->is_nullable()) { - // In order to share null map between parquet converted src column and dst column to avoid copying. It is very tricky that will - // call mutable function `doris_nullable_column->get_null_map_column_ptr()` which will set `_need_update_has_null = true`. - // Because some operations such as agg will call `has_null()` to set `_need_update_has_null = false`. - auto* doris_nullable_column = static_cast(dst_column.get()); + // Seed the source nullable wrapper with the destination's current null map. Under the + // assert-mutability COW contract ColumnNullable::create() mutates/clones the subcolumns, so + // readers that append file nulls must copy back only the newly appended null-map slice. 
+ const auto* doris_nullable_column = static_cast(dst_column.get()); return ColumnNullable::create(_cached_src_column, doris_nullable_column->get_null_map_column_ptr()); } diff --git a/be/src/format/column_type_convert.h b/be/src/format/column_type_convert.h index 04003c098f0d30..554e5a0c3662a2 100644 --- a/be/src/format/column_type_convert.h +++ b/be/src/format/column_type_convert.h @@ -44,6 +44,20 @@ namespace doris::converter { enum FileFormat { COMMON, ORC, PARQUET }; +// Helper: get the inner (non-nullable) mutable column from an exclusively-owned dst_col. +// - For non-nullable dst_col: returns a raw pointer to the column itself. +// - For nullable dst_col: returns a raw pointer to the nested (non-null) column. +// Must only be called when dst_col has exclusive ownership (use_count == 1). +// Returns IColumn* (raw pointer) to avoid creating a second owning MutableColumnPtr, +// which would violate COW invariant (use_count > 1). +inline IColumn* get_mutable_inner_col(MutableColumnPtr& dst_col) { + if (dst_col->is_nullable()) { + return static_cast(dst_col.get())->get_nested_column_ptr().get(); + } else { + return dst_col.get(); + } +} + template constexpr bool is_decimal_type() { return type == TYPE_DECIMALV2 || type == TYPE_DECIMAL32 || type == TYPE_DECIMAL64 || @@ -165,13 +179,13 @@ class IntegerToIntegerConverter : public ColumnTypeConverter { using DstColumnType = typename PrimitiveTypeTraits::ColumnType; using DstCppType = typename PrimitiveTypeTraits::CppType; ColumnPtr from_col = remove_nullable(src_col); - MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + IColumn* to_col = get_mutable_inner_col(dst_col); size_t rows = from_col->size(); auto& src_data = static_cast(from_col.get())->get_data(); size_t start_idx = to_col->size(); to_col->resize(start_idx + rows); - auto& data = static_cast(*to_col.get()).get_data(); + auto& data = static_cast(*to_col).get_data(); for (int i = 0; i < rows; ++i) { if constexpr 
(sizeof(DstCppType) < sizeof(SrcCppType)) { SrcCppType src_value = src_data[i]; @@ -212,7 +226,7 @@ class NumericToFloatPointConverter : public ColumnTypeConverter { using DstColumnType = typename PrimitiveTypeTraits::ColumnType; using DstCppType = typename PrimitiveTypeTraits::CppType; ColumnPtr from_col = remove_nullable(src_col); - MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + IColumn* to_col = get_mutable_inner_col(dst_col); NullMap* null_map = nullptr; if (dst_col->is_nullable()) { @@ -223,7 +237,7 @@ class NumericToFloatPointConverter : public ColumnTypeConverter { auto& src_data = static_cast(from_col.get())->get_data(); size_t start_idx = to_col->size(); to_col->resize(start_idx + rows); - auto& data = static_cast(*to_col.get()).get_data(); + auto& data = static_cast(*to_col).get_data(); for (int i = 0; i < rows; ++i) { SrcCppType src_value = src_data[i]; if constexpr (is_integer_type()) { @@ -248,11 +262,11 @@ class BooleanToStringConverter : public ColumnTypeConverter { Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { using SrcColumnType = typename PrimitiveTypeTraits::ColumnType; ColumnPtr from_col = remove_nullable(src_col); - MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + IColumn* to_col = get_mutable_inner_col(dst_col); size_t rows = from_col->size(); auto& src_data = static_cast(from_col.get())->get_data(); - auto& string_col = static_cast(*to_col.get()); + auto& string_col = static_cast(*to_col); for (int i = 0; i < rows; ++i) { std::string value = src_data[i] != 0 ? 
"TRUE" : "FALSE"; string_col.insert_data(value.data(), value.size()); @@ -269,7 +283,7 @@ class NumericToStringConverter : public ColumnTypeConverter { Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { using SrcColumnType = typename PrimitiveTypeTraits::ColumnType; ColumnPtr from_col = remove_nullable(src_col); - MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + IColumn* to_col = get_mutable_inner_col(dst_col); NullMap* null_map = nullptr; if (dst_col->is_nullable()) { @@ -279,7 +293,7 @@ class NumericToStringConverter : public ColumnTypeConverter { size_t rows = from_col->size(); size_t start_idx = to_col->size(); auto& src_data = static_cast(from_col.get())->get_data(); - auto& string_col = static_cast(*to_col.get()); + auto& string_col = static_cast(*to_col); for (int i = 0; i < rows; ++i) { if constexpr (SrcPrimitiveType == TYPE_FLOAT || SrcPrimitiveType == TYPE_DOUBLE) { if (fileFormat == FileFormat::ORC && std::isnan(src_data[i])) { @@ -318,11 +332,11 @@ class DecimalToStringConverter : public ColumnTypeConverter { Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { using SrcColumnType = typename PrimitiveTypeTraits::ColumnType; ColumnPtr from_col = remove_nullable(src_col); - MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + IColumn* to_col = get_mutable_inner_col(dst_col); size_t rows = from_col->size(); auto& src_data = static_cast(from_col.get())->get_data(); - auto& string_col = static_cast(*to_col.get()); + auto& string_col = static_cast(*to_col); for (int i = 0; i < rows; ++i) { std::string value = src_data[i].to_string(_scale); string_col.insert_data(value.data(), value.size()); @@ -339,11 +353,11 @@ class TimeToStringConverter : public ColumnTypeConverter { using SrcCppType = typename PrimitiveTypeTraits::CppType; using SrcColumnType = typename PrimitiveTypeTraits::ColumnType; ColumnPtr from_col = remove_nullable(src_col); - MutableColumnPtr 
to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + IColumn* to_col = get_mutable_inner_col(dst_col); size_t rows = from_col->size(); auto& src_data = static_cast(from_col.get())->get_data(); - auto& string_col = static_cast(*to_col.get()); + auto& string_col = static_cast(*to_col); char buf[50]; for (int i = 0; i < rows; ++i) { int len = (reinterpret_cast(src_data[i])).to_buffer(buf); @@ -571,19 +585,19 @@ class CastStringConverter : public ColumnTypeConverter { } NullMap* null_map = nullptr; - MutableColumnPtr to_col = nullptr; + IColumn* to_col = nullptr; if (dst_col->is_nullable()) { auto* nullable = assert_cast(dst_col.get()); - to_col = nullable->get_nested_column_ptr()->assume_mutable(); + to_col = nullable->get_nested_column_ptr().get(); null_map = &nullable->get_null_map_data(); } else { - to_col = dst_col->assume_mutable(); + to_col = dst_col.get(); } size_t rows = string_col->size(); size_t start_idx = to_col->size(); to_col->resize(start_idx + rows); - auto& data = assert_cast(to_col.get())->get_data(); + auto& data = assert_cast(to_col)->get_data(); CastParameters params; for (int i = 0; i < rows; ++i) { bool can_cast = false; @@ -628,7 +642,7 @@ class DateTimeToNumericConverter : public ColumnTypeConverter { using DstCppType = typename PrimitiveTypeTraits::CppType; ColumnPtr from_col = remove_nullable(src_col); - MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + IColumn* to_col = get_mutable_inner_col(dst_col); NullMap* null_map = nullptr; if (dst_col->is_nullable()) { @@ -639,7 +653,7 @@ class DateTimeToNumericConverter : public ColumnTypeConverter { const auto& src_data = static_cast(from_col.get())->get_data(); size_t start_idx = to_col->size(); to_col->resize(start_idx + rows); - auto& data = static_cast(*to_col.get()).get_data(); + auto& data = static_cast(*to_col).get_data(); for (int i = 0; i < rows; ++i) { const SrcCppType& src_value = src_data[i]; @@ -680,13 +694,13 @@ class TimeV2Converter : 
public ColumnTypeConverter { using DstCppType = typename PrimitiveTypeTraits::CppType; ColumnPtr from_col = remove_nullable(src_col); - MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + IColumn* to_col = get_mutable_inner_col(dst_col); size_t rows = from_col->size(); auto& src_data = static_cast(from_col.get())->get_data(); size_t start_idx = to_col->size(); to_col->resize(start_idx + rows); - auto& data = static_cast(*to_col.get()).get_data(); + auto& data = static_cast(*to_col).get_data(); for (int i = 0; i < rows; ++i) { const auto& src_value = reinterpret_cast(src_data[i]); auto& dst_value = reinterpret_cast(data[start_idx + i]); @@ -718,7 +732,7 @@ class NumericToDecimalConverter : public ColumnTypeConverter { using DstDorisType = typename PrimitiveTypeTraits::ColumnType::value_type; ColumnPtr from_col = remove_nullable(src_col); - MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + IColumn* to_col = get_mutable_inner_col(dst_col); NullMap* null_map = nullptr; if (dst_col->is_nullable()) { @@ -729,7 +743,7 @@ class NumericToDecimalConverter : public ColumnTypeConverter { auto& src_data = static_cast(from_col.get())->get_data(); size_t start_idx = to_col->size(); to_col->resize(start_idx + rows); - auto& data = static_cast(*to_col.get()).get_data(); + auto& data = static_cast(*to_col).get_data(); auto max_result = DataTypeDecimal::get_max_digits_number(_precision); auto multiplier = DataTypeDecimal::get_scale_multiplier(_scale); @@ -804,13 +818,13 @@ class DecimalToNumericConverter : public ColumnTypeConverter { using DstCppType = typename PrimitiveTypeTraits::CppType; ColumnPtr from_col = remove_nullable(src_col); - MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + IColumn* to_col = get_mutable_inner_col(dst_col); size_t rows = from_col->size(); auto& src_data = static_cast(from_col.get())->get_data(); size_t start_idx = to_col->size(); to_col->resize(start_idx + 
rows); - auto& data = static_cast(*to_col.get()).get_data(); + auto& data = static_cast(*to_col).get_data(); NullMap* null_map = nullptr; if (dst_col->is_nullable()) { @@ -889,13 +903,13 @@ class DecimalToDecimalConverter : public ColumnTypeConverter { bool narrow_integral = (_to_precision - _to_scale) < (_from_precision - _from_scale); ColumnPtr from_col = remove_nullable(src_col); - MutableColumnPtr to_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + IColumn* to_col = get_mutable_inner_col(dst_col); size_t rows = from_col->size(); auto& src_data = static_cast(from_col.get())->get_data(); size_t start_idx = to_col->size(); to_col->resize(start_idx + rows); - auto& data = static_cast(*to_col.get()).get_data(); + auto& data = static_cast(*to_col).get_data(); for (int i = 0; i < rows; ++i) { SrcNativeType src_value = src_data[i].value; @@ -983,15 +997,15 @@ class VarBinaryConverter : public ColumnTypeConverter { from_col = &assert_cast(*src_col); } - MutableColumnPtr to_col = nullptr; + IColumn* to_col = nullptr; // nullmap flag seems have been handled in upper level if (dst_col->is_nullable()) { const auto* nullable = assert_cast(dst_col.get()); - to_col = nullable->get_nested_column_ptr()->assume_mutable(); + to_col = const_cast(nullable)->get_nested_column_ptr().get(); } else { - to_col = dst_col->assume_mutable(); + to_col = dst_col.get(); } - auto* to_dst_column = assert_cast(to_col.get()); + auto* to_dst_column = assert_cast(to_col); for (size_t i = 0; i < from_col->size(); ++i) { auto string_ref = from_col->get_data_at(i); diff --git a/be/src/format/csv/csv_reader.cpp b/be/src/format/csv/csv_reader.cpp index 539132c7c9f003..90340afafe1739 100644 --- a/be/src/format/csv/csv_reader.cpp +++ b/be/src/format/csv/csv_reader.cpp @@ -65,6 +65,19 @@ enum class FileCachePolicy : uint8_t; namespace doris { +namespace { + +size_t columns_byte_size(const std::vector& columns) { + size_t bytes = 0; + for (const auto& column : columns) { + DCHECK(column.get() 
!= nullptr); + bytes += column->byte_size(); + } + return bytes; +} + +} // namespace + void EncloseCsvTextFieldSplitter::do_split(const Slice& line, std::vector* splitted_values) { const char* data = line.data; const auto& column_sep_positions = _text_line_reader_ctx->column_sep_positions(); @@ -437,7 +450,8 @@ Status CsvReader::_do_get_next_block(Block* block, size_t* read_rows, bool* eof) block->set_columns(std::move(mutate_columns)); } else { auto columns = block->mutate_columns(); - while (rows < batch_size && !_line_reader_eof && (block->bytes() < max_block_bytes)) { + while (rows < batch_size && !_line_reader_eof && + (columns_byte_size(columns) < max_block_bytes)) { const uint8_t* ptr = nullptr; size_t size = 0; RETURN_IF_ERROR(_line_reader->read_line(&ptr, &size, &_line_reader_eof, _io_ctx)); @@ -457,7 +471,7 @@ Status CsvReader::_do_get_next_block(Block* block, size_t* read_rows, bool* eof) } if (size == 0) { if (!_line_reader_eof && _state->is_read_csv_empty_line_as_null()) { - RETURN_IF_ERROR(_fill_empty_line(block, columns, &rows)); + RETURN_IF_ERROR(_fill_empty_line(columns, &rows)); } // Read empty line, continue continue; @@ -467,7 +481,7 @@ Status CsvReader::_do_get_next_block(Block* block, size_t* read_rows, bool* eof) if (!success) { continue; } - RETURN_IF_ERROR(_fill_dest_columns(Slice(ptr, size), block, columns, &rows)); + RETURN_IF_ERROR(_fill_dest_columns(Slice(ptr, size), columns, &rows)); } block->set_columns(std::move(columns)); } @@ -719,8 +733,8 @@ Status CsvReader::_deserialize_one_cell(DataTypeSerDeSPtr serde, IColumn& column return serde->deserialize_one_cell_from_csv(column, slice, _options); } -Status CsvReader::_fill_dest_columns(const Slice& line, Block* block, - std::vector& columns, size_t* rows) { +Status CsvReader::_fill_dest_columns(const Slice& line, std::vector& columns, + size_t* rows) { bool is_success = false; RETURN_IF_ERROR(_line_split_to_values(line, &is_success)); @@ -738,10 +752,7 @@ Status 
CsvReader::_fill_dest_columns(const Slice& line, Block* block, IColumn* col_ptr = columns[i].get(); if (!_is_load) { - // block is a Block*, and get_by_position returns a ColumnPtr, - // which is a const pointer. Therefore, using const_cast is permissible. - col_ptr = const_cast( - block->get_by_position(_file_slot_idx_map[i]).column.get()); + col_ptr = columns[_file_slot_idx_map[i]].get(); } if (_use_nullable_string_opt[i]) { @@ -758,15 +769,11 @@ Status CsvReader::_fill_dest_columns(const Slice& line, Block* block, return Status::OK(); } -Status CsvReader::_fill_empty_line(Block* block, std::vector& columns, - size_t* rows) { +Status CsvReader::_fill_empty_line(std::vector& columns, size_t* rows) { for (int i = 0; i < _file_slot_descs.size(); ++i) { IColumn* col_ptr = columns[i].get(); if (!_is_load) { - // block is a Block*, and get_by_position returns a ColumnPtr, - // which is a const pointer. Therefore, using const_cast is permissible. - col_ptr = const_cast( - block->get_by_position(_file_slot_idx_map[i]).column.get()); + col_ptr = columns[_file_slot_idx_map[i]].get(); } auto& null_column = assert_cast(*col_ptr); null_column.insert_data(nullptr, 0); diff --git a/be/src/format/csv/csv_reader.h b/be/src/format/csv/csv_reader.h index 077f089e5e9a18..f619ce4d4a85e5 100644 --- a/be/src/format/csv/csv_reader.h +++ b/be/src/format/csv/csv_reader.h @@ -232,9 +232,9 @@ class CsvReader : public TableFormatReader { private: Status _create_decompressor(); Status _create_file_reader(bool need_schema); - Status _fill_dest_columns(const Slice& line, Block* block, - std::vector& columns, size_t* rows); - Status _fill_empty_line(Block* block, std::vector& columns, size_t* rows); + Status _fill_dest_columns(const Slice& line, std::vector& columns, + size_t* rows); + Status _fill_empty_line(std::vector& columns, size_t* rows); Status _line_split_to_values(const Slice& line, bool* success); void _split_line(const Slice& line); void _init_system_properties(); diff --git 
a/be/src/format/jni/jni_data_bridge.cpp b/be/src/format/jni/jni_data_bridge.cpp index 7f66e3cefc5cf5..4d42574075a662 100644 --- a/be/src/format/jni/jni_data_bridge.cpp +++ b/be/src/format/jni/jni_data_bridge.cpp @@ -105,24 +105,27 @@ Status JniDataBridge::fill_column(TableMetaAddress& address, ColumnPtr& doris_co // org.apache.doris.common.jni.vec.ColumnType.Type#UNSUPPORTED will set column address as 0 return Status::InternalError("Unsupported type {} in java side", data_type->get_name()); } + auto mutable_doris_column = IColumn::mutate(std::move(doris_column)); MutableColumnPtr data_column; - if (doris_column->is_nullable()) { - auto* nullable_column = - reinterpret_cast(doris_column->assume_mutable().get()); + if (mutable_doris_column->is_nullable()) { + auto* nullable_column = assert_cast(mutable_doris_column.get()); data_column = nullable_column->get_nested_column_ptr(); NullMap& null_map = nullable_column->get_null_map_data(); size_t origin_size = null_map.size(); null_map.resize(origin_size + num_rows); memcpy(null_map.data() + origin_size, static_cast(null_map_ptr), num_rows); } else { - data_column = doris_column->assume_mutable(); + data_column = mutable_doris_column->get_ptr(); } // Date and DateTime are deprecated and not supported. 
+ Status status = Status::OK(); switch (logical_type) { -#define DISPATCH(TYPE_INDEX, COLUMN_TYPE, CPP_TYPE) \ - case TYPE_INDEX: \ - return _fill_fixed_length_column( \ - data_column, reinterpret_cast(address.next_meta_as_ptr()), num_rows); +#define DISPATCH(TYPE_INDEX, COLUMN_TYPE, CPP_TYPE) \ + case TYPE_INDEX: { \ + auto* data = reinterpret_cast(address.next_meta_as_ptr()); \ + status = _fill_fixed_length_column(data_column, data, num_rows); \ + break; \ + } FOR_FIXED_LENGTH_TYPES(DISPATCH) #undef DISPATCH case PrimitiveType::TYPE_STRING: @@ -130,19 +133,27 @@ Status JniDataBridge::fill_column(TableMetaAddress& address, ColumnPtr& doris_co case PrimitiveType::TYPE_CHAR: [[fallthrough]]; case PrimitiveType::TYPE_VARCHAR: - return _fill_string_column(address, data_column, num_rows); + status = _fill_string_column(address, data_column, num_rows); + break; case PrimitiveType::TYPE_ARRAY: - return _fill_array_column(address, data_column, data_type, num_rows); + status = _fill_array_column(address, data_column, data_type, num_rows); + break; case PrimitiveType::TYPE_MAP: - return _fill_map_column(address, data_column, data_type, num_rows); + status = _fill_map_column(address, data_column, data_type, num_rows); + break; case PrimitiveType::TYPE_STRUCT: - return _fill_struct_column(address, data_column, data_type, num_rows); + status = _fill_struct_column(address, data_column, data_type, num_rows); + break; case PrimitiveType::TYPE_VARBINARY: - return _fill_varbinary_column(address, data_column, num_rows); + status = _fill_varbinary_column(address, data_column, num_rows); + break; default: - return Status::InvalidArgument("Unsupported type {} in jni scanner", data_type->get_name()); + status = Status::InvalidArgument("Unsupported type {} in jni scanner", + data_type->get_name()); + break; } - return Status::OK(); + doris_column = std::move(mutable_doris_column); + return status; } Status JniDataBridge::_fill_varbinary_column(TableMetaAddress& address, diff --git 
a/be/src/format/json/new_json_reader.cpp b/be/src/format/json/new_json_reader.cpp index da141437fcf200..90a4bd65b22813 100644 --- a/be/src/format/json/new_json_reader.cpp +++ b/be/src/format/json/new_json_reader.cpp @@ -452,17 +452,38 @@ Status NewJsonReader::_get_range_params() { return Status::OK(); } -static Status ignore_malformed_json_append_null(Block& block) { - for (auto& column : block.get_columns()) { - if (!column->is_nullable()) [[unlikely]] { +Status json_reader_detail::append_null_for_malformed_json(Block& block) { + for (int i = 0; i < block.columns(); ++i) { + auto& column_with_type = block.get_by_position(i); + if (!column_with_type.column->is_nullable()) [[unlikely]] { return Status::DataQualityError("malformed json, but the column `{}` is not nullable.", - column->get_name()); + column_with_type.column->get_name()); } - static_cast(column->assume_mutable().get())->insert_default(); + auto column = IColumn::mutate(std::move(column_with_type.column)); + assert_cast(column.get())->insert_default(); + column_with_type.column = std::move(column); } return Status::OK(); } +void json_reader_detail::truncate_block_to_rows(Block& block, size_t num_rows) { + for (int i = 0; i < block.columns(); ++i) { + auto& column_with_type = block.get_by_position(i); + auto column = IColumn::mutate(std::move(column_with_type.column)); + if (column->size() > num_rows) { + column->pop_back(column->size() - num_rows); + } + column_with_type.column = std::move(column); + } +} + +void json_reader_detail::pop_back_last_inserted_value(Block& block, size_t column_index) { + auto& column = block.get_by_position(column_index).column; + auto mutable_column = IColumn::mutate(std::move(column)); + mutable_column->pop_back(1); + column = std::move(mutable_column); +} + Status NewJsonReader::_open_file_reader(bool need_schema) { int64_t start_offset = _range.start_offset; if (start_offset != 0) { @@ -678,12 +699,7 @@ Status 
NewJsonReader::_handle_simdjson_error(simdjson::simdjson_error& error, Bl error.what()); _counter->num_rows_filtered++; // Before continuing to process other rows, we need to first clean the fail parsed row. - for (int i = 0; i < block.columns(); ++i) { - auto column = block.get_by_position(i).column->assume_mutable(); - if (column->size() > num_rows) { - column->pop_back(column->size() - num_rows); - } - } + json_reader_detail::truncate_block_to_rows(block, num_rows); RETURN_IF_ERROR(_state->append_error_msg_to_file( [&]() -> std::string { @@ -714,7 +730,7 @@ Status NewJsonReader::_simdjson_handle_simple_json(RuntimeState* /*state*/, Bloc if (_is_load) { return Status::OK(); } else if (_openx_json_ignore_malformed) { - RETURN_IF_ERROR(ignore_malformed_json_append_null(block)); + RETURN_IF_ERROR(json_reader_detail::append_null_for_malformed_json(block)); return Status::OK(); } } @@ -934,12 +950,7 @@ Status NewJsonReader::_simdjson_handle_nested_complex_json( if (!st.ok()) { RETURN_IF_ERROR(_append_error_msg(nullptr, st.to_string(), "", nullptr)); // Before continuing to process other rows, we need to first clean the fail parsed row. 
- for (int i = 0; i < block.columns(); ++i) { - auto column = block.get_by_position(i).column->assume_mutable(); - if (column->size() > num_rows) { - column->pop_back(column->size() - num_rows); - } - } + json_reader_detail::truncate_block_to_rows(block, num_rows); continue; } if (!valid) { @@ -1009,7 +1020,7 @@ Status NewJsonReader::_simdjson_set_column_value(simdjson::ondemand::object* val if (_is_hive_table) { //Since value can only be traversed once, // we can only insert the original value first, then delete it, and then reinsert the new value - block.get_by_position(column_index).column->assume_mutable()->pop_back(1); + json_reader_detail::pop_back_last_inserted_value(block, column_index); } else { continue; } @@ -1075,14 +1086,7 @@ Status NewJsonReader::_simdjson_set_column_value(simdjson::ondemand::object* val "partial update, missing key column: {}", slot_desc->col_name(), valid)); // remove this line in block - for (size_t index = 0; index < block.columns(); ++index) { - auto column = block.get_by_position(index).column->assume_mutable(); - if (column->size() != cur_row_count) { - DCHECK(column->size() == cur_row_count + 1); - column->pop_back(1); - DCHECK(column->size() == cur_row_count); - } - } + json_reader_detail::truncate_block_to_rows(block, cur_row_count); return Status::OK(); } _set_skip_bitmap_mark(slot_desc, column_ptr, block, cur_row_count, valid); @@ -1542,10 +1546,8 @@ Status NewJsonReader::_simdjson_write_columns_by_jsonpath( // there is no valid value in json line but has filled with default value before // so remove this line in block std::string col_names; - for (int i = 0; i < block.columns(); ++i) { - auto column = block.get_by_position(i).column->assume_mutable(); - column->pop_back(1); - } + DCHECK(block.rows() > 0); + json_reader_detail::truncate_block_to_rows(block, block.rows() - 1); for (auto* slot_desc : slot_descs) { col_names.append(slot_desc->col_name() + ", "); } diff --git a/be/src/format/json/new_json_reader.h 
b/be/src/format/json/new_json_reader.h index e74607a0e6de56..b975433c34f0f8 100644 --- a/be/src/format/json/new_json_reader.h +++ b/be/src/format/json/new_json_reader.h @@ -62,6 +62,12 @@ struct ScannerCounter; class Block; class IColumn; +namespace json_reader_detail { +Status append_null_for_malformed_json(Block& block); +void truncate_block_to_rows(Block& block, size_t num_rows); +void pop_back_last_inserted_value(Block& block, size_t column_index); +} // namespace json_reader_detail + /// JSON-specific initialization context. /// Extends ReaderInitContext with default value context (unique to JSON reader). struct JsonInitContext final : public ReaderInitContext { diff --git a/be/src/format/lance/lance_rust_reader.cpp b/be/src/format/lance/lance_rust_reader.cpp index 166bbd52dcc519..2eed2356734ca3 100644 --- a/be/src/format/lance/lance_rust_reader.cpp +++ b/be/src/format/lance/lance_rust_reader.cpp @@ -230,6 +230,7 @@ Status LanceRustReader::_do_get_next_block(Block* block, size_t* read_rows, bool const auto num_columns = record_batch->num_columns(); // Convert Arrow columns to Doris Block columns (same pattern as PaimonCppReader) + auto columns = block->mutate_columns(); for (int c = 0; c < num_columns; ++c) { const auto& field = record_batch->schema()->field(c); @@ -238,16 +239,17 @@ Status LanceRustReader::_do_get_next_block(Block* block, size_t* read_rows, bool continue; } - const ColumnWithTypeAndName& column_with_name = block->get_by_position(it->second); + const auto block_pos = it->second; + const ColumnWithTypeAndName& column_with_name = block->get_by_position(block_pos); try { RETURN_IF_ERROR(column_with_name.type->get_serde()->read_column_from_arrow( - column_with_name.column->assume_mutable_ref(), record_batch->column(c).get(), 0, - num_rows, _ctzz)); + *columns[block_pos], record_batch->column(c).get(), 0, num_rows, _ctzz)); } catch (Exception& e) { return Status::InternalError("Failed to convert Lance arrow to block: {}", e.what()); } } + 
block->set_columns(std::move(columns)); *read_rows = num_rows; *eof = false; return Status::OK(); diff --git a/be/src/format/orc/vorc_reader.cpp b/be/src/format/orc/vorc_reader.cpp index 06ffe6302dac60..afc8b09933f9dc 100644 --- a/be/src/format/orc/vorc_reader.cpp +++ b/be/src/format/orc/vorc_reader.cpp @@ -116,6 +116,40 @@ namespace doris { // TODO: we need to determine it by test. static constexpr uint32_t MAX_DICT_CODE_PREDICATE_TO_REWRITE = std::numeric_limits::max(); static constexpr char EMPTY_STRING_FOR_OVERFLOW[ColumnString::MAX_STRINGS_OVERFLOW_SIZE] = ""; + +static void fill_orc_null_map(ColumnNullable* nullable_column, const orc::ColumnVectorBatch* cvb, + size_t num_values) { + NullMap& map_data_column = nullable_column->get_null_map_data(); + const auto origin_size = map_data_column.size(); + map_data_column.resize(origin_size + num_values); + if (cvb->hasNulls) { + const auto* cvb_nulls = cvb->notNull.data(); + for (int i = 0; i < num_values; ++i) { + map_data_column[origin_size + i] = !cvb_nulls[i]; + } + } else { + memset(map_data_column.data() + origin_size, 0, num_values); + } +} + +static void align_orc_null_map(const ColumnPtr& src_column, ColumnNullable* dst_nullable_column, + size_t src_null_map_start, size_t new_rows) { + auto& dst_null_map = dst_nullable_column->get_null_map_column(); + const size_t old_rows = dst_nullable_column->get_nested_column().size(); + const size_t expected_rows = old_rows + new_rows; + if (dst_null_map.size() == expected_rows) { + return; + } + DCHECK_EQ(dst_null_map.size(), old_rows); + if (src_column->is_nullable()) { + const auto* src_nullable = assert_cast(src_column.get()); + DCHECK_GE(src_nullable->get_null_map_column().size(), src_null_map_start + new_rows); + dst_null_map.insert_range_from(src_nullable->get_null_map_column(), src_null_map_start, + new_rows); + } else { + dst_null_map.insert_many_vals(0, new_rows); + } +} // Because HIVE 0.11 & 0.12 does not support precision and scale for decimal // The 
decimal type of orc file produced by HIVE 0.11 & 0.12 are DECIMAL(0,0) // We should set a default precision and scale for these orc files. @@ -2018,13 +2052,14 @@ Status OrcReader::_fill_doris_data_column(const std::string& col_name, // Handle key column: if still missing, fill with default values if (key_is_missing) { // Fill key column with default values (nulls or empty values) - auto mutable_key_column = doris_key_column->assume_mutable(); + auto mutable_key_column = IColumn::mutate(std::move(doris_key_column)); if (mutable_key_column->is_nullable()) { auto* nullable_column = static_cast(mutable_key_column.get()); nullable_column->insert_many_defaults(element_size); } else { mutable_key_column->insert_many_defaults(element_size); } + doris_key_column = std::move(mutable_key_column); } else { // Normal processing: convert ORC column to Doris column RETURN_IF_ERROR(_orc_column_to_doris_column( @@ -2035,13 +2070,14 @@ Status OrcReader::_fill_doris_data_column(const std::string& col_name, // Handle value column: if still missing, fill with default values if (value_is_missing) { // Fill value column with default values (nulls or empty values) - auto mutable_value_column = doris_value_column->assume_mutable(); + auto mutable_value_column = IColumn::mutate(std::move(doris_value_column)); if (mutable_value_column->is_nullable()) { auto* nullable_column = static_cast(mutable_value_column.get()); nullable_column->insert_many_defaults(element_size); } else { mutable_value_column->insert_many_defaults(element_size); } + doris_value_column = std::move(mutable_value_column); } else { // Normal processing: convert ORC column to Doris column RETURN_IF_ERROR(_orc_column_to_doris_column( @@ -2106,8 +2142,10 @@ Status OrcReader::_fill_doris_data_column(const std::string& col_name, "Child field of '{}' is not nullable, but is missing in orc file", col_name); } - reinterpret_cast(doris_field->assume_mutable().get()) + auto mutable_field = IColumn::mutate(std::move(doris_field)); + 
reinterpret_cast(mutable_field.get()) ->insert_many_defaults(num_values); + doris_field = std::move(mutable_field); } for (auto read_field : read_fields) { @@ -2172,45 +2210,64 @@ Status OrcReader::_orc_column_to_doris_column( resolved_column = converter->get_column(src_type, doris_column, data_type); resolved_type = converter->get_type(); - if (resolved_column->is_nullable()) { + MutableColumnPtr mutable_resolved_column; + if (converter->is_consistent()) { + resolved_column.reset(); + mutable_resolved_column = IColumn::mutate(std::move(doris_column)); + } else { + mutable_resolved_column = IColumn::mutate(std::move(resolved_column)); + } + + size_t src_null_map_start = 0; + if (mutable_resolved_column->is_nullable()) { SCOPED_RAW_TIMER(&_statistics.decode_null_map_time); auto* nullable_column = - reinterpret_cast(resolved_column->assume_mutable().get()); + reinterpret_cast(mutable_resolved_column.get()); data_column = nullable_column->get_nested_column_ptr(); - - NullMap& map_data_column = nullable_column->get_null_map_data(); - auto origin_size = map_data_column.size(); - map_data_column.resize(origin_size + num_values); - if (cvb->hasNulls) { - const auto* cvb_nulls = cvb->notNull.data(); - for (int i = 0; i < num_values; ++i) { - map_data_column[origin_size + i] = !cvb_nulls[i]; - } - } else { - memset(map_data_column.data() + origin_size, 0, num_values); - } + src_null_map_start = nullable_column->get_null_map_column().size(); + fill_orc_null_map(nullable_column, cvb, num_values); } else { if (cvb->hasNulls) { return Status::InternalError("Not nullable column {} has null values in orc file", col_name); } - data_column = resolved_column->assume_mutable(); + data_column = std::move(mutable_resolved_column); } RETURN_IF_ERROR(_fill_doris_data_column( col_name, data_column, remove_nullable(resolved_type), root_node, orc_column_type, cvb, num_values)); - // resolve schema change + + if (mutable_resolved_column) { + data_column.reset(); + resolved_column = 
std::move(mutable_resolved_column); + } else { + resolved_column = std::move(data_column); + } + + if (converter->is_consistent()) { + doris_column = std::move(resolved_column); + return Status::OK(); + } + + doris_column = IColumn::mutate(std::move(doris_column)); auto converted_column = doris_column->assume_mutable(); + if (converted_column->is_nullable()) { + const size_t new_rows = remove_nullable(resolved_column)->size(); + align_orc_null_map(resolved_column, + reinterpret_cast(converted_column.get()), + src_null_map_start, new_rows); + } return converter->convert(resolved_column, converted_column); } else { - auto mutable_column = doris_column->assume_mutable(); + auto mutable_column = IColumn::mutate(std::move(doris_column)); if (mutable_column->is_nullable()) { auto* nullable_column = static_cast(mutable_column.get()); nullable_column->insert_many_defaults(num_values); } else { mutable_column->insert_many_defaults(num_values); } + doris_column = std::move(mutable_column); } return Status::OK(); @@ -2628,9 +2685,7 @@ Status OrcReader::_get_next_block_impl(Block* block, size_t* read_rows, bool* eo } if (can_filter_all) { - for (auto& col : columns_to_filter) { - std::move(*block->get_by_position(col).column).assume_mutable()->clear(); - } + block->clear_column_data(columns_to_filter); Block::erase_useless_column(block, column_to_keep); return _convert_dict_cols_to_string_cols(block, &batch_vec); } @@ -2802,7 +2857,9 @@ Status OrcReader::filter(orc::ColumnVectorBatch& data, uint16_t* sel, uint16_t s if (_lazy_read_ctx.resize_first_column) { // VExprContext.execute has an optimization, the filtering is executed when block->rows() > 0 // The following process may be tricky and time-consuming, but we have no other way. 
- block->get_by_position(0).column->assume_mutable()->resize(size); + auto column = IColumn::mutate(std::move(block->get_by_position(0).column)); + column->resize(size); + block->replace_by_position(0, std::move(column)); } // transactional hive orc delete row @@ -2829,26 +2886,25 @@ Status OrcReader::filter(orc::ColumnVectorBatch& data, uint16_t* sel, uint16_t s if (_lazy_read_ctx.resize_first_column) { // We have to clean the first column to insert right data. - block->get_by_position(0).column->assume_mutable()->clear(); + block->clear_column_data(std::vector {0}); } if (can_filter_all) { + std::vector columns_to_clear; + columns_to_clear.reserve(table_col_names.size() + + _lazy_read_ctx.predicate_partition_columns.size() + + _lazy_read_ctx.predicate_missing_columns.size()); for (auto& col : table_col_names) { // clean block to read predicate columns and acid columns - block->get_by_position((*_col_name_to_block_idx)[col]) - .column->assume_mutable() - ->clear(); + columns_to_clear.emplace_back((*_col_name_to_block_idx)[col]); } for (auto& col : _lazy_read_ctx.predicate_partition_columns) { - block->get_by_position((*_col_name_to_block_idx)[col.first]) - .column->assume_mutable() - ->clear(); + columns_to_clear.emplace_back((*_col_name_to_block_idx)[col.first]); } for (auto& col : _lazy_read_ctx.predicate_missing_columns) { - block->get_by_position((*_col_name_to_block_idx)[col.first]) - .column->assume_mutable() - ->clear(); + columns_to_clear.emplace_back((*_col_name_to_block_idx)[col.first]); } + block->clear_column_data(columns_to_clear); Block::erase_useless_column(block, origin_column_num); RETURN_IF_ERROR(_convert_dict_cols_to_string_cols(block, nullptr)); } diff --git a/be/src/format/parquet/parquet_column_convert.h b/be/src/format/parquet/parquet_column_convert.h index f56ad295bab968..fab6e5e98bf60e 100644 --- a/be/src/format/parquet/parquet_column_convert.h +++ b/be/src/format/parquet/parquet_column_convert.h @@ -194,6 +194,65 @@ struct ConvertParams { 
} }; +inline IColumn* get_mutable_inner_column(ColumnPtr& column) { + column = IColumn::mutate(std::move(column)); + auto mutable_column = column->assume_mutable(); + if (mutable_column->is_nullable()) { + return &assert_cast(mutable_column.get())->get_nested_column(); + } + return mutable_column.get(); +} + +inline size_t get_mutable_inner_column_size(const ColumnPtr& column) { + if (column->is_nullable()) { + const auto* nullable = assert_cast(column.get()); + return nullable->get_nested_column().size(); + } + return column->size(); +} + +inline size_t get_null_map_size_or_inner_column_size(const ColumnPtr& column) { + if (column->is_nullable()) { + const auto* nullable = assert_cast(column.get()); + return nullable->get_null_map_column().size(); + } + return column->size(); +} + +inline size_t get_appended_null_map_start(const ColumnPtr& column, size_t new_rows) { + if (!column->is_nullable()) { + return 0; + } + const auto* nullable = assert_cast(column.get()); + const size_t null_map_size = nullable->get_null_map_column().size(); + DCHECK_GE(null_map_size, new_rows); + return null_map_size - new_rows; +} + +inline void align_null_map(ColumnPtr& src_column, ColumnPtr& dst_column, size_t old_null_map_size, + size_t new_rows, size_t src_null_map_start = 0) { + if (!dst_column->is_nullable()) { + return; + } + + dst_column = IColumn::mutate(std::move(dst_column)); + auto* dst_nullable = assert_cast(dst_column->assume_mutable().get()); + auto& dst_null_map = dst_nullable->get_null_map_column(); + const size_t expected_rows = old_null_map_size + new_rows; + if (dst_null_map.size() == expected_rows) { + return; + } + DCHECK_EQ(dst_null_map.size(), old_null_map_size); + if (src_column->is_nullable()) { + const auto* src_nullable = assert_cast(src_column.get()); + DCHECK_GE(src_nullable->get_null_map_column().size(), src_null_map_start + new_rows); + dst_null_map.insert_range_from(src_nullable->get_null_map_column(), src_null_map_start, + new_rows); + } else { + 
dst_null_map.insert_many_vals(0, new_rows); + } +} + /** * Convert parquet physical column to logical column * In parquet document(https://github.com/apache/parquet-format/blob/master/LogicalTypes.md), @@ -254,26 +313,46 @@ class PhysicalToLogicalConverter { PrimitiveType::TYPE_INT, dst_logical_type->is_nullable()); } if (is_consistent() && _logical_converter->is_consistent()) { + dst_logical_col = std::move(src_physical_col); return Status::OK(); } + if (_logical_converter->is_consistent()) { + const size_t old_rows = get_mutable_inner_column_size(dst_logical_col); + const size_t old_null_map_size = + get_null_map_size_or_inner_column_size(dst_logical_col); + RETURN_IF_ERROR(physical_convert(src_physical_col, dst_logical_col)); + const size_t new_rows = get_mutable_inner_column_size(dst_logical_col) - old_rows; + align_null_map(src_physical_col, dst_logical_col, old_null_map_size, new_rows, + get_appended_null_map_start(src_physical_col, new_rows)); + return Status::OK(); + } + ColumnPtr src_logical_column; if (is_consistent()) { - if (dst_logical_type->is_nullable()) { - auto doris_nullable_column = - assert_cast(dst_logical_col.get()); - src_logical_column = - ColumnNullable::create(_cached_src_physical_column, - doris_nullable_column->get_null_map_column_ptr()); - } else { - src_logical_column = _cached_src_physical_column; - } + src_logical_column = src_physical_col; } else { src_logical_column = _logical_converter->get_column(src_logical_type, dst_logical_col, dst_logical_type); } + const size_t src_old_rows = get_mutable_inner_column_size(src_logical_column); + const size_t src_old_null_map_size = + get_null_map_size_or_inner_column_size(src_logical_column); RETURN_IF_ERROR(physical_convert(src_physical_col, src_logical_column)); + const size_t src_new_rows = + get_mutable_inner_column_size(src_logical_column) - src_old_rows; + align_null_map(src_physical_col, src_logical_column, src_old_null_map_size, src_new_rows, + 
get_appended_null_map_start(src_physical_col, src_new_rows)); + + dst_logical_col = IColumn::mutate(std::move(dst_logical_col)); + const size_t dst_old_rows = get_mutable_inner_column_size(dst_logical_col); + const size_t dst_old_null_map_size = + get_null_map_size_or_inner_column_size(dst_logical_col); auto converted_column = dst_logical_col->assume_mutable(); - return _logical_converter->convert(src_logical_column, converted_column); + RETURN_IF_ERROR(_logical_converter->convert(src_logical_column, converted_column)); + const size_t dst_new_rows = get_mutable_inner_column_size(dst_logical_col) - dst_old_rows; + align_null_map(src_logical_column, dst_logical_col, dst_old_null_map_size, dst_new_rows, + get_appended_null_map_start(src_logical_column, dst_new_rows)); + return Status::OK(); } virtual ColumnPtr get_physical_column(tparquet::Type::type src_physical_type, @@ -283,6 +362,11 @@ class PhysicalToLogicalConverter { DataTypePtr& get_physical_type() { return _cached_src_physical_type; } + bool read_directly_into_dst_logical_column() { + return !_convert_params->is_type_compatibility && is_consistent() && + _logical_converter->is_consistent(); + } + virtual bool is_consistent() { return false; } virtual bool support() { return true; } @@ -319,14 +403,14 @@ class LittleIntPhysicalConverter : public PhysicalToLogicalConverter { using DstCppType = typename PrimitiveTypeTraits::CppType; using DstColumnType = typename PrimitiveTypeTraits::ColumnType; ColumnPtr from_col = remove_nullable(src_physical_col); - MutableColumnPtr to_col = remove_nullable(src_logical_column)->assume_mutable(); + IColumn* to_col = get_mutable_inner_column(src_logical_column); size_t rows = from_col->size(); // always comes from tparquet::Type::INT32 auto& src_data = assert_cast(from_col.get())->get_data(); size_t start_idx = to_col->size(); to_col->resize(start_idx + rows); - auto& data = assert_cast(*to_col.get()).get_data(); + auto& data = assert_cast(*to_col).get_data(); for (int i = 0; i 
< rows; ++i) { data[start_idx + i] = static_cast(src_data[i]); } @@ -378,13 +462,13 @@ class UnsignedIntegerConverter : public PhysicalToLogicalConverter { using DstColumnType = typename PrimitiveTypeTraits::ColumnType; ColumnPtr from_col = remove_nullable(src_physical_col); - MutableColumnPtr to_col = remove_nullable(src_logical_column)->assume_mutable(); + IColumn* to_col = get_mutable_inner_column(src_logical_column); auto& src_data = assert_cast(from_col.get())->get_data(); size_t rows = src_data.size(); size_t start_idx = to_col->size(); to_col->resize(start_idx + rows); - auto& data = assert_cast(*to_col.get()).get_data(); + auto& data = assert_cast(*to_col).get_data(); for (int i = 0; i < rows; i++) { StorageCppType src_value = src_data[i]; @@ -405,12 +489,12 @@ class FixedSizeBinaryConverter : public PhysicalToLogicalConverter { Status physical_convert(ColumnPtr& src_physical_col, ColumnPtr& src_logical_column) override { ColumnPtr from_col = remove_nullable(src_physical_col); - MutableColumnPtr to_col = remove_nullable(src_logical_column)->assume_mutable(); + IColumn* to_col = get_mutable_inner_column(src_logical_column); auto* src_data = assert_cast(from_col.get()); size_t length = src_data->size(); size_t num_values = length / _type_length; - auto& string_col = static_cast(*to_col.get()); + auto& string_col = static_cast(*to_col); auto& offsets = string_col.get_offsets(); auto& chars = string_col.get_chars(); @@ -441,12 +525,12 @@ class Float16PhysicalConverter : public PhysicalToLogicalConverter { Status physical_convert(ColumnPtr& src_physical_col, ColumnPtr& src_logical_column) override { ColumnPtr from_col = remove_nullable(src_physical_col); - MutableColumnPtr to_col = remove_nullable(src_logical_column)->assume_mutable(); + IColumn* to_col = get_mutable_inner_column(src_logical_column); const auto* src_data = assert_cast(from_col.get()); size_t length = src_data->size(); size_t num_values = length / _type_length; - auto* to_float_column = 
assert_cast(to_col.get()); + auto* to_float_column = assert_cast(to_col); size_t start_idx = to_float_column->size(); to_float_column->resize(start_idx + num_values); auto& to_float_column_data = to_float_column->get_data(); @@ -528,15 +612,8 @@ class UUIDVarBinaryConverter : public PhysicalToLogicalConverter { uint8_col = &assert_cast(*src_physical_col); } - MutableColumnPtr to_col = nullptr; - // nullmap flag seems have been handled in upper level - if (src_logical_column->is_nullable()) { - const auto* nullable = assert_cast(src_logical_column.get()); - to_col = nullable->get_nested_column_ptr()->assume_mutable(); - } else { - to_col = src_logical_column->assume_mutable(); - } - auto* to_varbinary_column = assert_cast(to_col.get()); + IColumn* to_col = get_mutable_inner_column(src_logical_column); + auto* to_varbinary_column = assert_cast(to_col); size_t length = uint8_col->size(); size_t num_values = length / _type_length; const auto* ptr = uint8_col->get_data().data(); @@ -561,7 +638,7 @@ class FixedSizeToDecimal : public PhysicalToLogicalConverter { Status physical_convert(ColumnPtr& src_physical_col, ColumnPtr& src_logical_column) override { ColumnPtr src_col = remove_nullable(src_physical_col); - MutableColumnPtr dst_col = remove_nullable(src_logical_column)->assume_mutable(); + IColumn* dst_col = get_mutable_inner_column(src_logical_column); #define M(FixedTypeLength, ValueCopyType) \ case FixedTypeLength: \ @@ -612,13 +689,13 @@ class FixedSizeToDecimal : public PhysicalToLogicalConverter { } template - Status _convert_internal(ColumnPtr& src_col, MutableColumnPtr& dst_col) { + Status _convert_internal(ColumnPtr& src_col, IColumn* dst_col) { size_t rows = src_col->size() / fixed_type_length; auto* buf = static_cast(src_col.get())->get_data().data(); size_t start_idx = dst_col->size(); dst_col->resize(start_idx + rows); - auto& data = static_cast*>(dst_col.get())->get_data(); + auto& data = static_cast*>(dst_col)->get_data(); size_t offset = 0; for (int i 
= 0; i < rows; i++) { // When Decimal in parquet is stored in byte arrays, binary and fixed, @@ -645,7 +722,7 @@ class StringToDecimal : public PhysicalToLogicalConverter { Status physical_convert(ColumnPtr& src_physical_col, ColumnPtr& src_logical_column) override { using ValueCopyType = DecimalType::NativeType; ColumnPtr src_col = remove_nullable(src_physical_col); - MutableColumnPtr dst_col = remove_nullable(src_logical_column)->assume_mutable(); + IColumn* dst_col = get_mutable_inner_column(src_logical_column); size_t rows = src_col->size(); auto buf = static_cast(src_col.get())->get_chars().data(); @@ -653,7 +730,7 @@ class StringToDecimal : public PhysicalToLogicalConverter { size_t start_idx = dst_col->size(); dst_col->resize(start_idx + rows); - auto& data = static_cast*>(dst_col.get())->get_data(); + auto& data = static_cast*>(dst_col)->get_data(); for (int i = 0; i < rows; i++) { size_t len = offset[i] - offset[i - 1]; // When Decimal in parquet is stored in byte arrays, binary and fixed, @@ -678,7 +755,7 @@ class NumberToDecimal : public PhysicalToLogicalConverter { Status physical_convert(ColumnPtr& src_physical_col, ColumnPtr& src_logical_column) override { using ValueCopyType = typename DecimalType::NativeType; ColumnPtr src_col = remove_nullable(src_physical_col); - MutableColumnPtr dst_col = remove_nullable(src_logical_column)->assume_mutable(); + IColumn* dst_col = get_mutable_inner_column(src_logical_column); size_t rows = src_col->size(); auto* src_data = @@ -686,7 +763,7 @@ class NumberToDecimal : public PhysicalToLogicalConverter { size_t start_idx = dst_col->size(); dst_col->resize(start_idx + rows); - auto* data = static_cast*>(dst_col.get())->get_data().data(); + auto* data = static_cast*>(dst_col)->get_data().data(); for (int i = 0; i < rows; i++) { ValueCopyType value; @@ -706,14 +783,14 @@ class NumberToDecimal : public PhysicalToLogicalConverter { class Int32ToDate : public PhysicalToLogicalConverter { Status physical_convert(ColumnPtr& 
src_physical_col, ColumnPtr& src_logical_column) override { ColumnPtr src_col = remove_nullable(src_physical_col); - MutableColumnPtr dst_col = remove_nullable(src_logical_column)->assume_mutable(); + IColumn* dst_col = get_mutable_inner_column(src_logical_column); size_t rows = src_col->size(); size_t start_idx = dst_col->size(); dst_col->reserve(start_idx + rows); auto& src_data = static_cast(src_col.get())->get_data(); - auto& data = static_cast(dst_col.get())->get_data(); + auto& data = static_cast(dst_col)->get_data(); date_day_offset_dict& date_dict = date_day_offset_dict::get(); for (int i = 0; i < rows; i++) { @@ -727,14 +804,14 @@ class Int32ToDate : public PhysicalToLogicalConverter { struct Int64ToTimestamp : public PhysicalToLogicalConverter { Status physical_convert(ColumnPtr& src_physical_col, ColumnPtr& src_logical_column) override { ColumnPtr src_col = remove_nullable(src_physical_col); - MutableColumnPtr dst_col = remove_nullable(src_logical_column)->assume_mutable(); + IColumn* dst_col = get_mutable_inner_column(src_logical_column); size_t rows = src_col->size(); size_t start_idx = dst_col->size(); dst_col->resize(start_idx + rows); auto src_data = static_cast(src_col.get())->get_data().data(); - auto& data = static_cast(dst_col.get())->get_data(); + auto& data = static_cast(dst_col)->get_data(); for (int i = 0; i < rows; i++) { int64_t x = src_data[i]; @@ -760,14 +837,14 @@ struct Int64ToTimestamp : public PhysicalToLogicalConverter { struct Int64ToTimestampTz : public PhysicalToLogicalConverter { Status physical_convert(ColumnPtr& src_physical_col, ColumnPtr& src_logical_column) override { ColumnPtr src_col = remove_nullable(src_physical_col); - MutableColumnPtr dst_col = remove_nullable(src_logical_column)->assume_mutable(); + IColumn* dst_col = get_mutable_inner_column(src_logical_column); size_t rows = src_col->size(); size_t start_idx = dst_col->size(); dst_col->resize(start_idx + rows); const auto& src_data = 
assert_cast(src_col.get())->get_data(); - auto& dest_data = assert_cast(dst_col.get())->get_data(); + auto& dest_data = assert_cast(dst_col)->get_data(); static const cctz::time_zone UTC = cctz::utc_time_zone(); for (int i = 0; i < rows; i++) { @@ -784,14 +861,14 @@ struct Int64ToTimestampTz : public PhysicalToLogicalConverter { struct Int96toTimestamp : public PhysicalToLogicalConverter { Status physical_convert(ColumnPtr& src_physical_col, ColumnPtr& src_logical_column) override { ColumnPtr src_col = remove_nullable(src_physical_col); - MutableColumnPtr dst_col = remove_nullable(src_logical_column)->assume_mutable(); + IColumn* dst_col = get_mutable_inner_column(src_logical_column); size_t rows = src_col->size() / sizeof(ParquetInt96); auto& src_data = static_cast(src_col.get())->get_data(); auto ParquetInt96_data = (ParquetInt96*)src_data.data(); size_t start_idx = dst_col->size(); dst_col->resize(start_idx + rows); - auto& data = static_cast(dst_col.get())->get_data(); + auto& data = static_cast(dst_col)->get_data(); for (int i = 0; i < rows; i++) { ParquetInt96 src_cell_data = ParquetInt96_data[i]; @@ -818,14 +895,14 @@ struct Int96toTimestamp : public PhysicalToLogicalConverter { struct Int96toTimestampTz : public PhysicalToLogicalConverter { Status physical_convert(ColumnPtr& src_physical_col, ColumnPtr& src_logical_column) override { ColumnPtr src_col = remove_nullable(src_physical_col); - MutableColumnPtr dst_col = remove_nullable(src_logical_column)->assume_mutable(); + IColumn* dst_col = get_mutable_inner_column(src_logical_column); size_t rows = src_col->size() / sizeof(ParquetInt96); const auto& src_data = assert_cast(src_col.get())->get_data(); auto* ParquetInt96_data = (ParquetInt96*)src_data.data(); size_t start_idx = dst_col->size(); dst_col->resize(start_idx + rows); - auto& data = assert_cast(dst_col.get())->get_data(); + auto& data = assert_cast(dst_col)->get_data(); static const cctz::time_zone UTC = cctz::utc_time_zone(); for (int i = 0; i < 
rows; i++) { diff --git a/be/src/format/parquet/vparquet_column_reader.cpp b/be/src/format/parquet/vparquet_column_reader.cpp index ba7d42a5aed84e..1deffec6a04633 100644 --- a/be/src/format/parquet/vparquet_column_reader.cpp +++ b/be/src/format/parquet/vparquet_column_reader.cpp @@ -328,12 +328,11 @@ Status ScalarColumnReader::_read_values(size_t num_ MutableColumnPtr data_column; std::vector null_map; NullMap* map_data_column = nullptr; + doris_column = IColumn::mutate(std::move(doris_column)); if (doris_column->is_nullable()) { SCOPED_RAW_TIMER(&_decode_null_map_time); - // doris_column either originates from a mutable block in vparquet_group_reader - // or is a newly created ColumnPtr, and therefore can be modified. - auto* nullable_column = - assert_cast(const_cast(doris_column.get())); + auto mutable_column = doris_column->assume_mutable(); + auto* nullable_column = assert_cast(mutable_column.get()); data_column = nullable_column->get_nested_column_ptr(); map_data_column = &(nullable_column->get_null_map_data()); @@ -411,12 +410,11 @@ Status ScalarColumnReader::_read_nested_column( // Handle nullable columns MutableColumnPtr data_column; NullMap* map_data_column = nullptr; + doris_column = IColumn::mutate(std::move(doris_column)); if (doris_column->is_nullable()) { SCOPED_RAW_TIMER(&_decode_null_map_time); - // doris_column either originates from a mutable block in vparquet_group_reader - // or is a newly created ColumnPtr, and therefore can be modified. 
- auto* nullable_column = - const_cast(assert_cast(doris_column.get())); + auto mutable_column = doris_column->assume_mutable(); + auto* nullable_column = assert_cast(mutable_column.get()); data_column = nullable_column->get_nested_column_ptr(); map_data_column = &(nullable_column->get_null_map_data()); } else { @@ -550,6 +548,10 @@ Status ScalarColumnReader::read_column_data( ColumnPtr resolved_column = _converter->get_physical_column(_field_schema->physical_type, _field_schema->data_type, doris_column, type, is_dict_filter); + if (_converter->read_directly_into_dst_logical_column()) { + DCHECK_EQ(resolved_column.get(), doris_column.get()); + resolved_column = std::move(doris_column); + } DataTypePtr& resolved_type = _converter->get_physical_type(); _def_levels.clear(); @@ -658,6 +660,7 @@ Status ArrayColumnReader::read_column_data( int64_t real_column_size) { MutableColumnPtr data_column; NullMap* null_map_ptr = nullptr; + doris_column = IColumn::mutate(std::move(doris_column)); if (doris_column->is_nullable()) { auto mutable_column = doris_column->assume_mutable(); auto* nullable_column = assert_cast(mutable_column.get()); @@ -713,6 +716,7 @@ Status MapColumnReader::read_column_data( int64_t real_column_size) { MutableColumnPtr data_column; NullMap* null_map_ptr = nullptr; + doris_column = IColumn::mutate(std::move(doris_column)); if (doris_column->is_nullable()) { auto mutable_column = doris_column->assume_mutable(); auto* nullable_column = assert_cast(mutable_column.get()); @@ -789,6 +793,7 @@ Status StructColumnReader::read_column_data( int64_t real_column_size) { MutableColumnPtr data_column; NullMap* null_map_ptr = nullptr; + doris_column = IColumn::mutate(std::move(doris_column)); if (doris_column->is_nullable()) { auto mutable_column = doris_column->assume_mutable(); auto* nullable_column = assert_cast(mutable_column.get()); @@ -986,6 +991,7 @@ Status StructColumnReader::read_column_data( auto& doris_field = doris_struct.get_column_ptr(idx); auto& 
doris_type = doris_struct_type->get_element(idx); DCHECK(doris_type->is_nullable()); + doris_field = IColumn::mutate(std::move(doris_field)); auto mutable_column = doris_field->assume_mutable(); auto* nullable_column = static_cast(mutable_column.get()); nullable_column->insert_many_defaults(missing_column_sz); diff --git a/be/src/format/parquet/vparquet_column_reader.h b/be/src/format/parquet/vparquet_column_reader.h index 9d9fd2280c88f8..8673361eb46dd6 100644 --- a/be/src/format/parquet/vparquet_column_reader.h +++ b/be/src/format/parquet/vparquet_column_reader.h @@ -482,6 +482,7 @@ class SkipReadingReader : public ParquetColumnReader { // Simulate reading without actually reading data // Fill with default/null values based on column type + doris_column = IColumn::mutate(std::move(doris_column)); MutableColumnPtr data_column = doris_column->assume_mutable(); if (real_column_size > 0) { diff --git a/be/src/format/parquet/vparquet_group_reader.cpp b/be/src/format/parquet/vparquet_group_reader.cpp index 7d910e49203062..f2db75afd6d1b0 100644 --- a/be/src/format/parquet/vparquet_group_reader.cpp +++ b/be/src/format/parquet/vparquet_group_reader.cpp @@ -35,6 +35,7 @@ #include "core/assert_cast.h" #include "core/block/block.h" #include "core/block/column_with_type_and_name.h" +#include "core/column/column.h" #include "core/column/column_const.h" #include "core/column/column_nullable.h" #include "core/column/column_string.h" @@ -415,9 +416,7 @@ Status RowGroupReader::next_batch(Block* block, size_t batch_size, size_t* read_ } if (can_filter_all) { - for (auto& col : columns_to_filter) { - std::move(*block->get_by_position(col).column).assume_mutable()->clear(); - } + block->clear_column_data(columns_to_filter); Block::erase_useless_column(block, column_to_keep); RETURN_IF_ERROR(_convert_dict_cols_to_string_cols(block)); return Status::OK(); @@ -668,7 +667,9 @@ Status RowGroupReader::_do_lazy_read(Block* block, size_t batch_size, size_t* re if 
(_lazy_read_ctx.resize_first_column) { // VExprContext.execute has an optimization, the filtering is executed when block->rows() > 0 // The following process may be tricky and time-consuming, but we have no other way. - block->get_by_position(0).column->assume_mutable()->resize(pre_read_rows); + auto column = IColumn::mutate(std::move(block->get_by_position(0).column)); + column->resize(pre_read_rows); + block->replace_by_position(0, std::move(column)); } result_filter.assign(pre_read_rows, static_cast(1)); std::vector filters; @@ -693,7 +694,7 @@ Status RowGroupReader::_do_lazy_read(Block* block, size_t batch_size, size_t* re if (_lazy_read_ctx.resize_first_column) { // We have to clean the first column to insert right data. - block->get_by_position(0).column->assume_mutable()->clear(); + block->clear_column_data(std::vector {0}); } } @@ -703,22 +704,27 @@ Status RowGroupReader::_do_lazy_read(Block* block, size_t batch_size, size_t* re if (filter_map_ptr->filter_all()) { { SCOPED_RAW_TIMER(&_predicate_filter_time); + std::vector columns_to_clear; + columns_to_clear.reserve(_lazy_read_ctx.predicate_columns.first.size() + + _lazy_read_ctx.predicate_partition_columns.size() + + _lazy_read_ctx.predicate_missing_columns.size()); for (const auto& col : _lazy_read_ctx.predicate_columns.first) { // clean block to read predicate columns uint32_t block_pos = 0; RETURN_IF_ERROR(_get_block_column_pos(*block, col, &block_pos)); - block->get_by_position(block_pos).column->assume_mutable()->clear(); + columns_to_clear.emplace_back(block_pos); } for (const auto& col : _lazy_read_ctx.predicate_partition_columns) { uint32_t block_pos = 0; RETURN_IF_ERROR(_get_block_column_pos(*block, col.first, &block_pos)); - block->get_by_position(block_pos).column->assume_mutable()->clear(); + columns_to_clear.emplace_back(block_pos); } for (const auto& col : _lazy_read_ctx.predicate_missing_columns) { uint32_t block_pos = 0; RETURN_IF_ERROR(_get_block_column_pos(*block, col.first, &block_pos)); 
- block->get_by_position(block_pos).column->assume_mutable()->clear(); + columns_to_clear.emplace_back(block_pos); } + block->clear_column_data(columns_to_clear); RETURN_IF_ERROR(_table_format_reader->clear_synthesized_columns(block)); RETURN_IF_ERROR(_table_format_reader->clear_generated_columns(block)); Block::erase_useless_column(block, origin_column_num); diff --git a/be/src/format/table/equality_delete.cpp b/be/src/format/table/equality_delete.cpp index 82deb7bd59c20a..dc94d8151f2048 100644 --- a/be/src/format/table/equality_delete.cpp +++ b/be/src/format/table/equality_delete.cpp @@ -68,9 +68,8 @@ Status SimpleEqualityDelete::filter_data_block( const NullMap& null_map = reinterpret_cast(column_and_type.column.get()) ->get_null_map_data(); - _hybrid_set->find_batch_nullable( - remove_nullable(column_and_type.column)->assume_mutable_ref(), rows, null_map, - *_single_filter); + _hybrid_set->find_batch_nullable(*remove_nullable(column_and_type.column), rows, null_map, + *_single_filter); if (_hybrid_set->contain_null()) { auto* filter_data = _single_filter->data(); for (size_t i = 0; i < rows; ++i) { @@ -78,8 +77,7 @@ Status SimpleEqualityDelete::filter_data_block( } } } else { - _hybrid_set->find_batch(column_and_type.column->assume_mutable_ref(), rows, - *_single_filter); + _hybrid_set->find_batch(*column_and_type.column, rows, *_single_filter); } // should reverse _filter auto* filter_data = filter.data(); diff --git a/be/src/format/table/iceberg_reader_mixin.h b/be/src/format/table/iceberg_reader_mixin.h index 42c80c9b7d4ddc..c02cecfb0430b0 100644 --- a/be/src/format/table/iceberg_reader_mixin.h +++ b/be/src/format/table/iceberg_reader_mixin.h @@ -554,6 +554,7 @@ Status IcebergReaderMixin::_equality_delete_base( if (read_rows > 0) { MutableBlock mutable_block(&eq_file_block); RETURN_IF_ERROR(mutable_block.merge(tmp_block)); + eq_file_block = mutable_block.to_block(); } } } @@ -586,13 +587,12 @@ Status IcebergReaderMixin::_expand_block_if_need(Block* block) { 
auto block_names = block->get_names(); names.insert(block_names.begin(), block_names.end()); for (auto& col : _expand_columns) { - col.column->assume_mutable()->clear(); if (names.contains(col.name)) { return Status::InternalError("Wrong expand column '{}'", col.name); } names.insert(col.name); (*this->col_name_to_block_idx_ref())[col.name] = static_cast(block->columns()); - block->insert(col); + block->insert({col.type->create_column(), col.type, col.name}); } return Status::OK(); } diff --git a/be/src/format/table/paimon_cpp_reader.cpp b/be/src/format/table/paimon_cpp_reader.cpp index 4925bbb3e7a9bd..e628c30af737ba 100644 --- a/be/src/format/table/paimon_cpp_reader.cpp +++ b/be/src/format/table/paimon_cpp_reader.cpp @@ -117,6 +117,7 @@ Status PaimonCppReader::_do_get_next_block(Block* block, size_t* read_rows, bool auto record_batch = std::move(import_result).ValueUnsafe(); const auto num_rows = static_cast(record_batch->num_rows()); const auto num_columns = record_batch->num_columns(); + auto columns = block->mutate_columns(); for (int c = 0; c < num_columns; ++c) { const auto& field = record_batch->schema()->field(c); if (field->name() == VALUE_KIND_FIELD) { @@ -128,16 +129,17 @@ Status PaimonCppReader::_do_get_next_block(Block* block, size_t* read_rows, bool // Skip columns that are not in the block (e.g., partition columns handled elsewhere) continue; } - const ColumnWithTypeAndName& column_with_name = block->get_by_position(it->second); + const auto block_pos = it->second; + const ColumnWithTypeAndName& column_with_name = block->get_by_position(block_pos); try { RETURN_IF_ERROR(column_with_name.type->get_serde()->read_column_from_arrow( - column_with_name.column->assume_mutable_ref(), record_batch->column(c).get(), 0, - num_rows, _ctzz)); + *columns[block_pos], record_batch->column(c).get(), 0, num_rows, _ctzz)); } catch (Exception& e) { return Status::InternalError("Failed to convert from arrow to block: {}", e.what()); } } + 
block->set_columns(std::move(columns)); *read_rows = num_rows; *eof = false; return Status::OK(); diff --git a/be/src/format/table/parquet_metadata_reader.cpp b/be/src/format/table/parquet_metadata_reader.cpp index 7df80f673cb602..054bd1929a2e16 100644 --- a/be/src/format/table/parquet_metadata_reader.cpp +++ b/be/src/format/table/parquet_metadata_reader.cpp @@ -29,6 +29,7 @@ #include #include "core/block/block.h" +#include "core/column/column.h" #include "core/column/column_map.h" #include "core/column/column_nullable.h" #include "core/data_type/data_type_nullable.h" @@ -810,9 +811,9 @@ Status ParquetMetadataReader::_do_get_next_block(Block* block, size_t* read_rows bool mem_reuse = block->mem_reuse(); std::vector columns(_slots.size()); if (mem_reuse) { - block->clear_column_data(); for (size_t i = 0; i < _slots.size(); ++i) { - columns[i] = block->get_by_position(i).column->assume_mutable(); + columns[i] = IColumn::mutate(std::move(block->get_by_position(i).column)); + columns[i]->clear(); } } else { for (size_t i = 0; i < _slots.size(); ++i) { @@ -829,7 +830,7 @@ Status ParquetMetadataReader::_do_get_next_block(Block* block, size_t* read_rows std::move(columns[i]), _slots[i]->get_data_type_ptr(), _slots[i]->col_name())); } } else { - columns.clear(); + block->set_columns(std::move(columns)); } size_t produced = block->rows() - rows_before; diff --git a/be/src/format/table/remote_doris_reader.cpp b/be/src/format/table/remote_doris_reader.cpp index 5280b655a63ef8..487aad2869b90d 100644 --- a/be/src/format/table/remote_doris_reader.cpp +++ b/be/src/format/table/remote_doris_reader.cpp @@ -72,6 +72,7 @@ Status RemoteDorisReader::_do_get_next_block(Block* block, size_t* read_rows, bo auto batch = chunk.data; auto num_rows = batch->num_rows(); auto num_columns = batch->num_columns(); + auto columns = block->mutate_columns(); for (int c = 0; c < num_columns; ++c) { arrow::Array* column = batch->column(c).get(); @@ -82,10 +83,10 @@ Status 
RemoteDorisReader::_do_get_next_block(Block* block, size_t* read_rows, bo } try { - const ColumnWithTypeAndName& column_with_name = - block->get_by_position((*_col_name_to_block_idx)[column_name]); + auto block_pos = (*_col_name_to_block_idx)[column_name]; + const ColumnWithTypeAndName& column_with_name = block->get_by_position(block_pos); RETURN_IF_ERROR(column_with_name.type->get_serde()->read_column_from_arrow( - column_with_name.column->assume_mutable_ref(), column, 0, num_rows, _ctzz)); + *columns[block_pos], column, 0, num_rows, _ctzz)); } catch (Exception& e) { return Status::InternalError( "Failed to convert from arrow to block, column_name: {}, e: {}", column_name, @@ -93,6 +94,7 @@ Status RemoteDorisReader::_do_get_next_block(Block* block, size_t* read_rows, bo } } + block->set_columns(std::move(columns)); *read_rows += num_rows; return Status::OK(); diff --git a/be/src/format/table/table_format_reader.h b/be/src/format/table/table_format_reader.h index 23587ab1095700..9beff637b98533 100644 --- a/be/src/format/table/table_format_reader.h +++ b/be/src/format/table/table_format_reader.h @@ -147,7 +147,7 @@ class TableFormatReader : public GenericReader { if (col_pos < 0) { continue; } - block->get_by_position(static_cast(col_pos)).column->assume_mutable()->clear(); + block->clear_column_data(std::vector {static_cast(col_pos)}); } return Status::OK(); } @@ -212,7 +212,7 @@ class TableFormatReader : public GenericReader { if (col_pos < 0) { continue; } - block->get_by_position(static_cast(col_pos)).column->assume_mutable()->clear(); + block->clear_column_data(std::vector {static_cast(col_pos)}); } return Status::OK(); } diff --git a/be/src/information_schema/schema_active_queries_scanner.cpp b/be/src/information_schema/schema_active_queries_scanner.cpp index 00f0c5b5de763e..de0844af8abc93 100644 --- a/be/src/information_schema/schema_active_queries_scanner.cpp +++ b/be/src/information_schema/schema_active_queries_scanner.cpp @@ -133,6 +133,7 @@ Status 
SchemaActiveQueriesScanner::get_next_block_internal(Block* block, bool* e int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); MutableBlock mblock = MutableBlock::build_mutable_block(block); RETURN_IF_ERROR(mblock.add_rows(_active_query_block.get(), _row_idx, current_batch_rows)); + block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; *eos = _row_idx == _total_rows; diff --git a/be/src/information_schema/schema_authentication_integrations_scanner.cpp b/be/src/information_schema/schema_authentication_integrations_scanner.cpp index 95359b58264d15..4cbf55b198d31b 100644 --- a/be/src/information_schema/schema_authentication_integrations_scanner.cpp +++ b/be/src/information_schema/schema_authentication_integrations_scanner.cpp @@ -137,6 +137,7 @@ Status SchemaAuthenticationIntegrationsScanner::get_next_block_internal(Block* b MutableBlock mblock = MutableBlock::build_mutable_block(block); RETURN_IF_ERROR(mblock.add_rows(_authentication_integrations_block.get(), _row_idx, current_batch_rows)); + block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; *eos = _row_idx == _total_rows; diff --git a/be/src/information_schema/schema_backend_active_tasks.cpp b/be/src/information_schema/schema_backend_active_tasks.cpp index b41f116b7550af..ddb15b84aa409d 100644 --- a/be/src/information_schema/schema_backend_active_tasks.cpp +++ b/be/src/information_schema/schema_backend_active_tasks.cpp @@ -89,10 +89,11 @@ Status SchemaBackendActiveTasksScanner::get_next_block_internal(Block* block, bo int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); MutableBlock mblock = MutableBlock::build_mutable_block(block); RETURN_IF_ERROR(mblock.add_rows(_task_stats_block.get(), _row_idx, current_batch_rows)); + block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; *eos = _row_idx == _total_rows; return Status::OK(); } -} // namespace doris \ No 
newline at end of file +} // namespace doris diff --git a/be/src/information_schema/schema_backend_kerberos_ticket_cache.cpp b/be/src/information_schema/schema_backend_kerberos_ticket_cache.cpp index 3c7b1ec0bc5c9a..5b25a84304d1bb 100644 --- a/be/src/information_schema/schema_backend_kerberos_ticket_cache.cpp +++ b/be/src/information_schema/schema_backend_kerberos_ticket_cache.cpp @@ -85,6 +85,7 @@ Status SchemaBackendKerberosTicketCacheScanner::get_next_block_internal(Block* b int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); MutableBlock mblock = MutableBlock::build_mutable_block(block); RETURN_IF_ERROR(mblock.add_rows(_info_block.get(), _row_idx, current_batch_rows)); + block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; *eos = _row_idx == _total_rows; diff --git a/be/src/information_schema/schema_catalog_meta_cache_stats_scanner.cpp b/be/src/information_schema/schema_catalog_meta_cache_stats_scanner.cpp index fec899c252a933..18e490f09b3fed 100644 --- a/be/src/information_schema/schema_catalog_meta_cache_stats_scanner.cpp +++ b/be/src/information_schema/schema_catalog_meta_cache_stats_scanner.cpp @@ -145,6 +145,7 @@ Status SchemaCatalogMetaCacheStatsScanner::get_next_block_internal(Block* block, int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); MutableBlock mblock = MutableBlock::build_mutable_block(block); RETURN_IF_ERROR(mblock.add_rows(_block.get(), _row_idx, current_batch_rows)); + block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; *eos = _row_idx == _total_rows; diff --git a/be/src/information_schema/schema_database_properties_scanner.cpp b/be/src/information_schema/schema_database_properties_scanner.cpp index c73dd9301e056d..d1427fe43e915f 100644 --- a/be/src/information_schema/schema_database_properties_scanner.cpp +++ b/be/src/information_schema/schema_database_properties_scanner.cpp @@ -149,6 +149,7 @@ Status 
SchemaDatabasePropertiesScanner::get_next_block_internal(Block* block, bo int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); MutableBlock mblock = MutableBlock::build_mutable_block(block); RETURN_IF_ERROR(mblock.add_rows(_dbproperties_block.get(), _row_idx, current_batch_rows)); + block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; if (!check_and_mark_eos(eos)) { *eos = false; diff --git a/be/src/information_schema/schema_file_cache_statistics.cpp b/be/src/information_schema/schema_file_cache_statistics.cpp index 0b69766bbeeae9..5be2df30d53b11 100644 --- a/be/src/information_schema/schema_file_cache_statistics.cpp +++ b/be/src/information_schema/schema_file_cache_statistics.cpp @@ -77,6 +77,7 @@ Status SchemaFileCacheStatisticsScanner::get_next_block_internal(Block* block, b int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); MutableBlock mblock = MutableBlock::build_mutable_block(block); RETURN_IF_ERROR(mblock.add_rows(_stats_block.get(), _row_idx, current_batch_rows)); + block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; *eos = _row_idx == _total_rows; diff --git a/be/src/information_schema/schema_partitions_scanner.cpp b/be/src/information_schema/schema_partitions_scanner.cpp index 834fd928f7126e..87c0ce078b787d 100644 --- a/be/src/information_schema/schema_partitions_scanner.cpp +++ b/be/src/information_schema/schema_partitions_scanner.cpp @@ -210,6 +210,7 @@ Status SchemaPartitionsScanner::get_next_block_internal(Block* block, bool* eos) int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); MutableBlock mblock = MutableBlock::build_mutable_block(block); RETURN_IF_ERROR(mblock.add_rows(_partitions_block.get(), _row_idx, current_batch_rows)); + block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; if (!check_and_mark_eos(eos)) { diff --git 
a/be/src/information_schema/schema_role_mappings_scanner.cpp b/be/src/information_schema/schema_role_mappings_scanner.cpp index 31e58e6cbe9fb5..84d0e26eb44393 100644 --- a/be/src/information_schema/schema_role_mappings_scanner.cpp +++ b/be/src/information_schema/schema_role_mappings_scanner.cpp @@ -134,6 +134,7 @@ Status SchemaRoleMappingsScanner::get_next_block_internal(Block* block, bool* eo int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); MutableBlock mblock = MutableBlock::build_mutable_block(block); RETURN_IF_ERROR(mblock.add_rows(_role_mappings_block.get(), _row_idx, current_batch_rows)); + block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; *eos = _row_idx == _total_rows; diff --git a/be/src/information_schema/schema_scanner.cpp b/be/src/information_schema/schema_scanner.cpp index c056617e3972e0..b2b38c83d66410 100644 --- a/be/src/information_schema/schema_scanner.cpp +++ b/be/src/information_schema/schema_scanner.cpp @@ -96,6 +96,24 @@ namespace doris { class ObjectPool; +namespace { + +void insert_column_range(ColumnWithTypeAndName* dst, const ColumnWithTypeAndName& src, size_t start, + size_t length) { + DORIS_CHECK(dst->column.get() != nullptr); + DORIS_CHECK(src.column.get() != nullptr); + MutableColumnPtr dst_column = IColumn::mutate(std::move(dst->column)); + ColumnPtr src_column = src.column->convert_to_full_column_if_const(); + if (dst_column->is_nullable() && !src_column->is_nullable()) { + src_column = make_nullable(src_column); + } + DORIS_CHECK(dst_column->is_nullable() == src_column->is_nullable()); + dst_column->insert_range_from(*src_column, start, length); + dst->column = std::move(dst_column); +} + +} // namespace + SchemaScanner::SchemaScanner(const std::vector& columns, TSchemaTableType::type type) : _is_init(false), _columns(columns), _schema_table_type(type) {} @@ -116,10 +134,8 @@ Status SchemaScanner::get_next_block(RuntimeState* state, Block* block, bool* eo 
DCHECK(_async_thread_running == false); RETURN_IF_ERROR(_scanner_status.status()); for (size_t i = 0; i < block->columns(); i++) { - std::move(*block->get_by_position(i).column) - .mutate() - ->insert_range_from(*_data_block->get_by_position(i).column, 0, - _data_block->rows()); + insert_column_range(&block->get_by_position(i), _data_block->get_by_position(i), 0, + _data_block->rows()); } _data_block->clear_column_data(); *eos = _eos; @@ -298,11 +314,10 @@ void SchemaScanner::_init_block(Block* src_block) { Status SchemaScanner::fill_dest_column_for_range(Block* block, size_t pos, const std::vector& datas) { const ColumnDesc& col_desc = _columns[pos]; - MutableColumnPtr column_ptr; - column_ptr = std::move(*block->get_by_position(pos).column).assume_mutable(); + MutableColumnPtr column_ptr = IColumn::mutate(std::move(block->get_by_position(pos).column)); IColumn* col_ptr = column_ptr.get(); - auto* nullable_column = reinterpret_cast(col_ptr); + auto* nullable_column = assert_cast(col_ptr); // Resize in advance to improve insertion efficiency. 
size_t fill_num = datas.size(); @@ -443,6 +458,7 @@ Status SchemaScanner::fill_dest_column_for_range(Block* block, size_t pos, } } } + block->replace_by_position(pos, std::move(column_ptr)); return Status::OK(); } @@ -457,8 +473,8 @@ std::string SchemaScanner::get_db_from_full_name(const std::string& full_name) { Status SchemaScanner::insert_block_column(TCell cell, int col_index, Block* block, PrimitiveType type) { MutableColumnPtr mutable_col_ptr; - mutable_col_ptr = std::move(*block->get_by_position(col_index).column).assume_mutable(); - auto* nullable_column = reinterpret_cast(mutable_col_ptr.get()); + mutable_col_ptr = IColumn::mutate(std::move(block->get_by_position(col_index).column)); + auto* nullable_column = assert_cast(mutable_col_ptr.get()); IColumn* col_ptr = &nullable_column->get_nested_column(); switch (type) { @@ -513,6 +529,7 @@ Status SchemaScanner::insert_block_column(TCell cell, int col_index, Block* bloc } } nullable_column->push_false_to_nullmap(1); + block->replace_by_position(col_index, std::move(mutable_col_ptr)); return Status::OK(); } diff --git a/be/src/information_schema/schema_scanner_helper.cpp b/be/src/information_schema/schema_scanner_helper.cpp index 9ec2cdcd7cbaa2..7907dc264b66fd 100644 --- a/be/src/information_schema/schema_scanner_helper.cpp +++ b/be/src/information_schema/schema_scanner_helper.cpp @@ -19,6 +19,7 @@ #include "cctz/time_zone.h" #include "core/block/block.h" +#include "core/column/column_nullable.h" #include "core/data_type/data_type_factory.hpp" #include "core/data_type/primitive_type.h" #include "core/string_ref.h" @@ -31,29 +32,31 @@ namespace doris { void SchemaScannerHelper::insert_string_value(int col_index, std::string_view str_val, Block* block) { - MutableColumnPtr mutable_col_ptr; - mutable_col_ptr = block->get_by_position(col_index).column->assume_mutable(); + MutableColumnPtr mutable_col_ptr = + IColumn::mutate(std::move(block->get_by_position(col_index).column)); auto* nullable_column = 
assert_cast(mutable_col_ptr.get()); IColumn* col_ptr = &nullable_column->get_nested_column(); assert_cast(col_ptr)->insert_data(str_val.data(), str_val.size()); nullable_column->push_false_to_nullmap(1); + block->replace_by_position(col_index, std::move(mutable_col_ptr)); } void SchemaScannerHelper::insert_datetime_value(int col_index, const std::vector& datas, Block* block) { - MutableColumnPtr mutable_col_ptr; - mutable_col_ptr = block->get_by_position(col_index).column->assume_mutable(); + MutableColumnPtr mutable_col_ptr = + IColumn::mutate(std::move(block->get_by_position(col_index).column)); auto* nullable_column = assert_cast(mutable_col_ptr.get()); IColumn* col_ptr = &nullable_column->get_nested_column(); auto data = datas[0]; assert_cast(col_ptr)->insert_data(reinterpret_cast(data), 0); nullable_column->push_false_to_nullmap(1); + block->replace_by_position(col_index, std::move(mutable_col_ptr)); } void SchemaScannerHelper::insert_datetime_value(int col_index, int64_t timestamp, const cctz::time_zone& ctz, Block* block) { - MutableColumnPtr mutable_col_ptr; - mutable_col_ptr = block->get_by_position(col_index).column->assume_mutable(); + MutableColumnPtr mutable_col_ptr = + IColumn::mutate(std::move(block->get_by_position(col_index).column)); auto* nullable_column = assert_cast(mutable_col_ptr.get()); IColumn* col_ptr = &nullable_column->get_nested_column(); @@ -64,41 +67,46 @@ void SchemaScannerHelper::insert_datetime_value(int col_index, int64_t timestamp auto data = datas[0]; assert_cast(col_ptr)->insert_data(reinterpret_cast(data), 0); nullable_column->push_false_to_nullmap(1); + block->replace_by_position(col_index, std::move(mutable_col_ptr)); } void SchemaScannerHelper::insert_bool_value(int col_index, bool bool_val, Block* block) { - MutableColumnPtr mutable_col_ptr; - mutable_col_ptr = block->get_by_position(col_index).column->assume_mutable(); + MutableColumnPtr mutable_col_ptr = + 
IColumn::mutate(std::move(block->get_by_position(col_index).column)); auto* nullable_column = assert_cast(mutable_col_ptr.get()); IColumn* col_ptr = &nullable_column->get_nested_column(); assert_cast(col_ptr)->insert_value(bool_val); nullable_column->push_false_to_nullmap(1); + block->replace_by_position(col_index, std::move(mutable_col_ptr)); } void SchemaScannerHelper::insert_int32_value(int col_index, int32_t int_val, Block* block) { - MutableColumnPtr mutable_col_ptr; - mutable_col_ptr = block->get_by_position(col_index).column->assume_mutable(); + MutableColumnPtr mutable_col_ptr = + IColumn::mutate(std::move(block->get_by_position(col_index).column)); auto* nullable_column = assert_cast(mutable_col_ptr.get()); IColumn* col_ptr = &nullable_column->get_nested_column(); assert_cast(col_ptr)->insert_value(int_val); nullable_column->push_false_to_nullmap(1); + block->replace_by_position(col_index, std::move(mutable_col_ptr)); } void SchemaScannerHelper::insert_int64_value(int col_index, int64_t int_val, Block* block) { - MutableColumnPtr mutable_col_ptr; - mutable_col_ptr = block->get_by_position(col_index).column->assume_mutable(); + MutableColumnPtr mutable_col_ptr = + IColumn::mutate(std::move(block->get_by_position(col_index).column)); auto* nullable_column = assert_cast(mutable_col_ptr.get()); IColumn* col_ptr = &nullable_column->get_nested_column(); assert_cast(col_ptr)->insert_value(int_val); nullable_column->push_false_to_nullmap(1); + block->replace_by_position(col_index, std::move(mutable_col_ptr)); } void SchemaScannerHelper::insert_double_value(int col_index, double double_val, Block* block) { - MutableColumnPtr mutable_col_ptr; - mutable_col_ptr = block->get_by_position(col_index).column->assume_mutable(); + MutableColumnPtr mutable_col_ptr = + IColumn::mutate(std::move(block->get_by_position(col_index).column)); auto* nullable_column = assert_cast(mutable_col_ptr.get()); IColumn* col_ptr = &nullable_column->get_nested_column(); 
assert_cast(col_ptr)->insert_value(double_val); nullable_column->push_false_to_nullmap(1); + block->replace_by_position(col_index, std::move(mutable_col_ptr)); } } // namespace doris diff --git a/be/src/information_schema/schema_sql_block_rule_status_scanner.cpp b/be/src/information_schema/schema_sql_block_rule_status_scanner.cpp index 2a223c144ba5fa..1fcc0cb838ad93 100644 --- a/be/src/information_schema/schema_sql_block_rule_status_scanner.cpp +++ b/be/src/information_schema/schema_sql_block_rule_status_scanner.cpp @@ -170,6 +170,7 @@ Status SchemaSqlBlockRuleStatusScanner::get_next_block_internal(Block* block, bo MutableBlock mblock = MutableBlock::build_mutable_block(block); RETURN_IF_ERROR( mblock.add_rows(_sql_block_rule_status_block.get(), _row_idx, current_batch_rows)); + block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; *eos = _row_idx == _total_rows; diff --git a/be/src/information_schema/schema_table_options_scanner.cpp b/be/src/information_schema/schema_table_options_scanner.cpp index 096f0860bfc3bd..717cb91cccfa29 100644 --- a/be/src/information_schema/schema_table_options_scanner.cpp +++ b/be/src/information_schema/schema_table_options_scanner.cpp @@ -167,6 +167,7 @@ Status SchemaTableOptionsScanner::get_next_block_internal(Block* block, bool* eo int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); MutableBlock mblock = MutableBlock::build_mutable_block(block); RETURN_IF_ERROR(mblock.add_rows(_tableoptions_block.get(), _row_idx, current_batch_rows)); + block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; if (!check_and_mark_eos(eos)) { diff --git a/be/src/information_schema/schema_table_properties_scanner.cpp b/be/src/information_schema/schema_table_properties_scanner.cpp index 0affe500b35f7b..e89153542a190c 100644 --- a/be/src/information_schema/schema_table_properties_scanner.cpp +++ b/be/src/information_schema/schema_table_properties_scanner.cpp @@ 
-161,6 +161,7 @@ Status SchemaTablePropertiesScanner::get_next_block_internal(Block* block, bool* int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); MutableBlock mblock = MutableBlock::build_mutable_block(block); RETURN_IF_ERROR(mblock.add_rows(_tableproperties_block.get(), _row_idx, current_batch_rows)); + block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; if (!check_and_mark_eos(eos)) { diff --git a/be/src/information_schema/schema_table_stream_consumption_scanner.cpp b/be/src/information_schema/schema_table_stream_consumption_scanner.cpp index c2c5ceab41ceb2..6b3141e404bf27 100644 --- a/be/src/information_schema/schema_table_stream_consumption_scanner.cpp +++ b/be/src/information_schema/schema_table_stream_consumption_scanner.cpp @@ -132,10 +132,11 @@ Status SchemaTableStreamConsumptionScanner::get_next_block_internal(Block* block MutableBlock mblock = MutableBlock::build_mutable_block(block); RETURN_IF_ERROR( mblock.add_rows(_table_stream_consumption_block.get(), _row_idx, current_batch_rows)); + block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; *eos = _row_idx == _total_rows; return Status::OK(); } -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/information_schema/schema_table_streams_scanner.cpp b/be/src/information_schema/schema_table_streams_scanner.cpp index 288d4e56c9a876..48299c7a1783c6 100644 --- a/be/src/information_schema/schema_table_streams_scanner.cpp +++ b/be/src/information_schema/schema_table_streams_scanner.cpp @@ -132,10 +132,11 @@ Status SchemaTableStreamsScanner::get_next_block_internal(Block* block, bool* eo int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); MutableBlock mblock = MutableBlock::build_mutable_block(block); RETURN_IF_ERROR(mblock.add_rows(_table_streams_block.get(), _row_idx, current_batch_rows)); + block->set_columns(std::move(mblock.mutable_columns())); 
_row_idx += current_batch_rows; *eos = _row_idx == _total_rows; return Status::OK(); } -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/information_schema/schema_view_dependency_scanner.cpp b/be/src/information_schema/schema_view_dependency_scanner.cpp index 1aa6ce614312f7..3723f4f9e5e2a3 100644 --- a/be/src/information_schema/schema_view_dependency_scanner.cpp +++ b/be/src/information_schema/schema_view_dependency_scanner.cpp @@ -133,6 +133,7 @@ Status SchemaViewDependencyScanner::get_next_block_internal(Block* block, bool* int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); MutableBlock mblock = MutableBlock::build_mutable_block(block); RETURN_IF_ERROR(mblock.add_rows(_view_dependency_block.get(), _row_idx, current_batch_rows)); + block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; *eos = _row_idx == _total_rows; diff --git a/be/src/information_schema/schema_workload_group_privileges.cpp b/be/src/information_schema/schema_workload_group_privileges.cpp index d0dab55965c3d1..854e151fd2521d 100644 --- a/be/src/information_schema/schema_workload_group_privileges.cpp +++ b/be/src/information_schema/schema_workload_group_privileges.cpp @@ -128,6 +128,7 @@ Status SchemaWorkloadGroupPrivilegesScanner::get_next_block_internal(Block* bloc MutableBlock mblock = MutableBlock::build_mutable_block(block); RETURN_IF_ERROR( mblock.add_rows(_workload_groups_privs_block.get(), _row_idx, current_batch_rows)); + block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; *eos = _row_idx == _total_rows; diff --git a/be/src/information_schema/schema_workload_group_resource_usage_scanner.cpp b/be/src/information_schema/schema_workload_group_resource_usage_scanner.cpp index 175b1dbd080e81..f790bf913bb75c 100644 --- a/be/src/information_schema/schema_workload_group_resource_usage_scanner.cpp +++ 
b/be/src/information_schema/schema_workload_group_resource_usage_scanner.cpp @@ -80,10 +80,11 @@ Status SchemaBackendWorkloadGroupResourceUsage::get_next_block_internal(Block* b int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); MutableBlock mblock = MutableBlock::build_mutable_block(block); RETURN_IF_ERROR(mblock.add_rows(_block.get(), _row_idx, current_batch_rows)); + block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; *eos = _row_idx == _total_rows; return Status::OK(); } -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/information_schema/schema_workload_groups_scanner.cpp b/be/src/information_schema/schema_workload_groups_scanner.cpp index 5ad1b744e975e6..b2dd403f48652b 100644 --- a/be/src/information_schema/schema_workload_groups_scanner.cpp +++ b/be/src/information_schema/schema_workload_groups_scanner.cpp @@ -139,6 +139,7 @@ Status SchemaWorkloadGroupsScanner::get_next_block_internal(Block* block, bool* int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); MutableBlock mblock = MutableBlock::build_mutable_block(block); RETURN_IF_ERROR(mblock.add_rows(_workload_groups_block.get(), _row_idx, current_batch_rows)); + block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; *eos = _row_idx == _total_rows; diff --git a/be/src/information_schema/schema_workload_sched_policy_scanner.cpp b/be/src/information_schema/schema_workload_sched_policy_scanner.cpp index 040b747bb435c4..bc5d5f9c229e4c 100644 --- a/be/src/information_schema/schema_workload_sched_policy_scanner.cpp +++ b/be/src/information_schema/schema_workload_sched_policy_scanner.cpp @@ -129,6 +129,7 @@ Status SchemaWorkloadSchedulePolicyScanner::get_next_block_internal(Block* block int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); MutableBlock mblock = MutableBlock::build_mutable_block(block); 
RETURN_IF_ERROR(mblock.add_rows(_block.get(), _row_idx, current_batch_rows)); + block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; *eos = _row_idx == _total_rows; diff --git a/be/src/load/memtable/memtable.cpp b/be/src/load/memtable/memtable.cpp index 588d8543d7b4b4..3bdcaa1ef961d7 100644 --- a/be/src/load/memtable/memtable.cpp +++ b/be/src/load/memtable/memtable.cpp @@ -453,12 +453,11 @@ void MemTable::_sort_one_column(DorisVector>& row_in } template -void MemTable::_finalize_one_row(RowInBlock* row, const ColumnsWithTypeAndName& block_data, - int row_pos) { +void MemTable::_finalize_one_row(RowInBlock* row, MutableBlock& mutable_block, int row_pos) { // move key columns for (size_t i = 0; i < _tablet_schema->num_key_columns(); ++i) { - _output_mutable_block.get_column_by_position(i)->insert_from(*block_data[i].column.get(), - row->_row_pos); + _output_mutable_block.get_column_by_position(i)->insert_from( + *mutable_block.get_column_by_position(i), row->_row_pos); } if (row->has_init_agg()) { // get value columns from agg_places @@ -490,7 +489,7 @@ void MemTable::_finalize_one_row(RowInBlock* row, const ColumnsWithTypeAndName& // move columns for rows do not need agg for (size_t i = _tablet_schema->num_key_columns(); i < _num_columns; ++i) { _output_mutable_block.get_column_by_position(i)->insert_from( - *block_data[i].column.get(), row->_row_pos); + *mutable_block.get_column_by_position(i), row->_row_pos); } } if constexpr (!is_final) { @@ -527,7 +526,6 @@ void MemTable::_aggregate() { Block in_block = _input_mutable_block.to_block(); MutableBlock mutable_block = MutableBlock::build_mutable_block(&in_block); _vec_row_comparator->set_block(&mutable_block); - auto& block_data = in_block.get_columns_with_type_and_name(); DorisVector> temp_row_in_blocks; temp_row_in_blocks.reserve(_last_sorted_pos); //only init agg if needed @@ -558,7 +556,7 @@ void MemTable::_aggregate() { if (!temp_row_in_blocks.empty()) { // The rows from the 
previous batch of _row_in_blocks have been merged into temp_row_in_blocks, // now call finalize to write the aggregation results into _output_mutable_block. - _finalize_one_row(temp_row_in_blocks.back().get(), block_data, + _finalize_one_row(temp_row_in_blocks.back().get(), mutable_block, row_pos); } temp_row_in_blocks.push_back(cur_row_ptr); @@ -567,15 +565,15 @@ void MemTable::_aggregate() { } if (!temp_row_in_blocks.empty()) { // finalize the last low - _finalize_one_row(temp_row_in_blocks.back().get(), block_data, row_pos); + _finalize_one_row(temp_row_in_blocks.back().get(), mutable_block, row_pos); } } else { DCHECK(_delete_sign_col_idx != -1); if (_seq_col_idx_in_block == -1) { - _aggregate_for_flexible_partial_update_without_seq_col( - block_data, mutable_block, temp_row_in_blocks); + _aggregate_for_flexible_partial_update_without_seq_col(mutable_block, + temp_row_in_blocks); } else { - _aggregate_for_flexible_partial_update_with_seq_col(block_data, mutable_block, + _aggregate_for_flexible_partial_update_with_seq_col(mutable_block, temp_row_in_blocks); } } @@ -593,8 +591,7 @@ void MemTable::_aggregate() { template void MemTable::_aggregate_for_flexible_partial_update_without_seq_col( - const ColumnsWithTypeAndName& block_data, MutableBlock& mutable_block, - DorisVector>& temp_row_in_blocks) { + MutableBlock& mutable_block, DorisVector>& temp_row_in_blocks) { std::shared_ptr prev_row {nullptr}; int row_pos = -1; auto& skip_bitmaps = @@ -609,12 +606,12 @@ void MemTable::_aggregate_for_flexible_partial_update_without_seq_col( auto finalize_rows = [&]() { if (row_with_delete_sign != nullptr) { temp_row_in_blocks.push_back(row_with_delete_sign); - _finalize_one_row(row_with_delete_sign.get(), block_data, ++row_pos); + _finalize_one_row(row_with_delete_sign.get(), mutable_block, ++row_pos); row_with_delete_sign = nullptr; } if (row_without_delete_sign != nullptr) { temp_row_in_blocks.push_back(row_without_delete_sign); - 
_finalize_one_row(row_without_delete_sign.get(), block_data, ++row_pos); + _finalize_one_row(row_without_delete_sign.get(), mutable_block, ++row_pos); row_without_delete_sign = nullptr; } // _arena.clear(); @@ -670,15 +667,14 @@ void MemTable::_aggregate_for_flexible_partial_update_without_seq_col( template void MemTable::_aggregate_for_flexible_partial_update_with_seq_col( - const ColumnsWithTypeAndName& block_data, MutableBlock& mutable_block, - DorisVector>& temp_row_in_blocks) { + MutableBlock& mutable_block, DorisVector>& temp_row_in_blocks) { // For flexible partial update, when table has sequence column, we don't do any aggregation // in memtable. These duplicate rows will be aggregated in VerticalSegmentWriter int row_pos = -1; for (const auto& row_ptr : *_row_in_blocks) { RowInBlock* row = row_ptr.get(); temp_row_in_blocks.push_back(row_ptr); - _finalize_one_row(row, block_data, ++row_pos); + _finalize_one_row(row, mutable_block, ++row_pos); } } diff --git a/be/src/load/memtable/memtable.h b/be/src/load/memtable/memtable.h index 42f96dd4f5f769..ad20667527fed1 100644 --- a/be/src/load/memtable/memtable.h +++ b/be/src/load/memtable/memtable.h @@ -262,7 +262,7 @@ class MemTable { void _sort_one_column(DorisVector>& row_in_blocks, Tie& tie, std::function cmp); template - void _finalize_one_row(RowInBlock* row, const ColumnsWithTypeAndName& block_data, int row_pos); + void _finalize_one_row(RowInBlock* row, MutableBlock& mutable_block, int row_pos); void _init_row_for_agg(RowInBlock* row, MutableBlock& mutable_block); void _clear_row_agg(RowInBlock* row); @@ -271,12 +271,12 @@ class MemTable { template void _aggregate_for_flexible_partial_update_without_seq_col( - const ColumnsWithTypeAndName& block_data, MutableBlock& mutable_block, + MutableBlock& mutable_block, DorisVector>& temp_row_in_blocks); template void _aggregate_for_flexible_partial_update_with_seq_col( - const ColumnsWithTypeAndName& block_data, MutableBlock& mutable_block, + MutableBlock& 
mutable_block, DorisVector>& temp_row_in_blocks); Status _put_into_output(Block& in_block); diff --git a/be/src/runtime/query_cache/query_cache.cpp b/be/src/runtime/query_cache/query_cache.cpp index d79acfa7ef788d..06817adf1544ce 100644 --- a/be/src/runtime/query_cache/query_cache.cpp +++ b/be/src/runtime/query_cache/query_cache.cpp @@ -17,6 +17,8 @@ #include "runtime/query_cache/query_cache.h" +#include "common/logging.h" + namespace doris { std::vector* QueryCacheHandle::get_cache_slot_orders() { @@ -43,7 +45,10 @@ void QueryCache::insert(const CacheKey& key, int64_t version, CacheResult& res, CacheResult cache_result; for (auto& block_data : res) { cache_result.emplace_back(Block::create_unique())->swap(block_data->clone_empty()); - (void)MutableBlock(cache_result.back().get()).merge(*block_data); + MutableBlock mutable_block(cache_result.back().get()); + auto st = mutable_block.merge(*block_data); + DORIS_CHECK(st.ok()); + cache_result.back()->set_columns(std::move(mutable_block.mutable_columns())); } auto cache_value_ptr = std::make_unique(version, std::move(cache_result), slot_orders); diff --git a/be/src/runtime/result_block_buffer.cpp b/be/src/runtime/result_block_buffer.cpp index ba7f135ce762d5..aebea97ea1ee90 100644 --- a/be/src/runtime/result_block_buffer.cpp +++ b/be/src/runtime/result_block_buffer.cpp @@ -214,10 +214,12 @@ Status ResultBlockBuffer::add_batch(RuntimeState* state, (batch_size + _last_batch_bytes) <= config::thrift_max_message_size) { if constexpr (std::is_same_v) { auto last_block = _result_batch_queue.back(); + auto mutable_columns = last_block->mutate_columns(); for (size_t i = 0; i < last_block->columns(); i++) { - last_block->mutate_columns()[i]->insert_range_from( - *result->get_by_position(i).column, 0, num_rows); + mutable_columns[i]->insert_range_from(*result->get_by_position(i).column, 0, + num_rows); } + last_block->set_columns(std::move(mutable_columns)); } else { std::vector& back_rows = 
_result_batch_queue.back()->result_batch.rows; diff --git a/be/src/service/point_query_executor.cpp b/be/src/service/point_query_executor.cpp index af34a3fe1d4cfc..0d904a9107abd2 100644 --- a/be/src/service/point_query_executor.cpp +++ b/be/src/service/point_query_executor.cpp @@ -54,6 +54,7 @@ #include "storage/segment/column_reader.h" #include "storage/tablet/tablet_schema.h" #include "storage/utils.h" +#include "util/defer_op.h" #include "util/jsonb/serialize.h" #include "util/lru_cache.h" #include "util/simd/bits.h" @@ -498,96 +499,98 @@ Status PointQueryExecutor::_lookup_row_key() { Status PointQueryExecutor::_lookup_row_data() { // 3. get values SCOPED_TIMER(&_profile_metrics.lookup_data_ns); - for (size_t i = 0; i < _row_read_ctxs.size(); ++i) { - if (_row_read_ctxs[i]._cached_row_data.valid()) { - RETURN_IF_ERROR(JsonbSerializeUtil::jsonb_to_block( - _reusable->get_data_type_serdes(), - _row_read_ctxs[i]._cached_row_data.data().data, - _row_read_ctxs[i]._cached_row_data.data().size, _reusable->get_col_uid_to_idx(), - *_result_block, _reusable->get_col_default_values(), - _reusable->include_col_uids())); - continue; - } - if (!_row_read_ctxs[i]._row_location.has_value()) { - continue; - } - std::string value; - // fill block by row store - if (_reusable->rs_column_uid() != -1) { - bool use_row_cache = !config::disable_storage_row_cache; - RETURN_IF_ERROR(_tablet->lookup_row_data( - _row_read_ctxs[i]._primary_key, _row_read_ctxs[i]._row_location.value(), - *(_row_read_ctxs[i]._rowset_ptr), _profile_metrics.read_stats, value, - use_row_cache)); - // serilize value to block, currently only jsonb row formt - RETURN_IF_ERROR(JsonbSerializeUtil::jsonb_to_block( - _reusable->get_data_type_serdes(), value.data(), value.size(), - _reusable->get_col_uid_to_idx(), *_result_block, - _reusable->get_col_default_values(), _reusable->include_col_uids())); - } - if (!_reusable->missing_col_uids().empty()) { - if 
(!_reusable->runtime_state()->enable_short_circuit_query_access_column_store()) { - std::string missing_columns; - for (int cid : _reusable->missing_col_uids()) { - missing_columns += _tablet->tablet_schema()->column_by_uid(cid).name() + ","; - } - return Status::InternalError( - "Not support column store, set store_row_column=true or row_store_columns " - "in table " - "properties, missing columns: " + - missing_columns + " should be added to row store"); + { + MutableColumns result_columns = _result_block->mutate_columns(); + Defer restore_columns([&]() { _result_block->set_columns(std::move(result_columns)); }); + for (size_t i = 0; i < _row_read_ctxs.size(); ++i) { + if (_row_read_ctxs[i]._cached_row_data.valid()) { + RETURN_IF_ERROR(JsonbSerializeUtil::jsonb_to_columns( + _reusable->get_data_type_serdes(), + _row_read_ctxs[i]._cached_row_data.data().data, + _row_read_ctxs[i]._cached_row_data.data().size, + _reusable->get_col_uid_to_idx(), result_columns, + _reusable->get_col_default_values(), _reusable->include_col_uids())); + continue; + } + if (!_row_read_ctxs[i]._row_location.has_value()) { + continue; } - // fill missing columns by column store - RowLocation row_loc = _row_read_ctxs[i]._row_location.value(); - BetaRowsetSharedPtr rowset = - std::static_pointer_cast(_tablet->get_rowset(row_loc.rowset_id)); - SegmentCacheHandle segment_cache; - { - SCOPED_TIMER(&_profile_metrics.load_segment_data_stage_ns); - RETURN_IF_ERROR( - SegmentLoader::instance()->load_segments(rowset, &segment_cache, true)); + std::string value; + // fill block by row store + if (_reusable->rs_column_uid() != -1) { + bool use_row_cache = !config::disable_storage_row_cache; + RETURN_IF_ERROR(_tablet->lookup_row_data( + _row_read_ctxs[i]._primary_key, _row_read_ctxs[i]._row_location.value(), + *(_row_read_ctxs[i]._rowset_ptr), _profile_metrics.read_stats, value, + use_row_cache)); + // serialize value to block, currently only jsonb row format + 
RETURN_IF_ERROR(JsonbSerializeUtil::jsonb_to_columns( + _reusable->get_data_type_serdes(), value.data(), value.size(), + _reusable->get_col_uid_to_idx(), result_columns, + _reusable->get_col_default_values(), _reusable->include_col_uids())); } - // find segment - auto it = std::find_if(segment_cache.get_segments().cbegin(), - segment_cache.get_segments().cend(), - [&](const segment_v2::SegmentSharedPtr& seg) { - return seg->id() == row_loc.segment_id; - }); - const auto& segment = *it; - for (int cid : _reusable->missing_col_uids()) { - int pos = _reusable->get_col_uid_to_idx().at(cid); - std::vector row_ids { - static_cast(row_loc.row_id)}; - MutableColumnPtr column = - _result_block->get_by_position(pos).column->assume_mutable(); - std::unique_ptr iter; - SlotDescriptor* slot = _reusable->tuple_desc()->slots()[pos]; - StorageReadOptions storage_read_options; - storage_read_options.stats = &_read_stats; - storage_read_options.io_ctx.reader_type = ReaderType::READER_QUERY; - RETURN_IF_ERROR(segment->seek_and_read_by_rowid(*_tablet->tablet_schema(), slot, - row_ids, column, - storage_read_options, iter)); - if (_tablet->tablet_schema() - ->column_by_uid(slot->col_unique_id()) - .has_char_type()) { - column->shrink_padding_chars(); + if (!_reusable->missing_col_uids().empty()) { + if (!_reusable->runtime_state()->enable_short_circuit_query_access_column_store()) { + std::string missing_columns; + for (int cid : _reusable->missing_col_uids()) { + missing_columns += + _tablet->tablet_schema()->column_by_uid(cid).name() + ","; + } + return Status::InternalError( + "Not support column store, set store_row_column=true or " + "row_store_columns in table properties, missing columns: " + + missing_columns + " should be added to row store"); + } + // fill missing columns by column store + RowLocation row_loc = _row_read_ctxs[i]._row_location.value(); + BetaRowsetSharedPtr rowset = std::static_pointer_cast( + _tablet->get_rowset(row_loc.rowset_id)); + SegmentCacheHandle 
segment_cache; + { + SCOPED_TIMER(&_profile_metrics.load_segment_data_stage_ns); + RETURN_IF_ERROR( + SegmentLoader::instance()->load_segments(rowset, &segment_cache, true)); + } + // find segment + auto it = std::find_if(segment_cache.get_segments().cbegin(), + segment_cache.get_segments().cend(), + [&](const segment_v2::SegmentSharedPtr& seg) { + return seg->id() == row_loc.segment_id; + }); + const auto& segment = *it; + for (int cid : _reusable->missing_col_uids()) { + int pos = _reusable->get_col_uid_to_idx().at(cid); + auto row_id = static_cast(row_loc.row_id); + auto& column = result_columns[pos]; + std::unique_ptr iter; + SlotDescriptor* slot = _reusable->tuple_desc()->slots()[pos]; + StorageReadOptions storage_read_options; + storage_read_options.stats = &_read_stats; + storage_read_options.io_ctx.reader_type = ReaderType::READER_QUERY; + auto st = + segment->seek_and_read_by_rowid(*_tablet->tablet_schema(), slot, row_id, + column, storage_read_options, iter); + if (st.ok() && _tablet->tablet_schema() + ->column_by_uid(slot->col_unique_id()) + .has_char_type()) { + column->shrink_padding_chars(); + } + RETURN_IF_ERROR(st); } } } - } - if (_result_block->columns() > _reusable->include_col_uids().size()) { - // Padding rows for some columns that no need to output to mysql client - // eg. 
SELECT k1,v1,v2 FROM TABLE WHERE k1 = 1, k1 is not in output slots, tuple as bellow - // TupleDescriptor{id=1, tbl=table_with_column_group} - // SlotDescriptor{id=8, col=v1, colUniqueId=1 ...} - // SlotDescriptor{id=9, col=v2, colUniqueId=2 ...} - // thus missing in include_col_uids and missing_col_uids - for (size_t i = 0; i < _result_block->columns(); ++i) { - auto column = _result_block->get_by_position(i).column; - int padding_rows = _row_hits - cast_set(column->size()); - if (padding_rows > 0) { - column->assume_mutable()->insert_many_defaults(padding_rows); + if (result_columns.size() > _reusable->include_col_uids().size()) { + // Padding rows for some columns that no need to output to mysql client + // eg. SELECT k1,v1,v2 FROM TABLE WHERE k1 = 1, k1 is not in output slots, tuple as bellow + // TupleDescriptor{id=1, tbl=table_with_column_group} + // SlotDescriptor{id=8, col=v1, colUniqueId=1 ...} + // SlotDescriptor{id=9, col=v2, colUniqueId=2 ...} + // thus missing in include_col_uids and missing_col_uids + for (auto& column : result_columns) { + int padding_rows = _row_hits - cast_set(column->size()); + if (padding_rows > 0) { + column->insert_many_defaults(padding_rows); + } } } } diff --git a/be/src/storage/iterator/block_reader.cpp b/be/src/storage/iterator/block_reader.cpp index e50ca8a9c831b1..82358ca7c85899 100644 --- a/be/src/storage/iterator/block_reader.cpp +++ b/be/src/storage/iterator/block_reader.cpp @@ -400,6 +400,7 @@ Status BlockReader::_replace_key_next_block(Block* block, bool* eof) { } } _merged_rows += merged_row; + block->set_columns(std::move(target_columns)); return Status::OK(); } @@ -580,9 +581,10 @@ Status BlockReader::_unique_key_next_block(Block* block, bool* eof) { LOG(WARNING) << "tablet_id: " << tablet()->tablet_id() << " delete sign idx " << delete_sign_idx << " not invalid, skip filter delete in base compaction"; + block->set_columns(std::move(target_columns)); return Status::OK(); } - MutableColumnPtr delete_filter_column = 
(*std::move(_delete_filter_column)).mutate(); + auto delete_filter_column = IColumn::mutate(std::move(_delete_filter_column)); reinterpret_cast(delete_filter_column.get())->resize(target_block_row); auto* __restrict filter_data = @@ -603,6 +605,7 @@ Status BlockReader::_unique_key_next_block(Block* block, bool* eof) { } } auto target_columns_size = target_columns.size(); + _delete_filter_column = std::move(delete_filter_column); ColumnWithTypeAndName column_with_type_and_name {_delete_filter_column, std::make_shared(), "__DORIS_COMPACTION_FILTER__"}; diff --git a/be/src/storage/iterator/vertical_block_reader.cpp b/be/src/storage/iterator/vertical_block_reader.cpp index aa90c83ccb0a3d..335584997f0f92 100644 --- a/be/src/storage/iterator/vertical_block_reader.cpp +++ b/be/src/storage/iterator/vertical_block_reader.cpp @@ -413,6 +413,7 @@ Status VerticalBlockReader::_agg_key_next_block(Block* block, bool* eof) { break; } LOG(WARNING) << "next failed: " << res; + block->set_columns(std::move(target_columns)); return res; } DCHECK(_next_row.block->columns() == block->columns()); @@ -484,11 +485,12 @@ Status VerticalBlockReader::_unique_key_next_block(Block* block, bool* eof) { int delete_sign_idx = block->columns() - 1; DCHECK(delete_sign_idx > 0); auto target_columns = block->mutate_columns(); - MutableColumnPtr delete_filter_column = (*std::move(_delete_filter_column)).mutate(); - reinterpret_cast(delete_filter_column.get())->resize(block_rows); + auto delete_filter_column = IColumn::mutate(std::move(_delete_filter_column)); + auto* delete_filter_data_column = + reinterpret_cast(delete_filter_column.get()); + delete_filter_data_column->resize(block_rows); - auto* __restrict filter_data = - reinterpret_cast(delete_filter_column.get())->get_data().data(); + auto* __restrict filter_data = delete_filter_data_column->get_data().data(); auto* __restrict delete_data = reinterpret_cast(target_columns[delete_sign_idx].get()) ->get_data() @@ -517,12 +519,14 @@ Status 
VerticalBlockReader::_unique_key_next_block(Block* block, bool* eof) { row_source_idx++; } + const auto column_to_keep = target_columns.size(); + block->set_columns(std::move(target_columns)); + _delete_filter_column = std::move(delete_filter_column); ColumnWithTypeAndName column_with_type_and_name {_delete_filter_column, std::make_shared(), "__DORIS_COMPACTION_FILTER__"}; block->insert(column_with_type_and_name); - RETURN_IF_ERROR( - Block::filter_block(block, target_columns.size(), target_columns.size())); + RETURN_IF_ERROR(Block::filter_block(block, column_to_keep, column_to_keep)); _stats.rows_del_filtered += block_rows - block->rows(); if (UNLIKELY(_reader_context.record_rowids)) { DCHECK_EQ(_block_row_locations.size(), block->rows() + delete_count); @@ -562,6 +566,7 @@ Status VerticalBlockReader::_unique_key_next_block(Block* block, bool* eof) { RETURN_IF_ERROR(mask_iter->unique_key_next_batch(&batches, _reader_context.batch_size, &actual_rows)); if (actual_rows == 0) { + block->set_columns(std::move(target_columns)); *eof = true; _eof = true; return Status::OK(); @@ -605,6 +610,7 @@ Status VerticalBlockReader::_unique_key_next_block(Block* block, bool* eof) { break; } LOG(WARNING) << "next failed: " << res; + block->set_columns(std::move(target_columns)); return res; } const auto& src_block = _next_row.block; diff --git a/be/src/storage/partial_update_info.cpp b/be/src/storage/partial_update_info.cpp index 05d648d4819b15..3342899de71b6f 100644 --- a/be/src/storage/partial_update_info.cpp +++ b/be/src/storage/partial_update_info.cpp @@ -39,6 +39,18 @@ #include "storage/utils.h" namespace doris { +namespace { + +ColumnBitmap* get_mutable_skip_bitmap_column(Block* block, size_t skip_bitmap_col_idx) { + auto skip_bitmap_column = + IColumn::mutate(std::move(block->get_by_position(skip_bitmap_col_idx).column)); + auto* skip_bitmap_column_ptr = assert_cast(skip_bitmap_column.get()); + block->replace_by_position(skip_bitmap_col_idx, std::move(skip_bitmap_column)); + 
return skip_bitmap_column_ptr; +} + +} // namespace + Status PartialUpdateInfo::init(int64_t tablet_id, int64_t txn_id, const TabletSchema& tablet_schema, UniqueKeyUpdateModePB unique_key_update_mode, PartialUpdateNewRowPolicyPB policy, @@ -326,7 +338,10 @@ Status FixedReadPlan::read_columns_by_plan( } } bool has_row_column = tablet_schema.has_row_store_for_all_columns(); - auto mutable_columns = block.mutate_columns(); + MutableColumns mutable_columns; + if (!has_row_column) { + mutable_columns = block.mutate_columns(); + } uint32_t read_idx = 0; for (const auto& [rowset_id, segment_row_mappings] : plan) { for (const auto& [segment_id, mappings] : segment_row_mappings) { @@ -361,7 +376,9 @@ Status FixedReadPlan::read_columns_by_plan( } } } - block.set_columns(std::move(mutable_columns)); + if (!has_row_column) { + block.set_columns(std::move(mutable_columns)); + } return Status::OK(); } @@ -806,8 +823,7 @@ void BlockAggregator::merge_one_row(MutableBlock& dst_block, Block* src_block, i ->get_data() .back(); const auto& new_row_skip_bitmap = - assert_cast( - src_block->get_by_position(cid).column->assume_mutable().get()) + assert_cast(src_block->get_by_position(cid).column.get()) ->get_data()[rid]; cur_skip_bitmap &= new_row_skip_bitmap; continue; @@ -952,11 +968,9 @@ Status BlockAggregator::aggregate_for_sequence_column( DCHECK_EQ(block->columns(), _tablet_schema.num_columns()); // the process logic here is the same as MemTable::_aggregate_for_flexible_partial_update_without_seq_col() // after this function, there will be at most 2 rows for a specified key - std::vector* skip_bitmaps = &( - assert_cast(block->get_by_position(_tablet_schema.skip_bitmap_col_idx()) - .column->assume_mutable() - .get()) - ->get_data()); + std::vector* skip_bitmaps = + &get_mutable_skip_bitmap_column(block, _tablet_schema.skip_bitmap_col_idx()) + ->get_data(); const auto* delete_signs = BaseTablet::get_delete_sign_column_data(*block, num_rows); auto filtered_block = 
_tablet_schema.create_block(); @@ -1025,11 +1039,9 @@ Status BlockAggregator::aggregate_for_insert_after_delete( // there will be at most 2 rows for a specified key in block when control flow reaches here // after this function, there will not be duplicate rows in block - std::vector* skip_bitmaps = &( - assert_cast(block->get_by_position(_tablet_schema.skip_bitmap_col_idx()) - .column->assume_mutable() - .get()) - ->get_data()); + std::vector* skip_bitmaps = + &get_mutable_skip_bitmap_column(block, _tablet_schema.skip_bitmap_col_idx()) + ->get_data(); const auto* delete_signs = BaseTablet::get_delete_sign_column_data(*block, num_rows); auto filter_column = ColumnUInt8::create(num_rows, 1); diff --git a/be/src/storage/schema_change/schema_change.cpp b/be/src/storage/schema_change/schema_change.cpp index 009231f2da7b01..9cd6cd702a9900 100644 --- a/be/src/storage/schema_change/schema_change.cpp +++ b/be/src/storage/schema_change/schema_change.cpp @@ -169,14 +169,18 @@ class MultiBlockMerger { if (i == rows - 1 || _cmp.compare(row_refs[i], row_refs[i + 1])) { for (int j = 0; j < key_number; j++) { - finalized_block.get_by_position(j).column->assume_mutable()->insert_from( - *row_ref.get_column(j), row_ref.position); + auto& column_ptr = finalized_block.get_by_position(j).column; + auto column = column_ptr->assume_mutable(); + column->insert_from(*row_ref.get_column(j), row_ref.position); + column_ptr = std::move(column); } for (int j = key_number; j < columns; j++) { + auto& column_ptr = finalized_block.get_by_position(j).column; + auto column = column_ptr->assume_mutable(); agg_functions[j - key_number]->insert_result_into( - agg_places[j - key_number], - finalized_block.get_by_position(j).column->assume_mutable_ref()); + agg_places[j - key_number], *column); + column_ptr = std::move(column); agg_functions[j - key_number]->reset(agg_places[j - key_number]); } @@ -222,12 +226,14 @@ class MultiBlockMerger { int limit = std::min(ALTER_TABLE_BATCH_SIZE, rows - i); for 
(int idx = 0; idx < columns; idx++) { - auto column = finalized_block.get_by_position(idx).column->assume_mutable(); + auto& column_ptr = finalized_block.get_by_position(idx).column; + auto column = column_ptr->assume_mutable(); for (int j = 0; j < limit; j++) { auto row_ref = pushed_row_refs[i + j]; column->insert_from(*row_ref.get_column(idx), row_ref.position); } + column_ptr = std::move(column); } RETURN_IF_ERROR(rowset_writer->add_block(&finalized_block)); finalized_block.clear_column_data(); @@ -379,6 +385,7 @@ Status BlockChanger::change_block(Block* ref_block, Block* new_block) const { column = column->convert_to_predicate_column_if_dictionary(); column->insert_duplicate_fields(value, row_num); } + new_block->get_by_position(idx).column = std::move(column); } else { // same type, just swap column swap_idx_list.emplace_back(_schema_mapping[idx].ref_column_idx, idx); @@ -395,21 +402,20 @@ Status BlockChanger::change_block(Block* ref_block, Block* new_block) const { if (ref_col_nullable != new_col_nullable) { // not nullable to nullable if (new_col_nullable) { - auto* new_nullable_col = - assert_cast(new_col->assume_mutable().get()); + auto mutable_new_col = new_col->assume_mutable(); + auto* new_nullable_col = assert_cast(mutable_new_col.get()); new_nullable_col->change_nested_column(ref_col); new_nullable_col->get_null_map_data().resize_fill(ref_col->size()); + new_col = std::move(mutable_new_col); } else { // nullable to not nullable: // suppose column `c_phone` is originally varchar(16) NOT NULL, // then do schema change `alter table test modify column c_phone int not null`, // the cast expr of schema change is `CastExpr(CAST String to Nullable(Int32))`, // so need to handle nullable to not nullable here - auto* ref_nullable_col = - assert_cast(ref_col->assume_mutable().get()); - - new_col = ref_nullable_col->get_nested_column_ptr(); + const auto& ref_nullable_col = assert_cast(*ref_col); + new_col = ref_nullable_col.get_nested_column_ptr(); } } else { 
new_block->get_by_position(it.second).column = diff --git a/be/src/storage/segment/column_reader.cpp b/be/src/storage/segment/column_reader.cpp index 630a60d6f9f8ef..02c59ce6cb62b1 100644 --- a/be/src/storage/segment/column_reader.cpp +++ b/be/src/storage/segment/column_reader.cpp @@ -77,6 +77,7 @@ #include "util/bitmap.h" #include "util/block_compression.h" #include "util/concurrency_stats.h" +#include "util/defer_op.h" #include "util/rle_encoding.h" // for RleDecoder #include "util/slice.h" @@ -996,7 +997,8 @@ Status MapFileColumnIterator::next_batch(size_t* n, MutableColumnPtr& dst, bool* auto& column_map = assert_cast( dst->is_nullable() ? static_cast(*dst).get_nested_column() : *dst); - auto column_offsets_ptr = column_map.get_offsets_column().assume_mutable(); + auto column_offsets_ptr = IColumn::mutate(std::move(column_map.get_offsets_ptr())); + Defer defer_offsets {[&] { column_map.get_offsets_ptr() = std::move(column_offsets_ptr); }}; bool offsets_has_null = false; ssize_t start = column_offsets_ptr->size(); RETURN_IF_ERROR(_offsets_iterator->next_batch(n, column_offsets_ptr, &offsets_has_null)); @@ -1008,10 +1010,12 @@ Status MapFileColumnIterator::next_batch(size_t* n, MutableColumnPtr& dst, bool* DCHECK(column_offsets.get_data().back() >= column_offsets.get_data()[start - 1]); size_t num_items = column_offsets.get_data().back() - column_offsets.get_data()[start - 1]; // -1 is valid - auto key_ptr = column_map.get_keys().assume_mutable(); - auto val_ptr = column_map.get_values().assume_mutable(); if (num_items > 0) { + auto key_ptr = IColumn::mutate(std::move(column_map.get_keys_ptr())); + auto val_ptr = IColumn::mutate(std::move(column_map.get_values_ptr())); + Defer defer_keys {[&] { column_map.get_keys_ptr() = std::move(key_ptr); }}; + Defer defer_values {[&] { column_map.get_values_ptr() = std::move(val_ptr); }}; if (read_offset_only()) { // OFFSET_ONLY mode: skip reading actual key/value data, fill with defaults 
key_ptr->insert_many_defaults(num_items); @@ -1024,9 +1028,6 @@ Status MapFileColumnIterator::next_batch(size_t* n, MutableColumnPtr& dst, bool* RETURN_IF_ERROR(_val_iterator->next_batch(&num_read, val_ptr, &val_has_null)); DCHECK(num_read == num_items); } - - column_map.get_keys_ptr() = std::move(key_ptr); - column_map.get_values_ptr() = std::move(val_ptr); } if (dst->is_nullable()) { @@ -1081,9 +1082,10 @@ Status MapFileColumnIterator::read_by_rowids(const rowid_t* rowids, const size_t return Status::OK(); } // resolve ColumnMap and nullable wrapper - const auto& column_map = assert_cast( + auto& column_map = assert_cast( dst->is_nullable() ? static_cast(*dst).get_nested_column() : *dst); - auto offsets_ptr = column_map.get_offsets_column().assume_mutable(); + auto offsets_ptr = IColumn::mutate(std::move(column_map.get_offsets_ptr())); + Defer defer_offsets {[&] { column_map.get_offsets_ptr() = std::move(offsets_ptr); }}; auto& offsets = static_cast(*offsets_ptr); size_t base = offsets.get_data().empty() ? 0 : offsets.get_data().back(); @@ -1167,8 +1169,10 @@ Status MapFileColumnIterator::read_by_rowids(const rowid_t* rowids, const size_t } // 6. read key/value elements for non-empty sizes - auto keys_ptr = column_map.get_keys().assume_mutable(); - auto vals_ptr = column_map.get_values().assume_mutable(); + auto keys_ptr = IColumn::mutate(std::move(column_map.get_keys_ptr())); + auto vals_ptr = IColumn::mutate(std::move(column_map.get_values_ptr())); + Defer defer_keys {[&] { column_map.get_keys_ptr() = std::move(keys_ptr); }}; + Defer defer_values {[&] { column_map.get_values_ptr() = std::move(vals_ptr); }}; size_t this_run = sizes[0]; auto start_idx = starts_data[0]; @@ -1413,12 +1417,13 @@ Status StructFileColumnIterator::next_batch(size_t* n, MutableColumnPtr& dst, bo dst->is_nullable() ? 
static_cast(*dst).get_nested_column() : *dst); for (size_t i = 0; i < column_struct.tuple_size(); i++) { size_t num_read = *n; - auto sub_column_ptr = column_struct.get_column(i).assume_mutable(); + auto sub_column_ptr = IColumn::mutate(std::move(column_struct.get_column_ptr(i))); + Defer defer_sub_column { + [&] { column_struct.get_column_ptr(i) = std::move(sub_column_ptr); }}; bool column_has_null = false; RETURN_IF_ERROR( _sub_column_iterators[i]->next_batch(&num_read, sub_column_ptr, &column_has_null)); DCHECK(num_read == *n); - column_struct.get_column_ptr(i) = std::move(sub_column_ptr); } if (dst->is_nullable()) { @@ -1773,11 +1778,12 @@ Status ArrayFileColumnIterator::next_batch(size_t* n, MutableColumnPtr& dst, boo return Status::OK(); } - const auto& column_array = assert_cast( + auto& column_array = assert_cast( dst->is_nullable() ? static_cast(*dst).get_nested_column() : *dst); bool offsets_has_null = false; - auto column_offsets_ptr = column_array.get_offsets_column().assume_mutable(); + auto column_offsets_ptr = IColumn::mutate(std::move(column_array.get_offsets_ptr())); + Defer defer_offsets {[&] { column_array.get_offsets_ptr() = std::move(column_offsets_ptr); }}; ssize_t start = column_offsets_ptr->size(); RETURN_IF_ERROR(_offset_iterator->next_batch(n, column_offsets_ptr, &offsets_has_null)); if (*n == 0) { @@ -1787,8 +1793,9 @@ Status ArrayFileColumnIterator::next_batch(size_t* n, MutableColumnPtr& dst, boo RETURN_IF_ERROR(_offset_iterator->_calculate_offsets(start, column_offsets)); size_t num_items = column_offsets.get_data().back() - column_offsets.get_data()[start - 1]; // -1 is valid - auto column_items_ptr = column_array.get_data().assume_mutable(); if (num_items > 0) { + auto column_items_ptr = IColumn::mutate(std::move(column_array.get_data_ptr())); + Defer defer_items {[&] { column_array.get_data_ptr() = std::move(column_items_ptr); }}; if (read_offset_only()) { // OFFSET_ONLY mode: skip reading actual item data, fill with defaults 
column_items_ptr->insert_many_defaults(num_items); diff --git a/be/src/storage/segment/segment_writer.cpp b/be/src/storage/segment/segment_writer.cpp index edf3ebc81dd0b7..917777f24ebfbd 100644 --- a/be/src/storage/segment/segment_writer.cpp +++ b/be/src/storage/segment/segment_writer.cpp @@ -389,7 +389,7 @@ void SegmentWriter::_maybe_invalid_row_cache(const std::string& key) { } } -void SegmentWriter::_serialize_block_to_row_column(const Block& block) { +void SegmentWriter::_serialize_block_to_row_column(Block& block) { if (block.rows() == 0) { return; } @@ -398,14 +398,14 @@ void SegmentWriter::_serialize_block_to_row_column(const Block& block) { int row_column_id = 0; for (int i = 0; i < _tablet_schema->num_columns(); ++i) { if (_tablet_schema->column(i).is_row_store_column()) { - auto* row_store_column = static_cast( - block.get_by_position(i).column->assume_mutable_ref().assume_mutable().get()); - row_store_column->clear(); + auto row_store_column_ptr = block.get_by_position(i).column->clone_empty(); + auto* row_store_column = static_cast(row_store_column_ptr.get()); DataTypeSerDeSPtrs serdes = create_data_type_serdes(block.get_data_types()); JsonbSerializeUtil::block_to_jsonb(*_tablet_schema, block, *row_store_column, cast_set(_tablet_schema->num_columns()), serdes, {_tablet_schema->row_columns_uids().begin(), _tablet_schema->row_columns_uids().end()}); + block.replace_by_position(i, std::move(row_store_column_ptr)); break; } } @@ -719,7 +719,7 @@ Status SegmentWriter::append_block(const Block* block, size_t row_pos, size_t nu // or it's schema change write(since column data type maybe changed, so we should reubild) if (_opts.write_type == DataWriteType::TYPE_DIRECT || _opts.write_type == DataWriteType::TYPE_SCHEMA_CHANGE) { - _serialize_block_to_row_column(*block); + _serialize_block_to_row_column(*const_cast(block)); } if (_opts.rowset_ctx->write_type != DataWriteType::TYPE_COMPACTION && diff --git a/be/src/storage/segment/segment_writer.h 
b/be/src/storage/segment/segment_writer.h index 9b6b8b55c3aea1..5623a06a70cf48 100644 --- a/be/src/storage/segment/segment_writer.h +++ b/be/src/storage/segment/segment_writer.h @@ -182,7 +182,7 @@ class SegmentWriter { void set_min_max_key(const Slice& key); void set_min_key(const Slice& key); void set_max_key(const Slice& key); - void _serialize_block_to_row_column(const Block& block); + void _serialize_block_to_row_column(Block& block); Status _generate_primary_key_index( const std::vector& primary_key_coders, const std::vector& primary_key_columns, diff --git a/be/src/storage/segment/variant/binary_column_extract_iterator.h b/be/src/storage/segment/variant/binary_column_extract_iterator.h index a58cf10a6b0e5a..a9aef9ca5aa1af 100644 --- a/be/src/storage/segment/variant/binary_column_extract_iterator.h +++ b/be/src/storage/segment/variant/binary_column_extract_iterator.h @@ -153,8 +153,8 @@ class BinaryColumnExtractIterator : public BaseBinaryColumnProcessor { _sparse_column_cache->binary_column->get_ptr(), 0, _sparse_column_cache->binary_column->size()); var.incr_num_rows(_sparse_column_cache->binary_column->size()); - var.get_sparse_column()->assume_mutable()->resize(var.rows()); - var.get_doc_value_column()->assume_mutable()->resize(var.rows()); + var.get_sparse_column_mutable().resize(var.rows()); + var.get_doc_value_column_mutable().resize(var.rows()); ENABLE_CHECK_CONSISTENCY(&var); } diff --git a/be/src/storage/segment/variant/hierarchical_data_iterator.cpp b/be/src/storage/segment/variant/hierarchical_data_iterator.cpp index cd8cd77696771f..878ca16019b0f4 100644 --- a/be/src/storage/segment/variant/hierarchical_data_iterator.cpp +++ b/be/src/storage/segment/variant/hierarchical_data_iterator.cpp @@ -203,7 +203,7 @@ Status HierarchicalDataIterator::_process_sub_columns( ColumnVariant& container_variant, const PathsWithColumnAndType& non_nested_subcolumns) { for (const auto& entry : non_nested_subcolumns) { DCHECK(!entry.path.has_nested_part()); - bool add 
= container_variant.add_sub_column(entry.path, entry.column->assume_mutable(), + bool add = container_variant.add_sub_column(entry.path, IColumn::mutate(entry.column), entry.type); if (!add) { return Status::InternalError("Duplicated {}, type {}", entry.path.get_path(), @@ -225,7 +225,7 @@ Status HierarchicalDataIterator::_process_nested_columns( assert_cast(remove_nullable(entry.second[0].column).get()); MutableColumnPtr nested_object = ColumnVariant::create(0, false, base_array->get_data().size()); - MutableColumnPtr offset = base_array->get_offsets_ptr()->assume_mutable(); + MutableColumnPtr offset = IColumn::mutate(base_array->get_offsets_ptr()); auto* nested_object_ptr = assert_cast(nested_object.get()); // flatten nested arrays for (const auto& subcolumn : entry.second) { @@ -246,7 +246,7 @@ Status HierarchicalDataIterator::_process_nested_columns( subcolumn.path.get_path(), subcolumn.type->get_name()); } #endif - MutableColumnPtr flattend_column = target_array->get_data_ptr()->assume_mutable(); + MutableColumnPtr flattend_column = IColumn::mutate(target_array->get_data_ptr()); DataTypePtr flattend_type = check_and_get_data_type(remove_nullable(type).get()) ->get_nested_type(); @@ -255,14 +255,18 @@ Status HierarchicalDataIterator::_process_nested_columns( subcolumn.path.copy_pop_nfront(entry.first.get_parts().size()), std::move(flattend_column), std::move(flattend_type)); } - nested_object = make_nullable(nested_object->get_ptr())->assume_mutable(); - auto array = - make_nullable(ColumnArray::create(std::move(nested_object), std::move(offset))); + const size_t nested_object_size = nested_object->size(); + nested_object = ColumnNullable::create(std::move(nested_object), + ColumnUInt8::create(nested_object_size, 0)); + auto array = ColumnArray::create(std::move(nested_object), std::move(offset)); + const size_t array_size = array->size(); + auto nullable_array = + ColumnNullable::create(std::move(array), ColumnUInt8::create(array_size, 0)); PathInDataBuilder 
builder; // add parent prefix builder.append(entry.first.get_parts(), false); PathInData parent_path = builder.build(); - container_variant.add_sub_column(parent_path, array->assume_mutable(), + container_variant.add_sub_column(parent_path, std::move(nullable_array), container_variant.NESTED_TYPE); } return Status::OK(); @@ -283,14 +287,17 @@ Status HierarchicalDataIterator::_init_container(MutableColumnPtr& container, si // auto column = root_var.get_root(); // auto type = root_var.get_root_type(); - MutableColumnPtr column = _root_reader->column->get_ptr(); + MutableColumnPtr column = IColumn::mutate(_root_reader->column->get_ptr()); // container_variant.add_sub_column({}, std::move(column), _root_reader->type); DCHECK(column->size() == nrows); - auto nullable_column = make_nullable(column->get_ptr()); + if (!column->is_nullable()) { + const size_t column_size = column->size(); + column = ColumnNullable::create(std::move(column), ColumnUInt8::create(column_size, 0)); + } auto type = make_nullable(_root_reader->type); // make sure the root type is nullable container = ColumnVariant::create(max_subcolumns_count, enable_doc_mode, type, - nullable_column->assume_mutable()); + std::move(column)); } else { DataTypePtr root_type = std::make_shared(); auto column = ColumnNothing::create(nrows); @@ -359,10 +366,10 @@ Status HierarchicalDataIterator::_process_binary_column(ColumnVariant& container if (_path.get_parts().empty()) { if (_read_type == ReadType::SUBCOLUMNS_AND_SPARSE) { container_variant.set_sparse_column(_binary_column_reader->column->get_ptr()); - container_variant.get_doc_value_column()->assume_mutable()->resize(nrows); + container_variant.get_doc_value_column_mutable().resize(nrows); } else if (_read_type == ReadType::DOC_VALUE_COLUMN) { container_variant.set_doc_value_column(_binary_column_reader->column->get_ptr()); - container_variant.get_sparse_column()->assume_mutable()->resize(nrows); + container_variant.get_sparse_column_mutable().resize(nrows); } 
else { return Status::InternalError("Invalid read type {}", _read_type); } @@ -378,7 +385,7 @@ Status HierarchicalDataIterator::_process_binary_column(ColumnVariant& container const auto& src_values = assert_cast(src_map.get_values()); // Clear pre-initialized doc_value offsets (created by ColumnVariant ctor with num_rows) - container_variant.get_doc_value_column()->assume_mutable()->clear(); + container_variant.get_doc_value_column_mutable().clear(); auto [dst_paths, dst_values] = container_variant.get_doc_value_data_paths_and_values(); auto& dst_offsets = container_variant.serialized_doc_value_column_offsets(); @@ -419,13 +426,13 @@ Status HierarchicalDataIterator::_process_binary_column(ColumnVariant& container } dst_offsets.push_back(dst_paths->size()); } - container_variant.get_sparse_column()->assume_mutable()->resize(nrows); + container_variant.get_sparse_column_mutable().resize(nrows); } else { const auto& offsets = assert_cast(*_binary_column_reader->column).get_offsets(); /// Check if there is no data in shared data in current range. 
if (offsets.back() == offsets[-1]) { - container_variant.get_sparse_column()->assume_mutable()->resize(nrows); + container_variant.get_sparse_column_mutable().resize(nrows); } else { // Read for variant sparse column // Example path: a.b @@ -444,8 +451,7 @@ Status HierarchicalDataIterator::_process_binary_column(ColumnVariant& container assert_cast(sparse_data_map.get_values()); auto& sparse_data_offsets = - assert_cast( - *container_variant.get_sparse_column()->assume_mutable()) + assert_cast(container_variant.get_sparse_column_mutable()) .get_offsets(); auto [sparse_data_paths, sparse_data_values] = container_variant.get_sparse_data_paths_and_values(); @@ -544,7 +550,7 @@ Status HierarchicalDataIterator::_process_binary_column(ColumnVariant& container } } } - container_variant.get_doc_value_column()->assume_mutable()->resize(nrows); + container_variant.get_doc_value_column_mutable().resize(nrows); } ENABLE_CHECK_CONSISTENCY(&container_variant); return Status::OK(); diff --git a/be/src/storage/segment/variant/hierarchical_data_iterator.h b/be/src/storage/segment/variant/hierarchical_data_iterator.h index ae7f96526a633f..fb8f5cad819bec 100644 --- a/be/src/storage/segment/variant/hierarchical_data_iterator.h +++ b/be/src/storage/segment/variant/hierarchical_data_iterator.h @@ -137,6 +137,7 @@ class HierarchicalDataIterator : public ColumnIterator { // process read template Status process_read(ReadFunction&& read_func, MutableColumnPtr& dst, size_t nrows) { + dst = IColumn::mutate(std::move(dst)); // // Read all sub columns, and merge with root column ColumnNullable* nullable_column = nullptr; if (dst->is_nullable()) { diff --git a/be/src/storage/segment/variant/variant_column_reader.cpp b/be/src/storage/segment/variant/variant_column_reader.cpp index 0008cbca94a6c3..e7913785274dc1 100644 --- a/be/src/storage/segment/variant/variant_column_reader.cpp +++ b/be/src/storage/segment/variant/variant_column_reader.cpp @@ -1587,8 +1587,9 @@ static void 
fill_nested_with_defaults(MutableColumnPtr& dst, MutableColumnPtr& s } auto new_nested = dst_array->get_data_ptr()->clone_resized(sibling_array->get_data_ptr()->size()); - auto new_array = make_nullable(ColumnArray::create( - new_nested->assume_mutable(), sibling_array->get_offsets_ptr()->assume_mutable())); + ColumnPtr nested_column = std::move(new_nested); + auto new_array = + make_nullable(ColumnArray::create(nested_column, sibling_array->get_offsets_ptr())); dst->insert_range_from(*new_array, 0, new_array->size()); #ifndef NDEBUG if (!dst_array->has_equal_offsets(*sibling_array)) { diff --git a/be/src/storage/segment/variant/variant_column_writer_impl.cpp b/be/src/storage/segment/variant/variant_column_writer_impl.cpp index 95f266e15c44cb..01d01f8c51f203 100644 --- a/be/src/storage/segment/variant/variant_column_writer_impl.cpp +++ b/be/src/storage/segment/variant/variant_column_writer_impl.cpp @@ -1221,8 +1221,14 @@ Status VariantColumnWriterImpl::_process_root_column(ColumnVariant* ptr, DCHECK_EQ(ptr->get_root()->get_ptr()->size(), num_rows); converter->add_column_data_convertor(*_tablet_column); const uint8_t* nullmap = nullptr; - auto& nullable_column = assert_cast(*ptr->get_root()->assume_mutable()); - auto root_column = nullable_column.get_nested_column_ptr(); + // get_root() already returns a MutableColumnPtr; store it to avoid dangling ref and + // to avoid calling assume_mutable() again (which would see use_count>1 and throw). + auto root_mut = ptr->get_root(); + auto& nullable_column = assert_cast(*root_mut); + // Use const access to get the nested column ptr without bumping use_count in the + // non-const chameleon_ptr path, then mutate() to get exclusive ownership. 
+ auto root_column = IColumn::mutate( + static_cast(nullable_column).get_nested_column_ptr()); const bool has_root_ng = std::ranges::any_of(_nested_group_routing_plan.ng_only_prefixes, @@ -1234,13 +1240,15 @@ Status VariantColumnWriterImpl::_process_root_column(ColumnVariant* ptr, // If the root variant is nullable, then update the root column null column with the outer null column. if (_tablet_column->is_nullable()) { // use outer null column as final null column + // Move root_column (exclusive) directly into create() to avoid sharing ownership. root_column = - ColumnNullable::create(root_column->get_ptr(), ColumnUInt8::create(*_null_column)); + ColumnNullable::create(std::move(root_column), ColumnUInt8::create(*_null_column)); nullmap = _null_column->get_data().data(); } else { // Otherwise setting to all not null. - root_column = ColumnNullable::create(root_column->get_ptr(), - ColumnUInt8::create(root_column->size(), 0)); + size_t col_size = root_column->size(); + root_column = + ColumnNullable::create(std::move(root_column), ColumnUInt8::create(col_size, 0)); } // make sure the root_column is nullable RETURN_IF_ERROR(converter->set_source_content_with_specifid_column( diff --git a/be/src/storage/segment/variant/variant_streaming_compaction_writer.cpp b/be/src/storage/segment/variant/variant_streaming_compaction_writer.cpp index 48ea040c7bafe8..3dbe3026a5a634 100644 --- a/be/src/storage/segment/variant/variant_streaming_compaction_writer.cpp +++ b/be/src/storage/segment/variant/variant_streaming_compaction_writer.cpp @@ -140,8 +140,10 @@ Status VariantStreamingCompactionWriter::_append_root_column(const ColumnVariant auto expected_root_type = make_nullable(std::make_shared()); variant->ensure_root_node_type(expected_root_type); - auto& nullable_column = assert_cast(*variant->get_root()->assume_mutable()); - auto root_column = nullable_column.get_nested_column_ptr(); + auto root_mut = variant->get_root(); + auto& nullable_column = assert_cast(*root_mut); + auto 
root_column = IColumn::mutate( + static_cast(nullable_column).get_nested_column_ptr()); const size_t num_rows = chunk_variant.rows(); variant_writer_helpers::maybe_remove_root_jsonb_with_empty_defaults( &root_column, num_rows, _streaming_plan.can_remove_root_jsonb()); @@ -156,10 +158,11 @@ Status VariantStreamingCompactionWriter::_append_root_column(const ColumnVariant } else { null_column->insert_many_defaults(num_rows); } - root_column = ColumnNullable::create(root_column->get_ptr(), std::move(null_column)); + root_column = ColumnNullable::create(std::move(root_column), std::move(null_column)); } else { - root_column = ColumnNullable::create(root_column->get_ptr(), - ColumnUInt8::create(root_column->size(), 0)); + const size_t root_column_size = root_column->size(); + root_column = ColumnNullable::create(std::move(root_column), + ColumnUInt8::create(root_column_size, 0)); } auto converter = std::make_unique(); diff --git a/be/src/storage/segment/vertical_segment_writer.cpp b/be/src/storage/segment/vertical_segment_writer.cpp index 56d99d7249efa2..fdd84bef48c56b 100644 --- a/be/src/storage/segment/vertical_segment_writer.cpp +++ b/be/src/storage/segment/vertical_segment_writer.cpp @@ -92,6 +92,14 @@ inline std::string vertical_segment_writer_mem_tracker_name(uint32_t segment_id) return "VerticalSegmentWriter:Segment-" + std::to_string(segment_id); } +static ColumnBitmap* get_mutable_skip_bitmap_column(Block* block, size_t skip_bitmap_col_idx) { + auto skip_bitmap_column = + IColumn::mutate(std::move(block->get_by_position(skip_bitmap_col_idx).column)); + auto* skip_bitmap_column_ptr = assert_cast(skip_bitmap_column.get()); + block->replace_by_position(skip_bitmap_col_idx, std::move(skip_bitmap_column)); + return skip_bitmap_column_ptr; +} + VerticalSegmentWriter::VerticalSegmentWriter(io::FileWriter* file_writer, uint32_t segment_id, TabletSchemaSPtr tablet_schema, BaseTabletSPtr tablet, DataDir* data_dir, @@ -362,7 +370,7 @@ void 
VerticalSegmentWriter::_maybe_invalid_row_cache(const std::string& key) con } } -void VerticalSegmentWriter::_serialize_block_to_row_column(const Block& block) { +void VerticalSegmentWriter::_serialize_block_to_row_column(Block& block) { if (block.rows() == 0) { return; } @@ -371,15 +379,15 @@ void VerticalSegmentWriter::_serialize_block_to_row_column(const Block& block) { int row_column_id = 0; for (int i = 0; i < _tablet_schema->num_columns(); ++i) { if (_tablet_schema->column(i).is_row_store_column()) { - auto* row_store_column = static_cast( - block.get_by_position(i).column->assume_mutable_ref().assume_mutable().get()); - row_store_column->clear(); + auto row_store_column_ptr = block.get_by_position(i).column->clone_empty(); + auto* row_store_column = static_cast(row_store_column_ptr.get()); DataTypeSerDeSPtrs serdes = create_data_type_serdes(block.get_data_types()); std::unordered_set row_store_cids_set(_tablet_schema->row_columns_uids().begin(), _tablet_schema->row_columns_uids().end()); JsonbSerializeUtil::block_to_jsonb(*_tablet_schema, block, *row_store_column, cast_set(_tablet_schema->num_columns()), serdes, row_store_cids_set); + block.replace_by_position(i, std::move(row_store_column_ptr)); break; } } @@ -766,10 +774,9 @@ Status VerticalSegmentWriter::_append_block_with_flexible_partial_content(RowsIn RETURN_IF_ERROR(_block_aggregator.convert_seq_column(const_cast(data.block), data.row_pos, data.num_rows, seq_column)); - std::vector* skip_bitmaps = &( - assert_cast( - data.block->get_by_position(skip_bitmap_col_idx).column->assume_mutable().get()) - ->get_data()); + auto* mutable_block = const_cast(data.block); + std::vector* skip_bitmaps = + &get_mutable_skip_bitmap_column(mutable_block, skip_bitmap_col_idx)->get_data(); const auto* delete_signs = BaseTablet::get_delete_sign_column_data(*data.block, data.row_pos + data.num_rows); DCHECK(delete_signs != nullptr); @@ -1010,7 +1017,7 @@ Status VerticalSegmentWriter::write_batch() { _opts.write_type == 
DataWriteType::TYPE_SCHEMA_CHANGE) { for (auto& data : _batched_blocks) { // TODO: maybe we should pass range to this method - _serialize_block_to_row_column(*data.block); + _serialize_block_to_row_column(*const_cast(data.block)); } } diff --git a/be/src/storage/segment/vertical_segment_writer.h b/be/src/storage/segment/vertical_segment_writer.h index 5c0ec0930e522d..39235811c07880 100644 --- a/be/src/storage/segment/vertical_segment_writer.h +++ b/be/src/storage/segment/vertical_segment_writer.h @@ -158,7 +158,7 @@ class VerticalSegmentWriter { void _set_min_max_key(const Slice& key); void _set_min_key(const Slice& key); void _set_max_key(const Slice& key); - void _serialize_block_to_row_column(const Block& block); + void _serialize_block_to_row_column(Block& block); Status _probe_key_for_mow(std::string key, std::size_t segment_pos, bool have_input_seq_column, bool have_delete_sign, const std::vector& specified_rowsets, diff --git a/be/src/storage/tablet/base_tablet.cpp b/be/src/storage/tablet/base_tablet.cpp index 0bde14b5b31606..98166a20ee071b 100644 --- a/be/src/storage/tablet/base_tablet.cpp +++ b/be/src/storage/tablet/base_tablet.cpp @@ -884,9 +884,10 @@ Status BaseTablet::sort_block(Block& in_block, Block& output_block) { vec_row_comparator->set_block(&mutable_input_block); std::vector> row_in_blocks; - DCHECK(in_block.rows() <= std::numeric_limits::max()); - row_in_blocks.reserve(in_block.rows()); - for (size_t i = 0; i < in_block.rows(); ++i) { + const auto input_rows = mutable_input_block.rows(); + DCHECK(input_rows <= std::numeric_limits::max()); + row_in_blocks.reserve(input_rows); + for (size_t i = 0; i < input_rows; ++i) { row_in_blocks.emplace_back(std::make_unique(i)); } std::sort(row_in_blocks.begin(), row_in_blocks.end(), @@ -898,12 +899,15 @@ Status BaseTablet::sort_block(Block& in_block, Block& output_block) { return value < 0; }); std::vector row_pos_vec; - row_pos_vec.reserve(in_block.rows()); + row_pos_vec.reserve(input_rows); for (auto& 
block : row_in_blocks) { row_pos_vec.emplace_back(block->_row_pos); } - return mutable_output_block.add_rows(&in_block, row_pos_vec.data(), - row_pos_vec.data() + in_block.rows()); + in_block.set_columns(std::move(mutable_input_block.mutable_columns())); + RETURN_IF_ERROR(mutable_output_block.add_rows(&in_block, row_pos_vec.data(), + row_pos_vec.data() + input_rows)); + output_block.set_columns(std::move(mutable_output_block.mutable_columns())); + return Status::OK(); } // fetch value by row column diff --git a/be/src/util/jsonb/serialize.cpp b/be/src/util/jsonb/serialize.cpp index 0088c6249f0030..6de5c155077c4d 100644 --- a/be/src/util/jsonb/serialize.cpp +++ b/be/src/util/jsonb/serialize.cpp @@ -36,6 +36,7 @@ #include "core/value/jsonb_value.h" #include "runtime/descriptors.h" #include "storage/tablet/tablet_schema.h" +#include "util/defer_op.h" #include "util/jsonb_document.h" #include "util/jsonb_stream.h" #include "util/jsonb_writer.h" @@ -79,51 +80,55 @@ Status JsonbSerializeUtil::jsonb_to_block( const std::unordered_map& col_id_to_idx, Block& dst, const std::vector& default_values, const std::unordered_set& include_cids) { + MutableColumns dst_columns = dst.mutate_columns(); + Defer restore_columns([&]() { dst.set_columns(std::move(dst_columns)); }); for (int i = 0; i < jsonb_column.size(); ++i) { StringRef jsonb_data = jsonb_column.get_data_at(i); - RETURN_IF_ERROR(jsonb_to_block(serdes, jsonb_data.data, jsonb_data.size, col_id_to_idx, dst, - default_values, include_cids)); + RETURN_IF_ERROR(jsonb_to_columns(serdes, jsonb_data.data, jsonb_data.size, col_id_to_idx, + dst_columns, default_values, include_cids)); } return Status::OK(); } -// single row -Status JsonbSerializeUtil::jsonb_to_block( +Status JsonbSerializeUtil::jsonb_to_columns( const DataTypeSerDeSPtrs& serdes, const char* data, size_t size, - const std::unordered_map& col_id_to_idx, Block& dst, + const std::unordered_map& col_id_to_idx, MutableColumns& dst_columns, const std::vector& 
default_values, const std::unordered_set& include_cids) { const JsonbDocument* pdoc = nullptr; RETURN_IF_ERROR(JsonbDocument::checkAndCreateDocument(data, size, &pdoc)); const JsonbDocument& doc = *pdoc; - size_t num_rows = dst.rows(); + DCHECK(!dst_columns.empty()); + size_t num_rows = dst_columns[0]->size(); size_t filled_columns = 0; for (auto it = doc->begin(); it != doc->end(); ++it) { auto col_it = col_id_to_idx.find(it->getKeyId()); if (col_it != col_id_to_idx.end() && (include_cids.empty() || include_cids.contains(it->getKeyId()))) { - MutableColumnPtr dst_column = - dst.get_by_position(col_it->second).column->assume_mutable(); + auto& dst_column = dst_columns[col_it->second]; serdes[col_it->second]->read_one_cell_from_jsonb(*dst_column, it->value()); ++filled_columns; } } - if (filled_columns >= dst.columns()) { + if (filled_columns >= dst_columns.size()) { return Status::OK(); } - auto fill_column = [&](Block& dst, int pos, size_t old_num_rows) { - MutableColumnPtr dst_column = dst.get_by_position(pos).column->assume_mutable(); + auto fill_column = [&](size_t pos, size_t old_num_rows) { + auto& dst_column = dst_columns[pos]; if (dst_column->size() < old_num_rows + 1) { DCHECK(dst_column->size() == old_num_rows); + Status st = Status::OK(); if (default_values[pos].empty()) { dst_column->insert_default(); } else { Slice value(default_values[pos].data(), default_values[pos].size()); DataTypeSerDe::FormatOptions opt; opt.converted_from_string = true; - RETURN_IF_ERROR( - serdes[pos]->deserialize_one_cell_from_json(*dst_column, value, opt)); + st = serdes[pos]->deserialize_one_cell_from_json(*dst_column, value, opt); } + RETURN_IF_ERROR(st); + DCHECK(dst_column->size() == num_rows + 1); + return Status::OK(); } DCHECK(dst_column->size() == num_rows + 1); return Status::OK(); @@ -135,14 +140,26 @@ Status JsonbSerializeUtil::jsonb_to_block( if (col_it == col_id_to_idx.end()) { continue; } - RETURN_IF_ERROR(fill_column(dst, col_it->second, num_rows)); + 
RETURN_IF_ERROR(fill_column(static_cast(col_it->second), num_rows)); } } else { - for (int i = 0; i < dst.columns(); ++i) { - RETURN_IF_ERROR(fill_column(dst, i, num_rows)); + for (size_t i = 0; i < dst_columns.size(); ++i) { + RETURN_IF_ERROR(fill_column(i, num_rows)); } } return Status::OK(); } -} // namespace doris \ No newline at end of file +// single row +Status JsonbSerializeUtil::jsonb_to_block( + const DataTypeSerDeSPtrs& serdes, const char* data, size_t size, + const std::unordered_map& col_id_to_idx, Block& dst, + const std::vector& default_values, + const std::unordered_set& include_cids) { + MutableColumns dst_columns = dst.mutate_columns(); + Defer restore_columns([&]() { dst.set_columns(std::move(dst_columns)); }); + return jsonb_to_columns(serdes, data, size, col_id_to_idx, dst_columns, default_values, + include_cids); +} + +} // namespace doris diff --git a/be/src/util/jsonb/serialize.h b/be/src/util/jsonb/serialize.h index f19474abe939fc..e25ecc00af4e10 100644 --- a/be/src/util/jsonb/serialize.h +++ b/be/src/util/jsonb/serialize.h @@ -46,10 +46,16 @@ class JsonbSerializeUtil { const std::unordered_map& col_id_to_idx, Block& dst, const std::vector& default_values, const std::unordered_set& include_cids); + // append single row into mutable columns + static Status jsonb_to_columns(const DataTypeSerDeSPtrs& serdes, const char* data, size_t size, + const std::unordered_map& col_id_to_idx, + MutableColumns& dst_columns, + const std::vector& default_values, + const std::unordered_set& include_cids); // single row static Status jsonb_to_block(const DataTypeSerDeSPtrs& serdes, const char* data, size_t size, const std::unordered_map& col_id_to_idx, Block& dst, const std::vector& default_values, const std::unordered_set& include_cids); }; -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/test/core/block/block_test.cpp b/be/test/core/block/block_test.cpp index ff80cc4c425de9..26f8dc91ec3d9f 100644 --- 
a/be/test/core/block/block_test.cpp +++ b/be/test/core/block/block_test.cpp @@ -1291,4 +1291,29 @@ TEST(BlockTest, others) { ASSERT_TRUE(dumped_names.empty()) << "Dumped names: " << dumped_names; } +TEST(BlockTest, ClearSelectedColumnDataClonesSharedColumn) { + auto type = std::make_shared(); + auto mutable_col0 = ColumnInt32::create(); + mutable_col0->insert_value(1); + mutable_col0->insert_value(2); + ColumnPtr old_col0 = mutable_col0->get_ptr(); + + auto mutable_col1 = ColumnInt32::create(); + mutable_col1->insert_value(10); + mutable_col1->insert_value(20); + ColumnPtr old_col1 = mutable_col1->get_ptr(); + + Block block; + block.insert({std::move(mutable_col0), type, "c0"}); + block.insert({std::move(mutable_col1), type, "c1"}); + + block.clear_column_data(std::vector {0}); + + EXPECT_EQ(block.get_by_position(0).column->size(), 0); + EXPECT_EQ(old_col0->size(), 2); + EXPECT_NE(block.get_by_position(0).column.get(), old_col0.get()); + EXPECT_EQ(block.get_by_position(1).column->size(), 2); + EXPECT_EQ(block.get_by_position(1).column.get(), old_col1.get()); +} + } // namespace doris diff --git a/be/test/core/block/column_map_test.cpp b/be/test/core/block/column_map_test.cpp index 0be1bf8fc8c60d..c0b8930695ddd4 100644 --- a/be/test/core/block/column_map_test.cpp +++ b/be/test/core/block/column_map_test.cpp @@ -347,6 +347,120 @@ TEST(ColumnMapTest2, StringKeyTestDuplicatedKeysNestedMap) { ASSERT_EQ(v2_values[1].get(), 333); }; +TEST(ColumnMapTest2, SharedCreatePreservesImmutableSubcolumns) { + auto keys_mut = ColumnString::create(); + keys_mut->insert_data("k", 1); + ColumnPtr keys = std::move(keys_mut); + ColumnPtr keys_alias = keys; + + auto values_mut = ColumnInt32::create(); + values_mut->insert_value(1); + ColumnPtr values = std::move(values_mut); + ColumnPtr values_alias = values; + + auto offsets_mut = ColumnArray::ColumnOffsets::create(); + offsets_mut->get_data().push_back(1); + ColumnPtr offsets = std::move(offsets_mut); + ColumnPtr offsets_alias = 
offsets; + + auto map_column = ColumnMap::create(keys, values, offsets); + EXPECT_EQ(map_column->get_keys_ptr().get(), keys_alias.get()); + EXPECT_EQ(map_column->get_values_ptr().get(), values_alias.get()); + EXPECT_EQ(map_column->get_offsets_ptr().get(), offsets_alias.get()); +} + +TEST(ColumnMapTest2, ConstFilterAndPermuteKeepInputAliasesUntouched) { + auto keys_mut = ColumnString::create(); + keys_mut->insert_data("a", 1); + keys_mut->insert_data("b", 1); + keys_mut->insert_data("c", 1); + ColumnPtr keys = std::move(keys_mut); + ColumnPtr keys_alias = keys; + + auto values_mut = ColumnInt32::create(); + values_mut->insert_value(1); + values_mut->insert_value(2); + values_mut->insert_value(3); + ColumnPtr values = std::move(values_mut); + ColumnPtr values_alias = values; + + auto offsets_mut = ColumnArray::ColumnOffsets::create(); + offsets_mut->get_data().push_back(2); + offsets_mut->get_data().push_back(3); + ColumnPtr offsets = std::move(offsets_mut); + ColumnPtr offsets_alias = offsets; + + auto map_column = ColumnMap::create(keys, values, offsets); + + IColumn::Filter filter; + filter.push_back(0); + filter.push_back(1); + auto filtered = map_column->filter(filter, 1); + const auto& filtered_map = assert_cast(*filtered); + EXPECT_EQ(filtered_map.size(), 1); + EXPECT_EQ(filtered_map.get_keys().size(), 1); + EXPECT_EQ(assert_cast(filtered_map.get_values()).get_element(0), 3); + + IColumn::Permutation perm; + perm.push_back(1); + perm.push_back(0); + auto permuted = map_column->permute(perm, 0); + const auto& permuted_map = assert_cast(*permuted); + EXPECT_EQ(permuted_map.size(), 2); + EXPECT_EQ(permuted_map.get_offsets()[0], 1); + EXPECT_EQ(permuted_map.get_offsets()[1], 3); + + EXPECT_EQ(keys_alias->size(), 3); + EXPECT_EQ(values_alias->size(), 3); + EXPECT_EQ(offsets_alias->size(), 2); +} + +TEST(ColumnMapTest2, DeduplicateNestedNullableMapValuesDetachesSharedValueColumn) { + auto inner_values = ColumnMap::create(ColumnString::create(), 
ColumnInt32::create(), + ColumnArray::ColumnOffsets::create()); + Map inner_map; + inner_map.push_back(Field::create_field( + Array {Field::create_field("a"), Field::create_field("a")})); + inner_map.push_back(Field::create_field( + Array {Field::create_field(1), Field::create_field(2)})); + inner_values->insert(Field::create_field(inner_map)); + + ColumnPtr shared_inner_values = std::move(inner_values); + ColumnPtr inner_values_alias = shared_inner_values; + + auto null_map_mut = ColumnUInt8::create(); + null_map_mut->insert_value(0); + ColumnPtr null_map = std::move(null_map_mut); + ColumnPtr nullable_values = ColumnNullable::create(shared_inner_values, null_map); + + auto outer_keys_mut = ColumnString::create(); + outer_keys_mut->insert_data("outer", 5); + ColumnPtr outer_keys = std::move(outer_keys_mut); + + auto outer_offsets_mut = ColumnArray::ColumnOffsets::create(); + outer_offsets_mut->get_data().push_back(1); + ColumnPtr outer_offsets = std::move(outer_offsets_mut); + + auto outer_map = ColumnMap::create(outer_keys, nullable_values, outer_offsets); + auto st = outer_map->deduplicate_keys(true); + ASSERT_TRUE(st.ok()) << st.to_string(); + + const auto& alias_inner_map = assert_cast(*inner_values_alias); + EXPECT_EQ(alias_inner_map.get_keys().size(), 2); + EXPECT_EQ(alias_inner_map.get_values().size(), 2); + + const auto& outer_map_ref = *outer_map; + const auto& outer_values_nullable = + assert_cast(outer_map_ref.get_values()); + const auto& deduplicated_inner_map = + assert_cast(outer_values_nullable.get_nested_column()); + EXPECT_EQ(deduplicated_inner_map.get_keys().size(), 1); + EXPECT_EQ(deduplicated_inner_map.get_values().size(), 1); + EXPECT_EQ(deduplicated_inner_map.get_keys().get_data_at(0).to_string(), "a"); + EXPECT_EQ(assert_cast(deduplicated_inner_map.get_values()).get_element(0), + 2); +} + TEST(ColumnMapTest2, StringValueTest) { auto col_map_str64 = ColumnMap(ColumnInt64::create(), ColumnString64::create(), 
ColumnArray::ColumnOffsets::create()); @@ -415,4 +529,4 @@ TEST(ColumnMapTest2, StringValueTest) { EXPECT_EQ(v[i], v3[i]); } }; -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/test/core/block/column_nullable_test.cpp b/be/test/core/block/column_nullable_test.cpp index dc837c335b13d1..0b92d1813fb8fb 100644 --- a/be/test/core/block/column_nullable_test.cpp +++ b/be/test/core/block/column_nullable_test.cpp @@ -44,7 +44,7 @@ TEST(ColumnNullableTest, HashTest) { nullable_column->update_hash_with_value(0, hashes[1]); EXPECT_EQ(hashes[0].get64(), hashes[1].get64()); - auto& null_map = ((ColumnNullable)(*nullable_column)).get_null_map_data(); + auto& null_map = nullable_column->get_null_map_data(); null_map[1] = true; column->update_hash_with_value(1, hashes[0]); nullable_column->update_hash_with_value(1, hashes[1]); diff --git a/be/test/core/block/column_test.cpp b/be/test/core/block/column_test.cpp index d25160d6fafa2f..69af3ca5bc8563 100644 --- a/be/test/core/block/column_test.cpp +++ b/be/test/core/block/column_test.cpp @@ -105,6 +105,25 @@ TEST_F(ColumnTest, CutColumnDecimal64) { EXPECT_THROW({ col_dcm->cut(0, 10); }, doris::Exception); } +TEST_F(ColumnTest, AssumeMutableRequiresExclusiveOwnership) { + ColumnPtr column = ColumnInt64::create(); + { + auto mutable_column = column->assume_mutable(); + assert_cast(mutable_column.get())->insert_value(1); + } + + ColumnPtr alias = column; + EXPECT_THROW({ (void)column->assume_mutable(); }, doris::Exception); + EXPECT_THROW({ (void)column->assume_mutable_ref(); }, doris::Exception); + + auto cloned = IColumn::mutate(std::move(column)); + auto* cloned_int = assert_cast(cloned.get()); + cloned_int->insert_value(2); + + EXPECT_EQ(alias->size(), 1); + EXPECT_EQ(cloned_int->size(), 2); +} + TEST_F(ColumnTest, ShrinkColumnString) { auto shrunk_col = col_str->shrink(2); EXPECT_EQ(shrunk_col->size(), 2); diff --git a/be/test/core/column/column_array_test.cpp 
b/be/test/core/column/column_array_test.cpp index 7b2ec0a2544ff2..68a175d3c853aa 100644 --- a/be/test/core/column/column_array_test.cpp +++ b/be/test/core/column/column_array_test.cpp @@ -611,6 +611,30 @@ TEST_F(ColumnArrayTest, ShrinkPaddingCharsTest) { } //////////////////////// special function from column_array.h //////////////////////// +TEST_F(ColumnArrayTest, SharedCreateValidatesOffsetsAndDataSize) { + auto data_mut = ColumnInt32::create(); + data_mut->insert_value(1); + data_mut->insert_value(2); + ColumnPtr data = std::move(data_mut); + + EXPECT_ANY_THROW({ auto array_column = ColumnArray::create(data); }); + + auto bad_offsets_mut = ColumnArray::ColumnOffsets::create(); + bad_offsets_mut->get_data().push_back(1); + ColumnPtr bad_offsets = std::move(bad_offsets_mut); + EXPECT_ANY_THROW({ auto array_column = ColumnArray::create(data, bad_offsets); }); + + ColumnPtr wrong_offsets = ColumnUInt8::create(); + EXPECT_ANY_THROW({ auto array_column = ColumnArray::create(data, wrong_offsets); }); + + auto good_offsets_mut = ColumnArray::ColumnOffsets::create(); + good_offsets_mut->get_data().push_back(2); + ColumnPtr good_offsets = std::move(good_offsets_mut); + auto array_column = ColumnArray::create(data, good_offsets); + EXPECT_EQ(array_column->get_data_ptr().get(), data.get()); + EXPECT_EQ(array_column->get_offsets_ptr().get(), good_offsets.get()); +} + TEST_F(ColumnArrayTest, CreateArrayTest) { // Test ColumnArray constructor constraints: nested_column and offsets_column must not be ColumnConst. 
// The constructor enforces this via check_const_only_in_top_level(), preventing COW-related issues: @@ -688,8 +712,7 @@ TEST_F(ColumnArrayTest, ConvertIfOverflowAndInsertTest) { // check ptr is itself auto ptr = column->convert_column_if_overflow(); EXPECT_EQ(ptr.get(), column.get()); - auto arr_col = - check_and_get_column(remove_nullable(column->assume_mutable()).get()); + auto arr_col = check_and_get_column(remove_nullable(column->get_ptr()).get()); auto nested_col = arr_col->get_data_ptr(); auto array_col1 = check_and_get_column(remove_nullable(ptr).get()); auto nested_col1 = array_col1->get_data_ptr(); diff --git a/be/test/core/column/column_ip_test.cpp b/be/test/core/column/column_ip_test.cpp index fc03446e45503b..05cf6034ed37e5 100644 --- a/be/test/core/column/column_ip_test.cpp +++ b/be/test/core/column/column_ip_test.cpp @@ -77,32 +77,32 @@ class ColumnIPTest : public CommonColumnTest { TEST_F(ColumnIPTest, InsertRangeFromTest) { // insert from data csv and assert insert result MutableColumns ip_cols; - ip_cols.push_back(column_ipv4->get_ptr()); - ip_cols.push_back(column_ipv6->get_ptr()); + ip_cols.push_back(std::move(column_ipv4)); + ip_cols.push_back(std::move(column_ipv6)); check_data(ip_cols, serde, ';', {1, 2}, data_files[0], assert_insert_range_from_callback); } TEST_F(ColumnIPTest, InsertManyFromTest) { // insert from data csv and assert insert result MutableColumns ip_cols; - ip_cols.push_back(column_ipv4->get_ptr()); - ip_cols.push_back(column_ipv6->get_ptr()); + ip_cols.push_back(std::move(column_ipv4)); + ip_cols.push_back(std::move(column_ipv6)); check_data(ip_cols, serde, ';', {1, 2}, data_files[0], assert_insert_many_from_callback); } TEST_F(ColumnIPTest, InsertIndicesFromTest) { // insert from data csv and assert insert result MutableColumns ip_cols; - ip_cols.push_back(column_ipv4->get_ptr()); - ip_cols.push_back(column_ipv6->get_ptr()); + ip_cols.push_back(std::move(column_ipv4)); + ip_cols.push_back(std::move(column_ipv6)); 
check_data(ip_cols, serde, ';', {1, 2}, data_files[0], assert_insert_indices_from_callback); } TEST_F(ColumnIPTest, InsertDefaultTest) { // insert from data csv and assert insert result MutableColumns ip_cols; - ip_cols.push_back(column_ipv4->get_ptr()); - ip_cols.push_back(column_ipv6->get_ptr()); + ip_cols.push_back(std::move(column_ipv4)); + ip_cols.push_back(std::move(column_ipv6)); // ipv4 default value is '0.0.0.0' and ipv6 default value is '::' check_data(ip_cols, serde, ';', {1, 2}, data_files[0], assert_insert_default_callback); } @@ -110,31 +110,31 @@ TEST_F(ColumnIPTest, InsertDefaultTest) { TEST_F(ColumnIPTest, InsertManyDefaultsTest) { // insert from data csv and assert insert result MutableColumns ip_cols; - ip_cols.push_back(column_ipv4->get_ptr()); - ip_cols.push_back(column_ipv6->get_ptr()); + ip_cols.push_back(std::move(column_ipv4)); + ip_cols.push_back(std::move(column_ipv6)); check_data(ip_cols, serde, ';', {1, 2}, data_files[0], assert_insert_many_defaults_callback); } TEST_F(ColumnIPTest, GetDataAtTest) { // insert from data csv and assert insert result MutableColumns ip_cols; - ip_cols.push_back(column_ipv4->get_ptr()); - ip_cols.push_back(column_ipv6->get_ptr()); + ip_cols.push_back(std::move(column_ipv4)); + ip_cols.push_back(std::move(column_ipv6)); check_data(ip_cols, serde, ';', {1, 2}, data_files[0], assert_get_data_at_callback); } TEST_F(ColumnIPTest, FieldTest) { // insert from data csv and assert insert result MutableColumns ip_cols; - ip_cols.push_back(column_ipv4->get_ptr()); - ip_cols.push_back(column_ipv6->get_ptr()); + ip_cols.push_back(std::move(column_ipv4)); + ip_cols.push_back(std::move(column_ipv6)); check_data(ip_cols, serde, ';', {1, 2}, data_files[0], assert_field_callback); } TEST_F(ColumnIPTest, GetRawDataTest) { // insert from data csv and assert insert result MutableColumns ip_cols; - ip_cols.push_back(column_ipv6->get_ptr()); + ip_cols.push_back(std::move(column_ipv6)); check_data(ip_cols, {serde[1]}, ';', {2}, 
data_files[0], assert_get_raw_data_callback); } @@ -142,8 +142,8 @@ TEST_F(ColumnIPTest, GetRawDataTest) { TEST_F(ColumnIPTest, SerDeVecTest) { // insert from data csv and assert insert result MutableColumns ip_cols; - ip_cols.push_back(column_ipv4->get_ptr()); - ip_cols.push_back(column_ipv6->get_ptr()); + ip_cols.push_back(std::move(column_ipv4)); + ip_cols.push_back(std::move(column_ipv6)); load_data_from_csv(serde, ip_cols, data_files[0], ';', {1, 2}); ser_deser_vec(ip_cols, {dt_ipv4, dt_ipv6}); } @@ -151,8 +151,8 @@ TEST_F(ColumnIPTest, SerDeVecTest) { TEST_F(ColumnIPTest, serDeserializeWithArenaImpl) { // insert from data csv and assert insert result MutableColumns ip_cols; - ip_cols.push_back(column_ipv4->get_ptr()); - ip_cols.push_back(column_ipv6->get_ptr()); + ip_cols.push_back(std::move(column_ipv4)); + ip_cols.push_back(std::move(column_ipv6)); load_data_from_csv(serde, ip_cols, data_files[0], ';', {1, 2}); ser_deserialize_with_arena_impl(ip_cols, {dt_ipv4, dt_ipv6}); @@ -161,16 +161,16 @@ TEST_F(ColumnIPTest, serDeserializeWithArenaImpl) { TEST_F(ColumnIPTest, SizeTest) { // insert from data csv and assert insert result MutableColumns ip_cols; - ip_cols.push_back(column_ipv4->get_ptr()); - ip_cols.push_back(column_ipv6->get_ptr()); + ip_cols.push_back(std::move(column_ipv4)); + ip_cols.push_back(std::move(column_ipv6)); check_data(ip_cols, serde, ';', {1, 2}, data_files[0], assert_size_callback); } TEST_F(ColumnIPTest, ByteSizeTest) { // insert from data csv and assert insert result MutableColumns ip_cols; - ip_cols.push_back(column_ipv4->get_ptr()); - ip_cols.push_back(column_ipv6->get_ptr()); + ip_cols.push_back(std::move(column_ipv4)); + ip_cols.push_back(std::move(column_ipv6)); check_data(ip_cols, serde, ';', {1, 2}, data_files[0], assert_byte_size_callback); } @@ -178,8 +178,8 @@ TEST_F(ColumnIPTest, ByteSizeTest) { TEST_F(ColumnIPTest, AllocateBytesTest) { // insert from data csv and assert insert result MutableColumns ip_cols; - 
ip_cols.push_back(column_ipv4->get_ptr()); - ip_cols.push_back(column_ipv6->get_ptr()); + ip_cols.push_back(std::move(column_ipv4)); + ip_cols.push_back(std::move(column_ipv6)); check_data(ip_cols, serde, ';', {1, 2}, data_files[0], assert_allocated_bytes_callback); } @@ -187,8 +187,8 @@ TEST_F(ColumnIPTest, AllocateBytesTest) { TEST_F(ColumnIPTest, PopbackTest) { // insert from data csv and assert insert result MutableColumns ip_cols; - ip_cols.push_back(column_ipv4->get_ptr()); - ip_cols.push_back(column_ipv6->get_ptr()); + ip_cols.push_back(std::move(column_ipv4)); + ip_cols.push_back(std::move(column_ipv6)); check_data(ip_cols, serde, ';', {1, 2}, data_files[0], assert_pop_back_callback); } @@ -197,18 +197,18 @@ TEST_F(ColumnIPTest, CloneTest) { // we test the column with clone_resize, clone_empty for assert size and ptr // insert from data csv and assert insert result MutableColumns ip_cols; - ip_cols.push_back(column_ipv4->get_ptr()); - ip_cols.push_back(column_ipv6->get_ptr()); + ip_cols.push_back(std::move(column_ipv4)); + ip_cols.push_back(std::move(column_ipv6)); load_data_from_csv(serde, ip_cols, data_files[0], ';', {1, 2}); - assert_clone_empty(column_ipv4->assume_mutable_ref()); - assert_clone_empty(column_ipv6->assume_mutable_ref()); + assert_clone_empty(ip_cols[0]->assume_mutable_ref()); + assert_clone_empty(ip_cols[1]->assume_mutable_ref()); check_data(ip_cols, serde, ';', {1, 2}, data_files[0], assert_clone_resized_callback); } TEST_F(ColumnIPTest, CutTest) { MutableColumns ip_cols; - ip_cols.push_back(column_ipv4->get_ptr()); - ip_cols.push_back(column_ipv6->get_ptr()); + ip_cols.push_back(std::move(column_ipv4)); + ip_cols.push_back(std::move(column_ipv6)); load_data_from_csv(serde, ip_cols, data_files[0], ';', {1, 2}); check_data(ip_cols, serde, ';', {1, 2}, data_files[0], assert_cut_callback); } @@ -216,24 +216,24 @@ TEST_F(ColumnIPTest, CutTest) { TEST_F(ColumnIPTest, ResizeTest) { // insert from data csv and assert insert result 
MutableColumns ip_cols; - ip_cols.push_back(column_ipv4->get_ptr()); - ip_cols.push_back(column_ipv6->get_ptr()); + ip_cols.push_back(std::move(column_ipv4)); + ip_cols.push_back(std::move(column_ipv6)); check_data(ip_cols, serde, ';', {1, 2}, data_files[0], assert_resize_callback); } TEST_F(ColumnIPTest, ReserveTest) { // insert from data csv and assert insert result MutableColumns ip_cols; - ip_cols.push_back(column_ipv4->get_ptr()); - ip_cols.push_back(column_ipv6->get_ptr()); + ip_cols.push_back(std::move(column_ipv4)); + ip_cols.push_back(std::move(column_ipv6)); check_data(ip_cols, serde, ';', {1, 2}, data_files[0], assert_reserve_callback); } TEST_F(ColumnIPTest, ReplaceColumnTest) { // insert from data csv and assert insert result MutableColumns ip_cols; - ip_cols.push_back(column_ipv4->get_ptr()); - ip_cols.push_back(column_ipv6->get_ptr()); + ip_cols.push_back(std::move(column_ipv4)); + ip_cols.push_back(std::move(column_ipv6)); load_data_from_csv(serde, ip_cols, data_files[0], ';', {1, 2}); // replace_column_data @@ -246,26 +246,26 @@ TEST_F(ColumnIPTest, ReplaceColumnTest) { TEST_F(ColumnIPTest, AppendDataBySelectorTest) { // insert from data csv and assert insert result MutableColumns ip_cols; - ip_cols.push_back(column_ipv4->get_ptr()); - ip_cols.push_back(column_ipv6->get_ptr()); + ip_cols.push_back(std::move(column_ipv4)); + ip_cols.push_back(std::move(column_ipv6)); check_data(ip_cols, serde, ';', {1, 2}, data_files[0], assert_append_data_by_selector_callback); } TEST_F(ColumnIPTest, PermutationAndSortTest) { // insert from data csv and assert insert result MutableColumns ip_cols; - ip_cols.push_back(column_ipv4->get_ptr()); - ip_cols.push_back(column_ipv6->get_ptr()); + ip_cols.push_back(std::move(column_ipv4)); + ip_cols.push_back(std::move(column_ipv6)); load_data_from_csv(serde, ip_cols, data_files[1], ';', {1, 2}); - assert_column_permutations(column_ipv4->assume_mutable_ref(), dt_ipv4); - 
assert_column_permutations(column_ipv6->assume_mutable_ref(), dt_ipv6); + assert_column_permutations(ip_cols[0]->assume_mutable_ref(), dt_ipv4); + assert_column_permutations(ip_cols[1]->assume_mutable_ref(), dt_ipv6); } TEST_F(ColumnIPTest, FilterTest) { // insert from data csv and assert insert result MutableColumns ip_cols; - ip_cols.push_back(column_ipv4->get_ptr()); - ip_cols.push_back(column_ipv6->get_ptr()); + ip_cols.push_back(std::move(column_ipv4)); + ip_cols.push_back(std::move(column_ipv6)); check_data(ip_cols, serde, ';', {1, 2}, data_files[0], assert_filter_callback); } @@ -274,8 +274,8 @@ TEST_F(ColumnIPTest, HashTest) { // XXHash // insert from data csv and assert insert result MutableColumns ip_cols; - ip_cols.push_back(column_ipv4->get_ptr()); - ip_cols.push_back(column_ipv6->get_ptr()); + ip_cols.push_back(std::move(column_ipv4)); + ip_cols.push_back(std::move(column_ipv6)); load_data_from_csv(serde, ip_cols, data_files[0], ';', {1, 2}); // update_hashes_with_value diff --git a/be/test/core/column/column_nullable_test.cpp b/be/test/core/column/column_nullable_test.cpp index 3d3b45e218bdef..64ca053c50362f 100644 --- a/be/test/core/column/column_nullable_test.cpp +++ b/be/test/core/column/column_nullable_test.cpp @@ -103,6 +103,25 @@ TEST(ColumnNullableTest, PredicateTest) { EXPECT_TRUE(null_dst->has_null()); } +TEST(ColumnNullableTest, SharedCreatePreservesImmutableSubcolumns) { + auto nested_mut = ColumnInt64::create(); + nested_mut->insert_value(10); + ColumnPtr nested = std::move(nested_mut); + ColumnPtr nested_alias = nested; + + auto null_map_mut = ColumnUInt8::create(); + null_map_mut->insert_value(0); + ColumnPtr null_map = std::move(null_map_mut); + ColumnPtr null_map_alias = null_map; + + auto nullable = ColumnNullable::create(nested, null_map); + const auto& nullable_ref = *nullable; + EXPECT_EQ(nullable_ref.get_nested_column_ptr().get(), nested_alias.get()); + EXPECT_EQ(nullable_ref.get_null_map_column_ptr().get(), null_map_alias.get()); 
+ EXPECT_EQ(nested_alias->size(), 1); + EXPECT_EQ(null_map_alias->size(), 1); +} + TEST(ColumnNullableTest, append_data_by_selector) { auto srt_column = ColumnHelper::create_nullable_column( {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, @@ -206,4 +225,4 @@ TEST(ColumnNullableTest, ScalaTypeNullStringTest2erase) { } } -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/test/core/column/column_variant_test.cpp b/be/test/core/column/column_variant_test.cpp index ebf59b77345d14..71f007a73c1615 100644 --- a/be/test/core/column/column_variant_test.cpp +++ b/be/test/core/column/column_variant_test.cpp @@ -1178,9 +1178,11 @@ TEST_F(ColumnVariantTest, field_test) { ColumnVariant::MutablePtr obj; obj = ColumnVariant::create(1, false); MutableColumns cols; - cols.push_back(obj->get_ptr()); + cols.push_back(std::move(obj)); const auto& json_file_obj = test_data_dir_json + "json_variant/object_boundary.jsonl"; load_columns_data_from_file(cols, serde, '\n', {0}, json_file_obj); + obj = ColumnVariant::cast_to_column_mutptr(assert_cast(cols[0].get())); + cols.clear(); EXPECT_TRUE(!obj->empty()); test_func(obj); } @@ -2122,13 +2124,16 @@ TEST_F(ColumnVariantTest, fill_path_column_from_sparse_data) { ColumnVariant::MutablePtr obj; obj = ColumnVariant::create(1, false); MutableColumns cols; - cols.push_back(obj->get_ptr()); + cols.push_back(std::move(obj)); const auto& json_file_obj = test_data_dir_json + "json_variant/object_boundary.jsonl"; load_columns_data_from_file(cols, serde, '\n', {0}, json_file_obj); + obj = ColumnVariant::cast_to_column_mutptr(assert_cast(cols[0].get())); + cols.clear(); EXPECT_TRUE(!obj->empty()); auto sparse_col = obj->get_sparse_column(); auto cloned_sparse = sparse_col->clone_empty(); - auto& offsets = obj->serialized_sparse_column_offsets(); + const auto& offsets = + static_cast(*obj).serialized_sparse_column_offsets(); for (size_t i = 0; i != offsets.size(); ++i) { auto start = offsets[i - 1]; auto end = offsets[i]; diff --git 
a/be/test/core/column/common_column_test.h b/be/test/core/column/common_column_test.h index ac4ed5eff76582..fe0ecf051d0140 100644 --- a/be/test/core/column/common_column_test.h +++ b/be/test/core/column/common_column_test.h @@ -634,11 +634,15 @@ class CommonColumnTest : public ::testing::Test { Block block; for (size_t i = 0; i < load_cols.size(); ++i) { ColumnWithTypeAndName columnTypeAndName; - columnTypeAndName.column = load_cols[i]->assume_mutable(); + columnTypeAndName.column = load_cols[i]->get_ptr(); columnTypeAndName.type = types[i]; block.insert(columnTypeAndName); } MutableBlock mb = MutableBlock::build_mutable_block(&block); + // Rebuild block from load_cols after build_mutable_block stole the column pointers + for (size_t i = 0; i < load_cols.size(); ++i) { + block.get_by_position(i).column = load_cols[i]->get_ptr(); + } // step2. to construct a block for assert_cols Block assert_block; Block empty_block; @@ -691,7 +695,9 @@ class CommonColumnTest : public ::testing::Test { continue; } else if (*pos + *cl > source_column->size()) { if (is_column( - remove_nullable(source_column->assume_mutable()).get())) { + remove_nullable(static_cast(source_column.get()) + ->get_ptr()) + .get())) { // insert_range_from in array has DCHECK_LG continue; } @@ -3544,13 +3550,13 @@ auto assert_column_vector_serialize_vec_callback = [](auto x, if (test_null_map) { cloned_target_column->serialize(input_keys.data(), rows); deser_column_wrapper = cloned_target_column->clone_empty(); - deser_column = ((ColumnNullable*)deser_column_wrapper.get())->get_nested_column_ptr(); } else { target_column->serialize(input_keys.data(), rows); deser_column = source_column->clone_empty(); } if (test_null_map) { deser_column_wrapper->deserialize(input_keys.data(), rows); + deser_column = ((ColumnNullable*)deser_column_wrapper.get())->get_nested_column_ptr(); } else { deser_column->deserialize(input_keys.data(), rows); } diff --git a/be/test/core/data_type/common_data_type_serder_test.h 
b/be/test/core/data_type/common_data_type_serder_test.h index d968cc1213e92d..a7393b9d8eee0c 100644 --- a/be/test/core/data_type/common_data_type_serder_test.h +++ b/be/test/core/data_type/common_data_type_serder_test.h @@ -277,7 +277,7 @@ class CommonDataTypeSerdeTest : public ::testing::Test { jsonb_column->reserve(load_cols[0]->size()); MutableColumns assert_cols; for (size_t i = 0; i < load_cols.size(); ++i) { - assert_cols.push_back(load_cols[i]->assume_mutable()); + assert_cols.push_back(load_cols[i]->clone_empty()); } DataTypeSerDe::FormatOptions options; auto tz = cctz::utc_time_zone(); diff --git a/be/test/core/data_type/complex_type_test.cpp b/be/test/core/data_type/complex_type_test.cpp index 54dc360e2a8fa1..50e89ce4daf208 100644 --- a/be/test/core/data_type/complex_type_test.cpp +++ b/be/test/core/data_type/complex_type_test.cpp @@ -20,8 +20,16 @@ #include #include +#include +#include "agent/be_exec_version_manager.h" +#include "core/assert_cast.h" #include "core/column/column.h" +#include "core/column/column_array.h" +#include "core/column/column_map.h" +#include "core/column/column_string.h" +#include "core/column/column_struct.h" +#include "core/column/column_vector.h" #include "core/data_type/data_type.h" #include "core/data_type/data_type_array.h" #include "core/data_type/data_type_map.h" @@ -34,6 +42,25 @@ namespace doris { +namespace { + +std::vector serialize_column(const DataTypePtr& type, const ColumnPtr& column) { + const int be_exec_version = BeExecVersionManager::get_newest_version(); + std::vector buf(type->get_uncompressed_serialized_bytes(*column, be_exec_version)); + char* end = type->serialize(*column, buf.data(), be_exec_version); + EXPECT_EQ(buf.data() + buf.size(), end); + return buf; +} + +void deserialize_column(const DataTypePtr& type, const std::vector& buf, + MutableColumnPtr* column) { + const int be_exec_version = BeExecVersionManager::get_newest_version(); + const char* end = type->deserialize(buf.data(), column, 
be_exec_version); + EXPECT_EQ(buf.data() + buf.size(), end); +} + +} // namespace + TEST(ComplexTypeTest, CreateColumnConstWithDefaultValue) { DataTypePtr n1 = std::make_shared(std::make_shared()); DataTypePtr n2 = std::make_shared(std::make_shared()); @@ -70,4 +97,92 @@ TEST(ComplexTypeTest, CreateColumnConstWithDefaultValue) { col_a->get(0, af); EXPECT_EQ(PrimitiveType::TYPE_ARRAY, af.get_type()); } + +TEST(ComplexTypeTest, DeserializeArrayWritesBackSharedNestedColumn) { + DataTypePtr nested_type = std::make_shared(); + DataTypePtr array_type = std::make_shared(nested_type); + + auto src_column = array_type->create_column(); + src_column->insert(Field::create_field( + Array {Field::create_field(1), Field::create_field(2)})); + src_column->insert(Field::create_field(Array {Field::create_field(3)})); + auto buf = serialize_column(array_type, src_column->get_ptr()); + + ColumnPtr shared_nested_column = ColumnInt32::create(); + MutableColumnPtr dst_column = ColumnArray::create(shared_nested_column); + deserialize_column(array_type, buf, &dst_column); + + const auto& array_column = assert_cast(*dst_column); + EXPECT_EQ(2, array_column.size()); + EXPECT_EQ(0, shared_nested_column->size()); + EXPECT_EQ(3, array_column.get_data().size()); + EXPECT_EQ(2, array_column.get_offsets()[0]); + EXPECT_EQ(3, array_column.get_offsets()[1]); + + const auto& data = assert_cast(array_column.get_data()).get_data(); + EXPECT_EQ(1, data[0]); + EXPECT_EQ(2, data[1]); + EXPECT_EQ(3, data[2]); +} + +TEST(ComplexTypeTest, DeserializeMapWritesBackSharedKeyAndValueColumns) { + DataTypePtr key_type = std::make_shared(); + DataTypePtr value_type = std::make_shared(); + DataTypePtr map_type = std::make_shared(key_type, value_type); + + auto src_column = map_type->create_column(); + Map map; + map.push_back(Field::create_field( + Array {Field::create_field(10), Field::create_field(20)})); + map.push_back(Field::create_field( + Array {Field::create_field("a"), Field::create_field("b")})); + 
src_column->insert(Field::create_field(map)); + auto buf = serialize_column(map_type, src_column->get_ptr()); + + ColumnPtr shared_keys_column = ColumnInt32::create(); + ColumnPtr shared_values_column = ColumnString::create(); + ColumnPtr offsets_column = ColumnArray::ColumnOffsets::create(); + MutableColumnPtr dst_column = + ColumnMap::create(shared_keys_column, shared_values_column, offsets_column); + deserialize_column(map_type, buf, &dst_column); + + const auto& map_column = assert_cast(*dst_column); + EXPECT_EQ(1, map_column.size()); + EXPECT_EQ(0, shared_keys_column->size()); + EXPECT_EQ(0, shared_values_column->size()); + EXPECT_EQ(2, map_column.get_keys().size()); + EXPECT_EQ(2, map_column.get_values().size()); + + const auto& keys = assert_cast(map_column.get_keys()).get_data(); + EXPECT_EQ(10, keys[0]); + EXPECT_EQ(20, keys[1]); + EXPECT_EQ("a", map_column.get_values().get_data_at(0).to_string()); + EXPECT_EQ("b", map_column.get_values().get_data_at(1).to_string()); +} + +TEST(ComplexTypeTest, DeserializeStructWritesBackSharedChildren) { + DataTypes children_types {std::make_shared(), + std::make_shared()}; + DataTypePtr struct_type = std::make_shared(children_types); + + auto src_column = struct_type->create_column(); + src_column->insert(Field::create_field( + Struct {Field::create_field(7), Field::create_field("seven")})); + auto buf = serialize_column(struct_type, src_column->get_ptr()); + + ColumnPtr shared_int_column = ColumnInt32::create(); + ColumnPtr shared_string_column = ColumnString::create(); + Columns shared_columns {shared_int_column, shared_string_column}; + MutableColumnPtr dst_column = ColumnStruct::create(shared_columns); + deserialize_column(struct_type, buf, &dst_column); + + const auto& struct_column = assert_cast(*dst_column); + EXPECT_EQ(1, struct_column.size()); + EXPECT_EQ(0, shared_int_column->size()); + EXPECT_EQ(0, shared_string_column->size()); + + const auto& ints = assert_cast(struct_column.get_column(0)).get_data(); + 
EXPECT_EQ(7, ints[0]); + EXPECT_EQ("seven", struct_column.get_column(1).get_data_at(0).to_string()); +} } // namespace doris diff --git a/be/test/core/data_type_serde/data_type_serde_csv_test.cpp b/be/test/core/data_type_serde/data_type_serde_csv_test.cpp index 0478507cab0844..84bce05751a061 100644 --- a/be/test/core/data_type_serde/data_type_serde_csv_test.cpp +++ b/be/test/core/data_type_serde/data_type_serde_csv_test.cpp @@ -512,8 +512,9 @@ TEST(CsvSerde, ComplexTypeSerdeSchemaChangedCsvTest) { DataTypeSerDeSPtr serde = data_type_ptr->get_serde(); Status st = serde->deserialize_one_cell_from_hive_text(*col, slice, formatOptions); EXPECT_EQ(st, Status::OK()); - auto struct_col = static_cast( - static_cast(*col.get()).get_nested_column()); + // Use const access for read-only assertions: avoids assume_mutable_ref() on sub-columns. + const auto& struct_col = static_cast( + static_cast(*col.get()).get_nested_column()); EXPECT_EQ(struct_col.get_column(0).get_data_at(0).to_string(), "false"); EXPECT_EQ(struct_col.get_column(1).get_data_at(0).to_string(), "example"); @@ -537,11 +538,11 @@ TEST(CsvSerde, ComplexTypeSerdeSchemaChangedCsvTest) { DataTypeSerDeSPtr serde = data_type_ptr->get_serde(); Status st = serde->deserialize_one_cell_from_hive_text(*col, slice, formatOptions); EXPECT_EQ(st, Status::OK()); - auto array_col = static_cast( - static_cast(*col.get()).get_nested_column()); + const auto& array_col = static_cast( + static_cast(*col.get()).get_nested_column()); - auto string_col = static_cast( - static_cast(array_col.get_data()).get_nested_column()); + const auto& string_col = static_cast( + static_cast(array_col.get_data()).get_nested_column()); EXPECT_EQ(string_col.get_data_at(0).to_string(), "1\003example"); EXPECT_EQ(string_col.get_data_at(1).to_string(), "2\003test"); } diff --git a/be/test/core/data_type_serde/data_type_serde_struct_test.cpp b/be/test/core/data_type_serde/data_type_serde_struct_test.cpp index e583b50e4302f2..5158ab01c75f12 100644 --- 
a/be/test/core/data_type_serde/data_type_serde_struct_test.cpp +++ b/be/test/core/data_type_serde/data_type_serde_struct_test.cpp @@ -144,10 +144,9 @@ TEST_F(DataTypeStructSerDeTest, ArrowMemNotAligned) { EXPECT_EQ(string_values_address % 4, 1); // 5.Test read_column_from_arrow - std::vector vector_columns; - vector_columns.emplace_back(ColumnInt32::create()); - vector_columns.emplace_back(ColumnString::create()); - auto ser_col = ColumnStruct::create(vector_columns); + // Create sub-columns exclusively (no extra refs) so that ColumnStruct::get_column() + // non-const path does not find use_count > 1. + auto ser_col = ColumnStruct::create(Columns {ColumnInt32::create(), ColumnString::create()}); cctz::time_zone tz; DataTypeSerDeSPtrs elem_serdes = {serde_int32, serde_str}; Strings field_names = {"int_field", "string_field"}; diff --git a/be/test/core/jsonb/serialize_test.cpp b/be/test/core/jsonb/serialize_test.cpp index 2419383b0eddb3..cc721cc618d044 100644 --- a/be/test/core/jsonb/serialize_test.cpp +++ b/be/test/core/jsonb/serialize_test.cpp @@ -35,6 +35,7 @@ #include "agent/be_exec_version_manager.h" #include "common/exception.h" +#include "core/assert_cast.h" #include "core/block/block.h" #include "core/block/column_with_type_and_name.h" #include "core/column/column.h" @@ -621,6 +622,45 @@ static void fill_block_with_array_string(Block& block) { block.insert(test_array_string); } +TEST(BlockSerializeCowTest, JsonbToBlockMutatesDestinationOwnerColumn) { + TabletSchema schema; + TabletColumn c1; + c1.set_name("k1"); + c1.set_unique_id(1); + c1.set_type(FieldType::OLAP_FIELD_TYPE_INT); + schema.append_column(c1); + + auto src_column = ColumnInt32::create(); + src_column->insert_value(10); + src_column->insert_value(20); + auto int_type = std::make_shared(); + Block src_block; + src_block.insert({std::move(src_column), int_type, "k1"}); + + auto jsonb_column = ColumnString::create(); + auto serdes = create_data_type_serdes(src_block.get_data_types()); + 
JsonbSerializeUtil::block_to_jsonb(schema, src_block, *jsonb_column, src_block.columns(), + serdes, {}); + + ColumnPtr shared_column = ColumnInt32::create(); + const auto* original_column = shared_column.get(); + Block dst_block; + dst_block.insert({shared_column, int_type, "k1"}); + + std::unordered_map col_uid_to_idx {{1, 0}}; + std::vector default_values(1); + THROW_IF_ERROR(JsonbSerializeUtil::jsonb_to_block(serdes, *jsonb_column, col_uid_to_idx, + dst_block, default_values, {})); + + EXPECT_NE(dst_block.get_by_position(0).column.get(), original_column); + EXPECT_EQ(shared_column->size(), 0); + EXPECT_EQ(dst_block.rows(), 2); + EXPECT_EQ(assert_cast(*dst_block.get_by_position(0).column).get_data()[0], + 10); + EXPECT_EQ(assert_cast(*dst_block.get_by_position(0).column).get_data()[1], + 20); +} + TEST(BlockSerializeTest, Array) { TabletSchema schema; TabletColumn c1; diff --git a/be/test/exec/column_type_convert_test.cpp b/be/test/exec/column_type_convert_test.cpp index 5178cddbd59d2f..f336a245568cbd 100644 --- a/be/test/exec/column_type_convert_test.cpp +++ b/be/test/exec/column_type_convert_test.cpp @@ -63,8 +63,7 @@ TEST_F(ColumnTypeConverterTest, TestIntegerWideningConversions) { src_data.push_back(std::numeric_limits::max()); src_data.push_back(std::numeric_limits::min()); - auto dst_col = dst_type->create_column(); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = dst_type->create_column(); Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); ASSERT_TRUE(st.ok()); @@ -96,8 +95,7 @@ TEST_F(ColumnTypeConverterTest, TestIntegerWideningConversions) { src_data.push_back(std::numeric_limits::max()); src_data.push_back(std::numeric_limits::min()); - auto dst_col = dst_type->create_column(); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = dst_type->create_column(); Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); ASSERT_TRUE(st.ok()); @@ -130,8 +128,7 @@ 
TEST_F(ColumnTypeConverterTest, TestIntegerNarrowingConversions) { src_data.push_back(std::numeric_limits::max()); src_data.push_back(std::numeric_limits::min()); - auto dst_col = dst_type->create_column(); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = dst_type->create_column(); Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); ASSERT_TRUE(st.ok()); @@ -160,8 +157,7 @@ TEST_F(ColumnTypeConverterTest, TestIntegerNarrowingConversions) { src_data.push_back(std::numeric_limits::max() + 1); src_data.push_back(std::numeric_limits::min() - 1); - auto dst_col = nullable_dst_type->create_column(); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = nullable_dst_type->create_column(); Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); ASSERT_TRUE(!st.ok()); @@ -189,8 +185,7 @@ TEST_F(ColumnTypeConverterTest, TestFloatingPointConversions) { src_data.push_back((1L << 23) - 1); src_data.push_back(1L << 23); src_data.push_back((1L << 23) + 1); - auto dst_nullable_col = nullable_dst_type->create_column(); - auto mutable_dst = dst_nullable_col->assume_mutable(); + auto mutable_dst = nullable_dst_type->create_column(); auto& nullable_col = static_cast(*mutable_dst); auto& nested_col = static_cast(nullable_col.get_nested_column()); auto& null_map = nullable_col.get_null_map_data(); @@ -232,8 +227,7 @@ TEST_F(ColumnTypeConverterTest, TestFloatingPointConversions) { src_col->insert_data("invalid", 7); // Invalid string src_col->insert_data("", 0); // Empty string - auto dst_nullable_col = nullable_dst_type->create_column(); - auto mutable_dst = dst_nullable_col->assume_mutable(); + auto mutable_dst = nullable_dst_type->create_column(); auto& nullable_col = static_cast(*mutable_dst); auto& nested_col = static_cast(nullable_col.get_nested_column()); @@ -289,8 +283,7 @@ TEST_F(ColumnTypeConverterTest, TestFloatingPointConversions) { src_data.push_back(-std::numeric_limits::infinity()); 
src_data.push_back(std::numeric_limits::quiet_NaN()); - auto dst_col = dst_type->create_column(); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = dst_type->create_column(); Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); ASSERT_TRUE(st.ok()); @@ -325,8 +318,7 @@ TEST_F(ColumnTypeConverterTest, TestDecimalConversions) { src_data.push_back(Decimal32(12345)); // 123.45 src_data.push_back(Decimal32(-12345)); // -123.45 - auto dst_col = dst_type->create_column(); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = dst_type->create_column(); Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); ASSERT_TRUE(st.ok()); @@ -354,8 +346,7 @@ TEST_F(ColumnTypeConverterTest, TestDecimalConversions) { src_data.push_back(Decimal32(12345)); // 123.45 src_data.push_back(Decimal32(-67890)); // -678.90 - auto dst_col = dst_type->create_column(); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = dst_type->create_column(); Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); ASSERT_TRUE(st.ok()); @@ -385,8 +376,7 @@ TEST_F(ColumnTypeConverterTest, TestDecimalConversions) { src_data.push_back(Decimal64(12345678901234)); // Normal value: 1234567890.1234 src_data.push_back(Decimal64(-98765432109876)); // Negative value: -9876543210.9876 - auto dst_col = dst_type->create_column(); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = dst_type->create_column(); // Perform conversion Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); @@ -419,8 +409,7 @@ TEST_F(ColumnTypeConverterTest, TestDecimalConversions) { src_data.push_back(Decimal32(-12345)); // -123.45 src_data.push_back(Decimal32(23345)); // Too large 233.45 - auto dst_col = nullable_dst_type->create_column(); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = nullable_dst_type->create_column(); auto& nullable_col = static_cast(*mutable_dst); auto& nested_col = 
static_cast(nullable_col.get_nested_column()); auto& null_map = nullable_col.get_null_map_data(); @@ -458,8 +447,7 @@ TEST_F(ColumnTypeConverterTest, TestDecimalConversions) { src_data.push_back(Decimal128V3(-102345)); src_data.push_back(Decimal128V3(203345)); - auto dst_col = nullable_dst_type->create_column(); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = nullable_dst_type->create_column(); auto& nullable_col = static_cast(*mutable_dst); auto& nested_col = static_cast(nullable_col.get_nested_column()); auto& null_map = nullable_col.get_null_map_data(); @@ -499,8 +487,7 @@ TEST_F(ColumnTypeConverterTest, TestDecimalConversions) { src_data.push_back(Decimal256(655363345)); src_data.push_back(Decimal256(3333333333332345)); - auto dst_col = nullable_dst_type->create_column(); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = nullable_dst_type->create_column(); auto& nullable_col = static_cast(*mutable_dst); auto& nested_col = static_cast(nullable_col.get_nested_column()); auto& null_map = nullable_col.get_null_map_data(); @@ -539,8 +526,7 @@ TEST_F(ColumnTypeConverterTest, TestDecimalConversions) { src_data.push_back(-67890); // -678.90 after scaling src_data.push_back(0); // Zero check - auto dst_col = dst_type->create_column(); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = dst_type->create_column(); Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); ASSERT_TRUE(st.ok()); @@ -569,8 +555,7 @@ TEST_F(ColumnTypeConverterTest, TestDecimalConversions) { src_data.push_back(-67890); src_data.push_back(0); - auto dst_col = dst_type->create_column(); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = dst_type->create_column(); Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); ASSERT_TRUE(st.ok()); @@ -598,8 +583,7 @@ TEST_F(ColumnTypeConverterTest, TestDecimalConversions) { src_data.push_back(-123); // -678.90 after scaling src_data.push_back(0); // 
Zero check - auto dst_col = dst_type->create_column(); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = dst_type->create_column(); Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); ASSERT_TRUE(st.ok()); @@ -628,8 +612,7 @@ TEST_F(ColumnTypeConverterTest, TestDecimalConversions) { src_data.push_back(-123); // -678.90 after scaling src_data.push_back(0); // Zero check - auto dst_col = nullable_dst_type->create_column(); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = nullable_dst_type->create_column(); auto& nullable_col = static_cast(*mutable_dst); auto& null_map = nullable_col.get_null_map_data(); @@ -667,8 +650,7 @@ TEST_F(ColumnTypeConverterTest, TestDecimalConversions) { src_data.push_back(Decimal64(-999999999)); // Edge case: negative max for Decimal32 src_data.push_back(Decimal64(-1000000000)); // Out of range (underflow) - auto dst_col = nullable_dst_type->create_column(); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = nullable_dst_type->create_column(); auto& nullable_col = static_cast(*mutable_dst); auto& null_map = nullable_col.get_null_map_data(); @@ -698,9 +680,8 @@ TEST_F(ColumnTypeConverterTest, TestDecimalConversions) { src_data.push_back(Decimal64(999999999)); // Edge case: max for Decimal32 src_data.push_back(Decimal64(-999999999)); // Edge case: negative max for Decimal32 ASSERT_EQ(3, src_data.size()); - auto dst_col = nullable_dst_type->create_column(); - dst_col->resize(0); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = nullable_dst_type->create_column(); + mutable_dst->resize(0); auto& nullable_col = static_cast(*mutable_dst); auto& nested_col = static_cast(nullable_col.get_nested_column()); @@ -743,9 +724,8 @@ TEST_F(ColumnTypeConverterTest, TestDecimalConversions) { src_data.push_back(std::numeric_limits::infinity()); // Infinity src_data.push_back(std::numeric_limits::quiet_NaN()); // NaN - auto dst_col = 
nullable_dst_type->create_column(); - dst_col->resize(0); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = nullable_dst_type->create_column(); + mutable_dst->resize(0); auto& nullable_col = static_cast(*mutable_dst); auto& nested_col = static_cast(nullable_col.get_nested_column()); @@ -791,9 +771,8 @@ TEST_F(ColumnTypeConverterTest, TestDecimalConversions) { src_col->insert_data("0.0", 3); // Zero value src_col->insert_data("9999999999.99", 13); // Edge case: max valid value within precision - auto dst_col = nullable_dst_type->create_column(); - dst_col->resize(0); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = nullable_dst_type->create_column(); + mutable_dst->resize(0); auto& nullable_col = static_cast(*mutable_dst); auto& nested_col = static_cast(nullable_col.get_nested_column()); @@ -837,8 +816,7 @@ TEST_F(ColumnTypeConverterTest, TestStringConversions) { src_data.push_back(std::numeric_limits::min()); src_data.push_back(0); - auto dst_col = dst_type->create_column(); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = dst_type->create_column(); Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); ASSERT_TRUE(st.ok()); @@ -869,8 +847,7 @@ TEST_F(ColumnTypeConverterTest, TestStringConversions) { src_data.push_back(std::numeric_limits::infinity()); src_data.push_back(std::numeric_limits::quiet_NaN()); - auto dst_col = dst_type->create_column(); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = dst_type->create_column(); Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); ASSERT_TRUE(st.ok()); @@ -901,8 +878,7 @@ TEST_F(ColumnTypeConverterTest, TestStringConversions) { src_col->insert_data("not a number", 11); src_col->insert_data("2147483648", 10); // Greater than INT32_MAX - auto dst_col = nullable_dst_type->create_column(); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = nullable_dst_type->create_column(); auto& nullable_col 
= static_cast(*mutable_dst); auto& nested_col = static_cast(nullable_col.get_nested_column()); auto& null_map = nullable_col.get_null_map_data(); @@ -935,9 +911,8 @@ TEST_F(ColumnTypeConverterTest, TestStringConversions) { src_data.push_back(Decimal32(-67890)); // -678.90 src_data.push_back(Decimal32(0)); // Zero - auto dst_col = dst_type->create_column(); - dst_col->resize(0); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = dst_type->create_column(); + mutable_dst->resize(0); Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); ASSERT_TRUE(st.ok()); @@ -972,9 +947,8 @@ TEST_F(ColumnTypeConverterTest, TestStringConversions) { value.unchecked_set_time(2070, 1, 1, 0, 0, 0); src_data.push_back(*reinterpret_cast(&value)); // "2070-01-01" in days format - auto dst_col = dst_type->create_column(); - dst_col->resize(0); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = dst_type->create_column(); + mutable_dst->resize(0); Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); ASSERT_TRUE(st.ok()); @@ -1005,8 +979,7 @@ TEST_F(ColumnTypeConverterTest, TestStringConversions) { src_data.push_back(1); // true src_data.push_back(0); // false - auto dst_col = dst_type->create_column(); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = dst_type->create_column(); Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); ASSERT_TRUE(st.ok()); @@ -1041,8 +1014,7 @@ TEST_F(ColumnTypeConverterTest, TestStringConversions) { src_col->insert_data("1.5", 3); // Hive: null (not an integer) src_col->insert_data("", 0); // Hive: null - auto dst_col = nullable_dst_type->create_column(); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = nullable_dst_type->create_column(); auto& nullable_col = static_cast(*mutable_dst); auto& nested_col = static_cast( @@ -1090,8 +1062,7 @@ TEST_F(ColumnTypeConverterTest, TestStringToIntegerTypes) { src_col->insert_data("abc", 3); // 
Invalid - should be NULL src_col->insert_data("", 0); // Empty - should be NULL - auto dst_col = nullable_dst_type->create_column(); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = nullable_dst_type->create_column(); auto& nullable_col = static_cast(*mutable_dst); auto& nested_col = static_cast(nullable_col.get_nested_column()); @@ -1137,8 +1108,7 @@ TEST_F(ColumnTypeConverterTest, TestStringToIntegerTypes) { src_col->insert_data("-32769", 6); // Underflow - should be NULL src_col->insert_data("123.45", 6); // Decimal - should be NULL - auto dst_col = nullable_dst_type->create_column(); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = nullable_dst_type->create_column(); auto& nullable_col = static_cast(*mutable_dst); auto& nested_col = static_cast(nullable_col.get_nested_column()); @@ -1182,8 +1152,7 @@ TEST_F(ColumnTypeConverterTest, TestStringToIntegerTypes) { src_col->insert_data("1000000", 7); // Million src_col->insert_data("2147483648", 10); // Overflow - should be NULL - auto dst_col = nullable_dst_type->create_column(); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = nullable_dst_type->create_column(); auto& nullable_col = static_cast(*mutable_dst); auto& nested_col = static_cast(nullable_col.get_nested_column()); @@ -1227,8 +1196,7 @@ TEST_F(ColumnTypeConverterTest, TestStringToIntegerTypes) { src_col->insert_data("9223372036854775808", 19); // Overflow - should be NULL src_col->insert_data("123abc", 6); // Invalid - should be NULL - auto dst_col = nullable_dst_type->create_column(); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = nullable_dst_type->create_column(); auto& nullable_col = static_cast(*mutable_dst); auto& nested_col = static_cast(nullable_col.get_nested_column()); @@ -1270,8 +1238,7 @@ TEST_F(ColumnTypeConverterTest, TestStringToIntegerTypes) { src_col->insert_data("0", 1); // Zero src_col->insert_data("123e45", 6); // Scientific notation - should be NULL - 
auto dst_col = nullable_dst_type->create_column(); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = nullable_dst_type->create_column(); auto& nullable_col = static_cast(*mutable_dst); auto& nested_col = static_cast(nullable_col.get_nested_column()); @@ -1460,8 +1427,7 @@ TEST_F(ColumnTypeConverterTest, TestDateTimeV2ToNumericConversions) { // 2024-01-01 00:00:00.123456 auto src_col = make_datetimev2_col({{2024, 1, 1, 0, 0, 0, 123456}}); - auto dst_col = dst_type->create_column(); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = dst_type->create_column(); Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); ASSERT_TRUE(st.ok()); @@ -1484,8 +1450,7 @@ TEST_F(ColumnTypeConverterTest, TestDateTimeV2ToNumericConversions) { // 1970-01-01 00:00:00.000000 // 3000-01-01 00:00:00.000000 auto src_col = make_datetimev2_col({{1970, 1, 1, 0, 0, 0, 0}, {3000, 1, 1, 0, 0, 0, 0}}); - auto dst_col = nullable_dst_type->create_column(); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = nullable_dst_type->create_column(); auto& nullable_col = static_cast(*mutable_dst); auto& null_map = nullable_col.get_null_map_data(); null_map.resize_fill(src_col->size(), 0); @@ -1512,8 +1477,7 @@ TEST_F(ColumnTypeConverterTest, TestDateTimeV2ToNumericConversions) { // 3000-01-01 00:00:00.000000(会溢出int32) auto src_col = make_datetimev2_col({{3000, 1, 1, 0, 0, 0, 0}}); - auto dst_col = dst_type->create_column(); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = dst_type->create_column(); Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); ASSERT_FALSE(st.ok()); @@ -1545,8 +1509,7 @@ TEST_F(ColumnTypeConverterTest, TestDateTimeV2ToNumericConversions) { src_col->get_data().push_back(parse_datetimev2_str("2022-05-01 13:00:00")); src_col->get_data().push_back(parse_datetimev2_str("2022-05-01 14:00:00")); - auto dst_col = nullable_dst_type->create_column(); - auto mutable_dst = 
dst_col->assume_mutable(); + auto mutable_dst = nullable_dst_type->create_column(); auto& nullable_col = static_cast(*mutable_dst); auto& null_map = nullable_col.get_null_map_data(); null_map.resize_fill(src_col->size(), 0); @@ -1715,10 +1678,9 @@ TEST_F(ColumnTypeConverterTest, TestEmptyColumnConversions) { ASSERT_FALSE(converter->is_consistent()); auto src_col = ColumnInt32::create(); // Empty column (no data) - auto dst_col = dst_type->create_column(); - auto mutable_dst = dst_col->assume_mutable(); + auto mutable_dst = dst_type->create_column(); src_col->resize(0); - dst_col->resize(0); + mutable_dst->resize(0); // Perform conversion Status st = converter->convert(reinterpret_cast(src_col), mutable_dst); ASSERT_TRUE(st.ok()); diff --git a/be/test/exec/common/schema_util_rowset_test.cpp b/be/test/exec/common/schema_util_rowset_test.cpp index cf99c9824956c5..18bc77ccb3f883 100644 --- a/be/test/exec/common/schema_util_rowset_test.cpp +++ b/be/test/exec/common/schema_util_rowset_test.cpp @@ -172,6 +172,7 @@ static void fill_block_with_test_data(Block* block, int size) { auto v4 = Field::create_field(i); columns[4]->insert(v4); } + block->set_columns(std::move(columns)); } static int64_t inc_id = 1000; static RowsetWriterContext rowset_writer_context(const std::unique_ptr& data_dir, diff --git a/be/test/exec/common/schema_util_test.cpp b/be/test/exec/common/schema_util_test.cpp index 273a7238fe8177..0416311bd0c2a8 100644 --- a/be/test/exec/common/schema_util_test.cpp +++ b/be/test/exec/common/schema_util_test.cpp @@ -817,7 +817,8 @@ TEST_F(SchemaUtilTest, TestCastColumnEdgeCases) { // Test casting from variant to variant auto variant_column = ColumnVariant::create(10, false); - variant_column->create_root(nullable_array_type, nullable_array_column->assume_mutable()); + // nullable_array_column is also stored in array_col.column (use_count=2), so mutate() clones it. 
+ variant_column->create_root(nullable_array_type, IColumn::mutate(nullable_array_column)); ColumnWithTypeAndName variant_col; variant_col.type = variant_type; @@ -1947,14 +1948,14 @@ TEST_F(SchemaUtilTest, parse_and_materialize_variant_columns_ambiguous_paths) { // Prepare the variant column with the string column as root ColumnVariant::Subcolumns dynamic_subcolumns; dynamic_subcolumns.create_root( - ColumnVariant::Subcolumn(string_col->assume_mutable(), string_type, true)); + ColumnVariant::Subcolumn(std::move(string_col), string_type, true)); auto variant_col = ColumnVariant::create(0, false, std::move(dynamic_subcolumns)); auto variant_type = std::make_shared(); // Construct the block Block block; - block.insert(ColumnWithTypeAndName(variant_col->assume_mutable(), variant_type, "v")); + block.insert(ColumnWithTypeAndName(std::move(variant_col), variant_type, "v")); // The variant column is at index 0 std::vector variant_pos = {0}; diff --git a/be/test/exec/connector/vjdbc_connector_test.cpp b/be/test/exec/connector/vjdbc_connector_test.cpp index 16ff8689aafaf2..5ec3fb7046a5a9 100644 --- a/be/test/exec/connector/vjdbc_connector_test.cpp +++ b/be/test/exec/connector/vjdbc_connector_test.cpp @@ -16,6 +16,7 @@ // under the License. 
#include +#include #include #include @@ -33,26 +34,55 @@ class JdbcUtilsTest : public ::testing::Test { void SetUp() override { // Save original config and environment original_jdbc_drivers_dir_ = config::jdbc_drivers_dir; - original_doris_home_ = getenv("DORIS_HOME"); + const char* original_doris_home = getenv("DORIS_HOME"); + if (original_doris_home != nullptr) { + original_doris_home_ = original_doris_home; + has_original_doris_home_ = true; + } // Set DORIS_HOME for testing - setenv("DORIS_HOME", "/tmp/test_doris", 1); + temp_home_ = std::filesystem::temp_directory_path() / + ("doris_jdbc_utils_test_" + std::to_string(::getpid())); + second_temp_home_ = std::filesystem::temp_directory_path() / + ("doris_jdbc_utils_test_second_" + std::to_string(::getpid())); + std::filesystem::remove_all(temp_home_); + std::filesystem::remove_all(second_temp_home_); + std::filesystem::create_directories(temp_home_); + setenv("DORIS_HOME", temp_home_.c_str(), 1); } void TearDown() override { // Restore original config and environment config::jdbc_drivers_dir = original_jdbc_drivers_dir_; - if (original_doris_home_) { - setenv("DORIS_HOME", original_doris_home_, 1); + if (has_original_doris_home_) { + setenv("DORIS_HOME", original_doris_home_.c_str(), 1); } else { unsetenv("DORIS_HOME"); } + std::filesystem::remove_all(temp_home_); + std::filesystem::remove_all(second_temp_home_); + } + + std::string default_driver_dir() const { + return (temp_home_ / "plugins" / "jdbc_drivers").string(); + } + + std::string old_driver_dir() const { return (temp_home_ / "jdbc_drivers").string(); } + + std::string second_default_driver_dir() const { + return (second_temp_home_ / "plugins" / "jdbc_drivers").string(); + } + + std::string second_old_driver_dir() const { + return (second_temp_home_ / "jdbc_drivers").string(); } -private: std::string original_jdbc_drivers_dir_; - const char* original_doris_home_ = nullptr; + std::string original_doris_home_; + bool has_original_doris_home_ = false; + 
std::filesystem::path temp_home_; + std::filesystem::path second_temp_home_; }; // Test resolve_driver_url with absolute URLs @@ -79,10 +109,10 @@ TEST_F(JdbcUtilsTest, TestResolveDriverUrlWithRelativeUrl) { std::string result_url; // Set config to default value to trigger the default directory logic - config::jdbc_drivers_dir = "/tmp/test_doris/plugins/jdbc_drivers"; + config::jdbc_drivers_dir = default_driver_dir(); // Create the target directory and file for testing - std::string dir = "/tmp/test_doris/plugins/jdbc_drivers"; + std::string dir = default_driver_dir(); std::string file_path = dir + "/mysql-connector.jar"; // Create directory and file @@ -104,10 +134,10 @@ TEST_F(JdbcUtilsTest, TestResolveDriverUrlWithRelativeUrl) { // Test resolve_driver_url with default directory TEST_F(JdbcUtilsTest, TestResolveWithDefaultConfig) { - config::jdbc_drivers_dir = "/tmp/test_doris/plugins/jdbc_drivers"; + config::jdbc_drivers_dir = default_driver_dir(); // Create the target directory and file for testing - std::string dir = "/tmp/test_doris/plugins/jdbc_drivers"; + std::string dir = default_driver_dir(); std::string file_path = dir + "/mysql-connector.jar"; std::filesystem::create_directories(dir); @@ -138,9 +168,9 @@ TEST_F(JdbcUtilsTest, TestResolveWithCustomConfig) { } TEST_F(JdbcUtilsTest, TestDefaultDirectoryFileExistsPath) { - config::jdbc_drivers_dir = "/tmp/test_doris/plugins/jdbc_drivers"; + config::jdbc_drivers_dir = default_driver_dir(); - std::string dir = "/tmp/test_doris/plugins/jdbc_drivers"; + std::string dir = default_driver_dir(); std::string file_path = dir + "/existing-driver.jar"; std::filesystem::create_directories(dir); @@ -160,10 +190,10 @@ TEST_F(JdbcUtilsTest, TestDefaultDirectoryFileExistsPath) { } TEST_F(JdbcUtilsTest, TestFallbackToOldDirectory) { - config::jdbc_drivers_dir = "/tmp/test_doris/plugins/jdbc_drivers"; + config::jdbc_drivers_dir = default_driver_dir(); // Create only the old directory and file (not the new one) - std::string 
old_dir = "/tmp/test_doris/jdbc_drivers"; + std::string old_dir = old_driver_dir(); std::string file_path = old_dir + "/fallback-driver.jar"; std::filesystem::create_directories(old_dir); @@ -183,10 +213,11 @@ TEST_F(JdbcUtilsTest, TestFallbackToOldDirectory) { } TEST_F(JdbcUtilsTest, TestPathConstruction) { - setenv("DORIS_HOME", "/tmp/test_doris2", 1); - config::jdbc_drivers_dir = "/tmp/test_doris2/plugins/jdbc_drivers"; + std::filesystem::create_directories(second_temp_home_); + setenv("DORIS_HOME", second_temp_home_.c_str(), 1); + config::jdbc_drivers_dir = second_default_driver_dir(); - std::string old_dir = "/tmp/test_doris2/jdbc_drivers"; + std::string old_dir = second_old_driver_dir(); std::string file_path = old_dir + "/test.jar"; std::filesystem::create_directories(old_dir); @@ -223,9 +254,9 @@ TEST_F(JdbcUtilsTest, TestEdgeCases) { } TEST_F(JdbcUtilsTest, TestMultipleCallsConsistency) { - config::jdbc_drivers_dir = "/tmp/test_doris/plugins/jdbc_drivers"; + config::jdbc_drivers_dir = default_driver_dir(); - std::string dir = "/tmp/test_doris/plugins/jdbc_drivers"; + std::string dir = default_driver_dir(); std::string file_path = dir + "/same-driver.jar"; std::filesystem::create_directories(dir); diff --git a/be/test/exec/operator/agg_operator_test.cpp b/be/test/exec/operator/agg_operator_test.cpp index 945fd0f9f1fc81..ae750013c84423 100644 --- a/be/test/exec/operator/agg_operator_test.cpp +++ b/be/test/exec/operator/agg_operator_test.cpp @@ -379,6 +379,96 @@ TEST_F(AggOperatorTestWithGroupBy, test_need_finalize) { } } +TEST_F(AggOperatorTestWithGroupBy, test_need_finalize_mem_reuse_with_shared_output_columns) { + OperatorContext ctx; + auto sink_op = std::make_shared(); + sink_op->_aggregate_evaluators.push_back(create_mock_agg_fn_evaluator( + ctx.pool, MockSlotRef::create_mock_contexts(1, std::make_shared()), + false, false)); + sink_op->_pool = &ctx.pool; + EXPECT_TRUE(sink_op->prepare(&ctx.state).ok()); + sink_op->_probe_expr_ctxs = + 
MockSlotRef::create_mock_contexts(0, std::make_shared()); + + auto source_op = std::make_shared(); + source_op->mock_row_descriptor.reset(new MockRowDescriptor { + {std::make_shared(), std::make_shared()}, &ctx.pool}); + source_op->_without_key = false; + source_op->_needs_finalize = true; + EXPECT_TRUE(source_op->prepare(&ctx.state).ok()); + + auto shared_state = init_sink_and_source(sink_op, source_op, ctx); + + { + Block block { + ColumnHelper::create_column_with_name({1, 1, 2, 2, 2, 3}), + ColumnHelper::create_column_with_name({1, 1, 100, 100, 100, 1000})}; + auto st = sink_op->sink(&ctx.state, &block, true); + EXPECT_TRUE(st.ok()) << st.msg(); + } + + Block block {ColumnHelper::create_column_with_name({}), + ColumnHelper::create_column_with_name({})}; + auto old_key_column = block.get_by_position(0).column; + auto old_value_column = block.get_by_position(1).column; + bool eos = false; + auto st = source_op->get_block(&ctx.state, &block, &eos); + ASSERT_TRUE(st.ok()) << st.to_string(); + + EXPECT_TRUE(eos); + EXPECT_EQ(old_key_column->size(), 0); + EXPECT_EQ(old_value_column->size(), 0); + EXPECT_TRUE(ColumnHelper::block_equal( + block, Block {ColumnHelper::create_column_with_name({1, 2, 3}), + ColumnHelper::create_column_with_name({2, 300, 1000})})); +} + +TEST_F(AggOperatorTestWithGroupBy, test_no_need_finalize_mem_reuse_with_shared_output_columns) { + OperatorContext ctx; + auto sink_op = std::make_shared(); + sink_op->_aggregate_evaluators.push_back(create_mock_agg_fn_evaluator( + ctx.pool, MockSlotRef::create_mock_contexts(1, std::make_shared()), + false, false)); + sink_op->_pool = &ctx.pool; + EXPECT_TRUE(sink_op->prepare(&ctx.state).ok()); + sink_op->_probe_expr_ctxs = + MockSlotRef::create_mock_contexts(0, std::make_shared()); + + auto source_op = std::make_shared(); + source_op->mock_row_descriptor.reset(new MockRowDescriptor { + {std::make_shared(), std::make_shared()}, &ctx.pool}); + source_op->_without_key = false; + source_op->_needs_finalize = 
false; + EXPECT_TRUE(source_op->prepare(&ctx.state).ok()); + + auto shared_state = init_sink_and_source(sink_op, source_op, ctx); + + { + Block block { + ColumnHelper::create_column_with_name({1, 1, 2, 2, 2, 3}), + ColumnHelper::create_column_with_name({1, 1, 100, 100, 100, 1000})}; + auto st = sink_op->sink(&ctx.state, &block, true); + EXPECT_TRUE(st.ok()) << st.msg(); + } + + const auto& aggregate_function = sink_op->_aggregate_evaluators[0]->function(); + auto serialized_type = aggregate_function->get_serialized_type(); + Block block {ColumnHelper::create_column_with_name({}), + ColumnWithTypeAndName(aggregate_function->create_serialize_column(), + serialized_type, "")}; + auto old_key_column = block.get_by_position(0).column; + auto old_value_column = block.get_by_position(1).column; + bool eos = false; + auto st = source_op->get_block(&ctx.state, &block, &eos); + ASSERT_TRUE(st.ok()) << st.to_string(); + + EXPECT_TRUE(eos); + EXPECT_EQ(block.rows(), 3); + EXPECT_EQ(old_key_column->size(), 0); + EXPECT_EQ(old_value_column->size(), 0); + EXPECT_TRUE(check_and_get_column(*block.get_by_position(1).column)); +} + TEST_F(AggOperatorTestWithGroupBy, test_2_phase) { /* group by key | sum(value) diff --git a/be/test/exec/operator/datagen_operator_test.cpp b/be/test/exec/operator/datagen_operator_test.cpp index 2130fe7b4358f0..84b59466f355b8 100644 --- a/be/test/exec/operator/datagen_operator_test.cpp +++ b/be/test/exec/operator/datagen_operator_test.cpp @@ -121,4 +121,37 @@ TEST(DataGenSourceOperatorTest, testConst) { ColumnHelper::create_column({5, 5, 5, 5, 5, 5, 5, 5, 5, 5}))); } +TEST(DataGenSourceOperatorTest, testMemReuseWithSharedOutputColumn) { + OperatorContext ctx; + + DataGenSourceOperatorX op; + + std::vector data_types {std::make_shared()}; + auto row_desc = std::make_unique(data_types, &ctx.pool); + op._tuple_id = 0; + op._tuple_desc = row_desc->tuple_desc_map[0]; + + TDataGenScanRange data_gen_scan_range; + data_gen_scan_range.numbers_params.useConst = 
false; + data_gen_scan_range.numbers_params.constValue = 0; + data_gen_scan_range.numbers_params.totalNumbers = 10; + + TScanRangeParams scan_range_param; + scan_range_param.scan_range.data_gen_scan_range = data_gen_scan_range; + + OperatorHelper::init_local_state(ctx, op, {scan_range_param}); + + Block block {ColumnHelper::create_column_with_name({})}; + auto old_output_column = block.get_by_position(0).column; + bool eos = false; + auto st = op.get_block(&ctx.state, &block, &eos); + ASSERT_TRUE(st.ok()) << st.to_string(); + + EXPECT_TRUE(eos); + EXPECT_EQ(old_output_column->size(), 0); + EXPECT_TRUE(ColumnHelper::column_equal( + block.get_by_position(0).column, + ColumnHelper::create_column({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}))); +} + } // namespace doris diff --git a/be/test/exec/operator/set_operator_test.cpp b/be/test/exec/operator/set_operator_test.cpp index 7e3bcfef30d3be..80f40d66e49a75 100644 --- a/be/test/exec/operator/set_operator_test.cpp +++ b/be/test/exec/operator/set_operator_test.cpp @@ -17,6 +17,7 @@ #include +#include #include #include @@ -384,6 +385,53 @@ TEST_F(ExceptOperatorTest, test_output_null_batsh_size) { } } +TEST_F(ExceptOperatorTest, test_mem_reuse_with_shared_output_column) { + state->_batch_size = 2; + init_op(2, {std::make_shared()}); + + sink_op->_child_exprs = + MockSlotRef::create_mock_contexts(DataTypes {std::make_shared()}); + probe_sink_ops[0]->_child_exprs = + MockSlotRef::create_mock_contexts(DataTypes {std::make_shared()}); + + init_local_state(); + + { + Block block = ColumnHelper::create_block({1, 2, 3}); + auto st = sink_op->sink(state.get(), &block, true); + EXPECT_TRUE(st.ok()) << st.to_string(); + } + + { + Block block = ColumnHelper::create_block({}); + auto st = probe_sink_ops[0]->sink(states[0].get(), &block, true); + EXPECT_TRUE(st.ok()) << st.to_string(); + } + + Block output {ColumnHelper::create_column_with_name({})}; + auto old_output_column = output.get_by_position(0).column; + + bool eos = false; + std::vector 
values; + while (!eos) { + auto st = source_op->get_block(state.get(), &output, &eos); + ASSERT_TRUE(st.ok()) << st.to_string(); + ASSERT_GT(output.rows(), 0); + + const auto& column = output.get_by_position(0).column; + for (size_t i = 0; i < column->size(); ++i) { + values.push_back(column->get_int(i)); + } + if (!eos) { + output.clear_column_data(); + } + } + + EXPECT_EQ(old_output_column->size(), 0); + std::sort(values.begin(), values.end()); + EXPECT_EQ(values, std::vector({1, 2, 3})); +} + TEST_F(IntersectOperatorTest, test_sink_large_string_data_over_4g) { // Test that SetSinkOperatorX can handle string data exceeding 4GB total size. // This exercises the convert_column_if_overflow path in _process_build_block. @@ -688,4 +736,4 @@ TEST_F(ExceptOperatorTest, test_refresh_hash_table) { EXPECT_TRUE(block.empty()); } } -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/test/exprs/aggregate/agg_array_agg_test.cpp b/be/test/exprs/aggregate/agg_array_agg_test.cpp index 97b6d99456b02a..6b27a2b55b03fd 100644 --- a/be/test/exprs/aggregate/agg_array_agg_test.cpp +++ b/be/test/exprs/aggregate/agg_array_agg_test.cpp @@ -42,6 +42,7 @@ #include "exprs/aggregate/agg_function_test.h" #include "exprs/aggregate/aggregate_function.h" #include "exprs/aggregate/aggregate_function_simple_factory.h" +#include "exprs/aggregate/aggregate_function_sort.h" #include "gtest/gtest_pred_impl.h" namespace doris { @@ -122,4 +123,102 @@ TEST_F(AggregateFunctionArrayAggTest, test_array_agg_astr_nullable) { ColumnWithTypeAndName(std::move(array_column), array_data_type, "column")); } +TEST_F(AggregateFunctionArrayAggTest, test_array_agg_astr_foreach) { + auto data_type = make_nullable(std::make_shared()); + auto array_data_type = std::make_shared(data_type); + create_agg("array_agg_foreach", false, {array_data_type}, array_data_type); + + auto off_column = ColumnOffset64::create(); + auto data_column = data_type->create_column(); + std::vector offs = {0, 4}; 
+ std::vector vals = {1, 2, 3}; + for (size_t i = 1; i < offs.size(); ++i) { + off_column->insert_data((const char*)(&offs[i]), 0); + } + data_column->insert_default(); + for (auto& v : vals) { + data_column->insert_data((const char*)(&v), sizeof(v)); + } + auto array_column = ColumnArray::create(data_column->clone(), off_column->clone()); + + auto off_column2 = ColumnOffset64::create(); + std::vector offs2 = {0, 1, 2, 3, 4}; + for (size_t i = 1; i < offs2.size(); ++i) { + off_column2->insert_data((const char*)(&offs2[i]), 0); + } + + auto array_array_data_type = std::make_shared(array_data_type); + auto array_array_off_column = ColumnOffset64::create(); + array_array_off_column->insert_value(4); + auto array_array_column = + ColumnArray::create(ColumnArray::create(data_column->clone(), off_column2->clone()), + array_array_off_column->clone()); + + execute(Block({ColumnWithTypeAndName(array_column->clone(), array_data_type, "")}), + ColumnWithTypeAndName(std::move(array_array_column), array_array_data_type, "column")); +} + +TEST_F(AggregateFunctionArrayAggTest, test_array_agg_aint64_foreach) { + auto data_type = make_nullable(std::make_shared()); + auto array_data_type = std::make_shared(data_type); + create_agg("array_agg_foreach", false, {array_data_type}, array_data_type); + + auto off_column = ColumnOffset64::create(); + auto data_column = data_type->create_column(); + std::vector offs = {0, 4}; + std::vector vals = {1, 2, 3}; + for (size_t i = 1; i < offs.size(); ++i) { + off_column->insert_data((const char*)(&offs[i]), 0); + } + data_column->insert_default(); + for (auto& v : vals) { + data_column->insert_data((const char*)(&v), sizeof(v)); + } + auto array_column = ColumnArray::create(data_column->clone(), off_column->clone()); + + auto off_column2 = ColumnOffset64::create(); + std::vector offs2 = {0, 1, 2, 3, 4}; + for (size_t i = 1; i < offs2.size(); ++i) { + off_column2->insert_data((const char*)(&offs2[i]), 0); + } + + auto array_array_data_type = 
std::make_shared(array_data_type); + auto array_array_off_column = ColumnOffset64::create(); + array_array_off_column->insert_value(4); + auto array_array_column = + ColumnArray::create(ColumnArray::create(data_column->clone(), off_column2->clone()), + array_array_off_column->clone()); + + execute(Block({ColumnWithTypeAndName(array_column->clone(), array_data_type, "")}), + ColumnWithTypeAndName(std::move(array_array_column), array_array_data_type, "column")); +} + +TEST(AggregateFunctionSortDataTest, merge_does_not_share_rhs_block) { + auto data_type = std::make_shared(); + Block prototype({ColumnWithTypeAndName(data_type->create_column(), data_type, "value"), + ColumnWithTypeAndName(data_type->create_column(), data_type, "sort_key")}); + SortDescription sort_desc {SortColumnDescription(1, 1, 1)}; + + AggregateFunctionSortData lhs(sort_desc, prototype); + AggregateFunctionSortData rhs1(sort_desc, prototype); + AggregateFunctionSortData rhs2(sort_desc, prototype); + + auto values = ColumnInt64::create(); + values->insert_value(10); + values->insert_value(20); + auto sort_keys = ColumnInt64::create(); + sort_keys->insert_value(2); + sort_keys->insert_value(1); + const IColumn* row0[] = {values.get(), sort_keys.get()}; + const IColumn* row1[] = {values.get(), sort_keys.get()}; + + rhs1.add(row0, 2, 0); + rhs2.add(row1, 2, 1); + + lhs.merge(rhs1); + ASSERT_NO_THROW(lhs.merge(rhs2)); + ASSERT_EQ(lhs.block.rows(), 2); + ASSERT_EQ(rhs1.block.rows(), 1); +} + } // namespace doris diff --git a/be/test/exprs/function/function_variant_element_test.cpp b/be/test/exprs/function/function_variant_element_test.cpp index c16e2844ad9f44..98265fda3fa76e 100644 --- a/be/test/exprs/function/function_variant_element_test.cpp +++ b/be/test/exprs/function/function_variant_element_test.cpp @@ -40,7 +40,7 @@ TEST(function_variant_element_test, extract_from_sparse_column) { sparse_column_offsets.push_back(sparse_column_keys->size()); variant_ptr->get_subcolumn({})->insert_default(); 
variant_ptr->set_num_rows(1); - variant_ptr->get_doc_value_column()->assume_mutable()->resize(1); + variant_ptr->get_doc_value_column_mutable().resize(1); ColumnPtr result; ColumnPtr index_column_ptr = ColumnString::create(); @@ -61,4 +61,4 @@ TEST(function_variant_element_test, extract_from_sparse_column) { EXPECT_EQ(result_string, "{\"age\":\"John\",\"name\":\"John\"}"); } -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/test/format/json/json_reader_test.cpp b/be/test/format/json/json_reader_test.cpp index 920d3ea0f9f041..c04785f6fcd2b7 100644 --- a/be/test/format/json/json_reader_test.cpp +++ b/be/test/format/json/json_reader_test.cpp @@ -19,6 +19,12 @@ #include +#include "core/assert_cast.h" +#include "core/block/block.h" +#include "core/column/column_nullable.h" +#include "core/column/column_vector.h" +#include "core/data_type/data_type_nullable.h" +#include "core/data_type/data_type_number.h" #include "format/json/new_json_reader.h" namespace doris { @@ -87,4 +93,79 @@ TEST(NewJsonReaderSetBatchSizeTest, SetBatchSizeViaGenericInterface) { EXPECT_EQ(base_reader->get_batch_size(), 4096U); } +TEST(NewJsonReaderCowTest, AppendNullForMalformedJsonMutatesOwnerColumn) { + auto nested_column = ColumnInt32::create(); + nested_column->insert_value(7); + auto null_map = ColumnUInt8::create(); + null_map->insert_value(0); + ColumnPtr shared_column = ColumnNullable::create(std::move(nested_column), std::move(null_map)); + const auto* original_column = shared_column.get(); + + Block block; + block.insert({shared_column, make_nullable(std::make_shared()), "c0"}); + + ASSERT_TRUE(json_reader_detail::append_null_for_malformed_json(block).ok()); + ASSERT_EQ(block.rows(), 2); + EXPECT_NE(block.get_by_position(0).column.get(), original_column); + + const auto& result_column = + assert_cast(*block.get_by_position(0).column); + EXPECT_FALSE(result_column.is_null_at(0)); + EXPECT_TRUE(result_column.is_null_at(1)); + + const auto& 
original_nullable = assert_cast(*shared_column); + EXPECT_EQ(original_nullable.size(), 1); + EXPECT_FALSE(original_nullable.is_null_at(0)); +} + +TEST(NewJsonReaderCowTest, TruncateBlockToRowsMutatesOwnerColumn) { + auto nested_column = ColumnInt32::create(); + nested_column->insert_value(7); + nested_column->insert_value(8); + auto null_map = ColumnUInt8::create(); + null_map->insert_value(0); + null_map->insert_value(0); + ColumnPtr shared_column = ColumnNullable::create(std::move(nested_column), std::move(null_map)); + const auto* original_column = shared_column.get(); + + Block block; + block.insert({shared_column, make_nullable(std::make_shared()), "c0"}); + + json_reader_detail::truncate_block_to_rows(block, 1); + ASSERT_EQ(block.rows(), 1); + EXPECT_NE(block.get_by_position(0).column.get(), original_column); + + const auto& result_column = + assert_cast(*block.get_by_position(0).column); + EXPECT_EQ(result_column.size(), 1); + EXPECT_FALSE(result_column.is_null_at(0)); + + const auto& original_nullable = assert_cast(*shared_column); + EXPECT_EQ(original_nullable.size(), 2); +} + +TEST(NewJsonReaderCowTest, PopBackLastInsertedValueMutatesOwnerColumn) { + auto column = ColumnInt32::create(); + column->insert_value(7); + column->insert_value(8); + ColumnPtr shared_column = std::move(column); + const auto* original_column = shared_column.get(); + + Block block; + block.insert({shared_column, std::make_shared(), "c0"}); + + json_reader_detail::pop_back_last_inserted_value(block, 0); + ASSERT_EQ(block.rows(), 1); + EXPECT_NE(block.get_by_position(0).column.get(), original_column); + + const auto& result_column = assert_cast(*block.get_by_position(0).column); + EXPECT_EQ(result_column.size(), 1); + EXPECT_EQ(result_column.get_data()[0], 7); + + const auto& original_int_column = assert_cast(*shared_column); + EXPECT_EQ(original_int_column.size(), 2); + EXPECT_EQ(original_int_column.get_data()[0], 7); + EXPECT_EQ(original_int_column.get_data()[1], 8); +} + } // 
namespace doris diff --git a/be/test/format/native/native_reader_writer_test.cpp b/be/test/format/native/native_reader_writer_test.cpp index 5d1d7dc207cef7..cf568354925b17 100644 --- a/be/test/format/native/native_reader_writer_test.cpp +++ b/be/test/format/native/native_reader_writer_test.cpp @@ -530,6 +530,7 @@ TEST_F(NativeReaderWriterTest, round_trip_native_file_large_rows) { MutableBlock merged_mutable(&merged_block); Status add_st = merged_mutable.add_rows(&dst_block, 0, read_rows); ASSERT_TRUE(add_st.ok()) << add_st; + merged_block.set_columns(std::move(merged_mutable.mutable_columns())); total_read_rows += read_rows; } } diff --git a/be/test/format/orc/orc_reader_fill_data_test.cpp b/be/test/format/orc/orc_reader_fill_data_test.cpp index eab2b97e38a3bd..84ba8af9da3382 100644 --- a/be/test/format/orc/orc_reader_fill_data_test.cpp +++ b/be/test/format/orc/orc_reader_fill_data_test.cpp @@ -19,6 +19,7 @@ #include +#include "core/assert_cast.h" #include "core/column/column_array.h" #include "core/column/column_struct.h" #include "core/data_type/data_type_array.h" @@ -124,6 +125,43 @@ TEST_F(OrcReaderFillDataTest, TestFillLongColumnWithNull) { } } +TEST_F(OrcReaderFillDataTest, SchemaChangeNullableNullMapUsesAppendedSlice) { + std::vector values = {10, 20, 30}; + std::vector nulls = {true, false, true}; + auto batch = create_long_batch(values.size(), values, nulls); + auto orc_type_ptr = createPrimitiveType(orc::TypeKind::LONG); + + auto nested_column = ColumnFloat64::create(); + nested_column->insert_value(1); + nested_column->insert_value(2); + auto null_map_column = ColumnUInt8::create(); + null_map_column->insert_value(0); + null_map_column->insert_value(0); + ColumnPtr doris_column = + ColumnNullable::create(std::move(nested_column), std::move(null_map_column)); + auto data_type = make_nullable(std::make_shared()); + + TFileScanRangeParams params; + TFileRangeDesc range; + auto reader = OrcReader::create_unique(params, range, 4064, "", nullptr, nullptr, 
true); + + Status status = reader->_orc_column_to_doris_column( + "test_schema_change_nullable", doris_column, data_type, const_node, orc_type_ptr.get(), + batch.get(), values.size()); + + ASSERT_TRUE(status.ok()) << status.to_string(); + const auto* nullable_column = assert_cast(doris_column.get()); + ASSERT_EQ(nullable_column->size(), 5); + + const auto& null_map = nullable_column->get_null_map_data(); + ASSERT_EQ(null_map.size(), 5); + EXPECT_EQ(null_map[0], 0); + EXPECT_EQ(null_map[1], 0); + EXPECT_EQ(null_map[2], 1); + EXPECT_EQ(null_map[3], 0); + EXPECT_EQ(null_map[4], 1); +} + TEST_F(OrcReaderFillDataTest, ComplexTypeConversionTest) { // Array类型测试 { @@ -478,4 +516,4 @@ TEST_F(OrcReaderFillDataTest, ComplexTypeConversionTest) { "+-------------------+\n"); } } -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/test/format/parquet/parquet_column_convert_test.cpp b/be/test/format/parquet/parquet_column_convert_test.cpp index e50d28ef0e7930..256d1bb3a49bd6 100644 --- a/be/test/format/parquet/parquet_column_convert_test.cpp +++ b/be/test/format/parquet/parquet_column_convert_test.cpp @@ -22,6 +22,9 @@ #include #include +#include "core/assert_cast.h" +#include "core/column/column_nullable.h" +#include "core/column/column_vector.h" #include "util/timezone_utils.h" namespace doris::parquet { @@ -38,6 +41,8 @@ static FieldSchema make_timestamp_field_schema(bool is_adjusted_to_utc) { } TEST(ParquetColumnConvertTest, InitFixedOffsetDetection) { + TimezoneUtils::load_timezones_to_cache(); + cctz::time_zone utc_tz; cctz::time_zone plus8_tz; cctz::time_zone shanghai_tz; @@ -119,4 +124,176 @@ TEST(ParquetColumnConvertTest, LookupPathMatchesOriginal) { } } +TEST(ParquetColumnConvertTest, AlignNullMapUsesAppendedSourceSlice) { + auto dst_nested_column = ColumnFloat64::create(); + dst_nested_column->insert_value(1); + dst_nested_column->insert_value(2); + auto dst_null_map_column = ColumnUInt8::create(); + 
dst_null_map_column->insert_value(0); + dst_null_map_column->insert_value(0); + ColumnPtr dst_column = + ColumnNullable::create(std::move(dst_nested_column), std::move(dst_null_map_column)); + + auto src_nested_column = ColumnInt64::create(); + for (int i = 0; i < 5; ++i) { + src_nested_column->insert_value(i); + } + auto src_null_map_column = ColumnUInt8::create(); + src_null_map_column->insert_value(0); + src_null_map_column->insert_value(0); + src_null_map_column->insert_value(1); + src_null_map_column->insert_value(0); + src_null_map_column->insert_value(1); + ColumnPtr src_column = + ColumnNullable::create(std::move(src_nested_column), std::move(src_null_map_column)); + + align_null_map(src_column, dst_column, 2, 3, 2); + + const auto* nullable_column = assert_cast(dst_column.get()); + const auto& null_map = nullable_column->get_null_map_data(); + ASSERT_EQ(null_map.size(), 5); + EXPECT_EQ(null_map[0], 0); + EXPECT_EQ(null_map[1], 0); + EXPECT_EQ(null_map[2], 1); + EXPECT_EQ(null_map[3], 0); + EXPECT_EQ(null_map[4], 1); +} + +TEST(ParquetColumnConvertTest, AlignNullMapUsesNullablePrefixForCachedReadColumn) { + auto dst_nested_column = ColumnFloat64::create(); + dst_nested_column->insert_value(1); + dst_nested_column->insert_value(2); + auto dst_null_map_column = ColumnUInt8::create(); + dst_null_map_column->insert_value(0); + dst_null_map_column->insert_value(0); + ColumnPtr dst_column = + ColumnNullable::create(std::move(dst_nested_column), std::move(dst_null_map_column)); + + auto src_nested_column = ColumnInt64::create(); + src_nested_column->insert_value(10); + src_nested_column->insert_value(11); + src_nested_column->insert_value(12); + auto src_null_map_column = ColumnUInt8::create(); + src_null_map_column->insert_value(0); + src_null_map_column->insert_value(0); + src_null_map_column->insert_value(1); + src_null_map_column->insert_value(0); + src_null_map_column->insert_value(1); + ColumnPtr src_column = + 
ColumnNullable::create(std::move(src_nested_column), std::move(src_null_map_column)); + + align_null_map(src_column, dst_column, get_null_map_size_or_inner_column_size(dst_column), 3, + get_appended_null_map_start(src_column, 3)); + + const auto* nullable_column = assert_cast(dst_column.get()); + const auto& null_map = nullable_column->get_null_map_data(); + ASSERT_EQ(null_map.size(), 5); + EXPECT_EQ(null_map[0], 0); + EXPECT_EQ(null_map[1], 0); + EXPECT_EQ(null_map[2], 1); + EXPECT_EQ(null_map[3], 0); + EXPECT_EQ(null_map[4], 1); +} + +TEST(ParquetColumnConvertTest, ConvertNullableFloatToDoubleUsesCurrentSourceNullMapSlice) { + FieldSchema field_schema; + field_schema.name = "float_col"; + field_schema.parquet_schema.__set_name("float_col"); + field_schema.parquet_schema.__set_type(tparquet::Type::FLOAT); + field_schema.data_type = DataTypeFactory::instance().create_data_type(TYPE_FLOAT, true); + + const auto dst_type = DataTypeFactory::instance().create_data_type(TYPE_DOUBLE, true); + auto converter = PhysicalToLogicalConverter::get_converter( + &field_schema, field_schema.data_type, dst_type, nullptr); + ASSERT_TRUE(converter->support()) << converter->get_error_msg(); + + auto src_nested_column = ColumnFloat32::create(); + src_nested_column->insert_value(1.5F); + src_nested_column->insert_value(2.5F); + src_nested_column->insert_value(3.5F); + auto src_null_map_column = ColumnUInt8::create(); + src_null_map_column->insert_value(0); + src_null_map_column->insert_value(1); + src_null_map_column->insert_value(0); + ColumnPtr src_column = + ColumnNullable::create(std::move(src_nested_column), std::move(src_null_map_column)); + + ColumnPtr dst_column = dst_type->create_column(); + ColumnPtr dst_alias = dst_column; + + ASSERT_TRUE(converter->convert(src_column, field_schema.data_type, dst_type, dst_column, false) + .ok()); + + const auto* nullable_column = assert_cast(dst_column.get()); + ASSERT_EQ(nullable_column->size(), 3); + const auto& null_map = 
nullable_column->get_null_map_data(); + EXPECT_EQ(null_map[0], 0); + EXPECT_EQ(null_map[1], 1); + EXPECT_EQ(null_map[2], 0); + + const auto& nested_column = + assert_cast(nullable_column->get_nested_column()); + EXPECT_DOUBLE_EQ(nested_column.get_data()[0], 1.5); + EXPECT_DOUBLE_EQ(nested_column.get_data()[2], 3.5); + + const auto* original_dst = assert_cast(dst_alias.get()); + EXPECT_EQ(original_dst->size(), 0); +} + +TEST(ParquetColumnConvertTest, + ConvertNullableFixedLengthStringToVarbinaryPreservesExistingDstPrefix) { + FieldSchema field_schema; + field_schema.name = "fixed_binary_col"; + field_schema.parquet_schema.__set_name("fixed_binary_col"); + field_schema.parquet_schema.__set_type(tparquet::Type::FIXED_LEN_BYTE_ARRAY); + field_schema.parquet_schema.__set_type_length(2); + field_schema.data_type = DataTypeFactory::instance().create_data_type(TYPE_STRING, true); + + const auto dst_type = DataTypeFactory::instance().create_data_type(TYPE_VARBINARY, true); + auto converter = PhysicalToLogicalConverter::get_converter( + &field_schema, field_schema.data_type, dst_type, nullptr); + ASSERT_TRUE(converter->support()) << converter->get_error_msg(); + + auto src_nested_column = ColumnUInt8::create(); + for (auto ch : std::string("aabbcc")) { + src_nested_column->insert_value(static_cast(ch)); + } + auto src_null_map_column = ColumnUInt8::create(); + src_null_map_column->insert_value(0); + src_null_map_column->insert_value(1); + src_null_map_column->insert_value(0); + ColumnPtr src_column = + ColumnNullable::create(std::move(src_nested_column), std::move(src_null_map_column)); + + auto dst_nested_column = ColumnVarbinary::create(); + dst_nested_column->insert_data("zz", 2); + auto dst_null_map_column = ColumnUInt8::create(); + dst_null_map_column->insert_value(0); + ColumnPtr dst_column = + ColumnNullable::create(std::move(dst_nested_column), std::move(dst_null_map_column)); + ColumnPtr dst_alias = dst_column; + + ASSERT_TRUE(converter->convert(src_column, 
field_schema.data_type, dst_type, dst_column, false) + .ok()); + + const auto* nullable_column = assert_cast(dst_column.get()); + ASSERT_EQ(nullable_column->size(), 4); + const auto& null_map = nullable_column->get_null_map_data(); + EXPECT_EQ(null_map[0], 0); + EXPECT_EQ(null_map[1], 0); + EXPECT_EQ(null_map[2], 1); + EXPECT_EQ(null_map[3], 0); + + const auto& nested_column = + assert_cast(nullable_column->get_nested_column()); + ASSERT_EQ(nested_column.size(), 4); + EXPECT_EQ(nested_column.get_data_at(0).to_string(), "zz"); + EXPECT_EQ(nested_column.get_data_at(1).to_string(), "aa"); + EXPECT_EQ(nested_column.get_data_at(3).to_string(), "cc"); + + const auto* original_dst = assert_cast(dst_alias.get()); + ASSERT_EQ(original_dst->size(), 1); + EXPECT_EQ(original_dst->get_data_at(0).to_string(), "zz"); +} + } // namespace doris::parquet diff --git a/be/test/format/parquet/parquet_thrift_test.cpp b/be/test/format/parquet/parquet_thrift_test.cpp index 7171fe3b63cd16..4bbe6dc09e41e9 100644 --- a/be/test/format/parquet/parquet_thrift_test.cpp +++ b/be/test/format/parquet/parquet_thrift_test.cpp @@ -161,8 +161,8 @@ TEST_F(ParquetThriftReaderTest, complex_nested_file) { static int fill_nullable_column(ColumnPtr& doris_column, level_t* definitions, size_t num_values) { CHECK(doris_column->is_nullable()); - auto* nullable_column = - const_cast(static_cast(doris_column.get())); + doris_column = IColumn::mutate(std::move(doris_column)); + auto* nullable_column = assert_cast(doris_column->assume_mutable().get()); NullMap& map_data = nullable_column->get_null_map_data(); int null_cnt = 0; for (int i = 0; i < num_values; ++i) { @@ -193,6 +193,9 @@ static Status get_column_values(io::FileReaderSPtr file_reader, tparquet::Column ColumnPtr src_column = _converter->get_physical_column( field_schema->physical_type, field_schema->data_type, doris_column, data_type, false); + if (_converter->read_directly_into_dst_logical_column()) { + src_column = std::move(doris_column); + } 
DataTypePtr& resolved_type = _converter->get_physical_type(); io::BufferedFileStreamReader stream_reader(file_reader, start_offset, chunk_size, 1024); @@ -217,10 +220,10 @@ static Status get_column_values(io::FileReaderSPtr file_reader, tparquet::Column if (src_column->is_nullable()) { // fill nullable values fill_nullable_column(src_column, definitions, rows); - auto* nullable_column = - const_cast(static_cast(src_column.get())); + auto* nullable_column = assert_cast(src_column->assume_mutable().get()); data_column = nullable_column->get_nested_column_ptr(); } else { + src_column = IColumn::mutate(std::move(src_column)); data_column = src_column->assume_mutable(); } FilterMap filter_map; diff --git a/be/test/load/delta_writer/delta_writer_cluster_key_test.cpp b/be/test/load/delta_writer/delta_writer_cluster_key_test.cpp index bbfe9e18a35830..c8a40194ff0803 100644 --- a/be/test/load/delta_writer/delta_writer_cluster_key_test.cpp +++ b/be/test/load/delta_writer/delta_writer_cluster_key_test.cpp @@ -214,7 +214,8 @@ static void generate_data(Block* block, int8_t k1, int16_t k2, int32_t seq) { columns[3]->insert_data((const char*)&c4_int, sizeof(c4)); int32_t c5 = seq; - columns[4]->insert_data((const char*)&c5, sizeof(c2)); + columns[4]->insert_data((const char*)&c5, sizeof(c5)); + block->set_columns(std::move(columns)); } class TestDeltaWriterClusterKey : public ::testing::Test { diff --git a/be/test/load/delta_writer/delta_writer_test.cpp b/be/test/load/delta_writer/delta_writer_test.cpp index 08cd0f7c7e579a..0ce52ceea706eb 100644 --- a/be/test/load/delta_writer/delta_writer_test.cpp +++ b/be/test/load/delta_writer/delta_writer_test.cpp @@ -461,15 +461,16 @@ static void generate_data(Block* block, int8_t k1, int16_t k2, int32_t seq) { {"2020-07-16 19:39:43", 19}, c3, nullptr, p); } int64_t c3_int = c3.to_int64(); - columns[2]->insert_data((const char*)&c3_int, sizeof(c3)); + columns[2]->insert_data((const char*)&c3_int, sizeof(c3_int)); DateV2Value c4; 
c4.unchecked_set_time(2022, 6, 6, 0, 0, 0, 0); uint32_t c4_int = c4.to_date_int_val(); - columns[3]->insert_data((const char*)&c4_int, sizeof(c4)); + columns[3]->insert_data((const char*)&c4_int, sizeof(c4_int)); int32_t c5 = seq; - columns[4]->insert_data((const char*)&c5, sizeof(c2)); + columns[4]->insert_data((const char*)&c5, sizeof(c5)); + block->set_columns(std::move(columns)); } class TestDeltaWriter : public ::testing::Test { @@ -670,6 +671,7 @@ TEST_F(TestDeltaWriter, vec_write) { date_v2_int = date_v2.to_date_int_val(); columns[21]->insert_data((const char*)&date_v2_int, sizeof(date_v2_int)); + block.set_columns(std::move(columns)); res = delta_writer->write(&block, {0}); ASSERT_TRUE(res.ok()); } diff --git a/be/test/load/memtable/memtable_memory_limiter_test.cpp b/be/test/load/memtable/memtable_memory_limiter_test.cpp index 1d5c1238335346..f3566448a1f29b 100644 --- a/be/test/load/memtable/memtable_memory_limiter_test.cpp +++ b/be/test/load/memtable/memtable_memory_limiter_test.cpp @@ -165,6 +165,7 @@ TEST_F(MemTableMemoryLimiterTest, handle_memtable_flush_test) { int32_t k3 = -2147483647; columns[2]->insert_data((const char*)&k3, sizeof(k3)); + block.set_columns(std::move(columns)); res = delta_writer->write(&block, {0}); ASSERT_TRUE(res.ok()); } diff --git a/be/test/runtime/snapshot_loader_test.cpp b/be/test/runtime/snapshot_loader_test.cpp index 6c320d225f5e44..209ab1139a406b 100644 --- a/be/test/runtime/snapshot_loader_test.cpp +++ b/be/test/runtime/snapshot_loader_test.cpp @@ -214,6 +214,7 @@ static void add_rowset(int64_t tablet_id, int32_t schema_hash, int64_t partition auto columns = block.mutate_columns(); int16_t c1 = value; columns[0]->insert_data((const char*)&c1, sizeof(c1)); + block.set_columns(std::move(columns)); Status res = delta_writer->write(&block, {0}); EXPECT_TRUE(res.ok()); diff --git a/be/test/runtime/stream_load_parquet_test.cpp b/be/test/runtime/stream_load_parquet_test.cpp index bf9a35c2a64111..62e280f1e80a64 100644 --- 
a/be/test/runtime/stream_load_parquet_test.cpp +++ b/be/test/runtime/stream_load_parquet_test.cpp @@ -15,6 +15,10 @@ // specific language governing permissions and limitations // under the License. +#include + +#include + #include "gtest/gtest.h" #include "load/load_path_mgr.h" #include "runtime/exec_env.h" @@ -27,18 +31,19 @@ class LoadPathMgrTest : public testing::Test { _exec_env = ExecEnv::GetInstance(); _load_path_mgr = std::make_unique(_exec_env); - // create tmp file - _test_dir = "/tmp/test_clean_file"; - _test_dir1 = "/tmp/test_clean_file/mini_download"; - _test_dir2 = "/tmp/test_clean_file1/mini_download/test.parquet"; - - auto result = io::global_local_filesystem()->delete_directory_or_file(_test_dir1); - result = io::global_local_filesystem()->create_directory(_test_dir1); - EXPECT_TRUE(result.ok()); + auto test_root = std::filesystem::temp_directory_path() / + ("doris_load_path_mgr_test_" + std::to_string(::getpid())); + _test_dir = test_root.string(); + _test_dir1 = _test_dir + "/mini_download"; + _test_dir2 = _test_dir1 + "/test.parquet"; - result = io::global_local_filesystem()->delete_directory_or_file(_test_dir2); - result = io::global_local_filesystem()->create_directory(_test_dir2); - EXPECT_TRUE(result.ok()); + std::error_code ec; + std::filesystem::remove_all(_test_dir, ec); + ASSERT_FALSE(ec) << ec.message(); + std::filesystem::create_directories(_test_dir1, ec); + ASSERT_FALSE(ec) << ec.message(); + std::filesystem::create_directories(_test_dir2, ec); + ASSERT_FALSE(ec) << ec.message(); const_cast&>(_exec_env->store_paths()).emplace_back(_test_dir, 1024); } @@ -46,6 +51,9 @@ class LoadPathMgrTest : public testing::Test { void TearDown() override { const_cast&>(_exec_env->store_paths()).clear(); _load_path_mgr->stop(); + std::error_code ec; + std::filesystem::remove_all(_test_dir, ec); + EXPECT_FALSE(ec) << ec.message(); _exec_env->destroy(); } @@ -96,4 +104,4 @@ TEST_F(LoadPathMgrTest, CheckDiskSpaceTest) { EXPECT_FALSE(exists); } -} // 
namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/test/storage/adaptive_thread_pool_controller_test.cpp b/be/test/storage/adaptive_thread_pool_controller_test.cpp index 06d79629330e14..6c4e42fea05897 100644 --- a/be/test/storage/adaptive_thread_pool_controller_test.cpp +++ b/be/test/storage/adaptive_thread_pool_controller_test.cpp @@ -19,6 +19,7 @@ #include +#include #include #include "common/config.h" @@ -44,15 +45,19 @@ class AdaptiveThreadPoolControllerTest : public testing::Test { void SetUp() override { _original_enable_adaptive = config::enable_adaptive_flush_threads; + int num_cpus = std::thread::hardware_concurrency(); + if (num_cpus <= 0) num_cpus = 1; + int max_threads = std::max(64, num_cpus * 4); + ASSERT_TRUE(ThreadPoolBuilder("TestPool") .set_min_threads(2) - .set_max_threads(64) + .set_max_threads(max_threads) .build(&_pool) .ok()); ASSERT_TRUE(ThreadPoolBuilder("TestPool2") .set_min_threads(2) - .set_max_threads(64) + .set_max_threads(max_threads) .build(&_pool2) .ok()); } diff --git a/be/test/storage/compaction/ordered_data_compaction_test.cpp b/be/test/storage/compaction/ordered_data_compaction_test.cpp index fa050f6a68b40e..2f3d654023a825 100644 --- a/be/test/storage/compaction/ordered_data_compaction_test.cpp +++ b/be/test/storage/compaction/ordered_data_compaction_test.cpp @@ -317,6 +317,7 @@ class OrderedDataCompactionTest : public ::testing::Test { } num_rows++; } + block.set_columns(std::move(columns)); auto s = rowset_writer->add_block(&block); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); @@ -586,6 +587,7 @@ TEST_F(OrderedDataCompactionTest, test_index_disk_size) { } num_rows++; } + block.set_columns(std::move(columns)); auto s = rowset_writer->add_block(&block); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); diff --git a/be/test/storage/compaction/segcompaction_mow_test.cpp b/be/test/storage/compaction/segcompaction_mow_test.cpp index 760a5d953aa693..13b836cd5a38d3 100644 --- 
a/be/test/storage/compaction/segcompaction_mow_test.cpp +++ b/be/test/storage/compaction/segcompaction_mow_test.cpp @@ -103,6 +103,14 @@ class SegCompactionMoWTest : public ::testing::TestWithParam { protected: OlapReaderStatistics _stats; + Status add_block_with_columns(RowsetWriter* rowset_writer, Block* block, + MutableColumns* columns) { + block->set_columns(std::move(*columns)); + auto st = rowset_writer->add_block(block); + *columns = block->mutate_columns(); + return st; + } + bool check_dir(std::vector& vec) { std::vector result; for (const auto& entry : std::filesystem::directory_iterator(lTestDir)) { @@ -358,7 +366,7 @@ TEST_P(SegCompactionMoWTest, SegCompactionThenRead) { } } } - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -458,7 +466,7 @@ TEST_F(SegCompactionMoWTest, SegCompactionInterleaveWithBig_ooooOOoOooooooooO) { rows_mark_deleted++; } } - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -486,7 +494,7 @@ TEST_F(SegCompactionMoWTest, SegCompactionInterleaveWithBig_ooooOOoOooooooooO) { rows_mark_deleted++; } } - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -514,7 +522,7 @@ TEST_F(SegCompactionMoWTest, SegCompactionInterleaveWithBig_ooooOOoOooooooooO) { rows_mark_deleted++; } } - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -542,7 +550,7 @@ TEST_F(SegCompactionMoWTest, SegCompactionInterleaveWithBig_ooooOOoOooooooooO) { rows_mark_deleted++; } } - s = rowset_writer->add_block(&block); + s = 
add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -573,7 +581,7 @@ TEST_F(SegCompactionMoWTest, SegCompactionInterleaveWithBig_ooooOOoOooooooooO) { } unique_keys.emplace(k1, rid); } - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -610,7 +618,7 @@ TEST_F(SegCompactionMoWTest, SegCompactionInterleaveWithBig_ooooOOoOooooooooO) { rows_mark_deleted++; } } - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -688,7 +696,7 @@ TEST_F(SegCompactionMoWTest, SegCompactionInterleaveWithBig_OoOoO) { rows_mark_deleted++; } } - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -716,7 +724,7 @@ TEST_F(SegCompactionMoWTest, SegCompactionInterleaveWithBig_OoOoO) { rows_mark_deleted++; } } - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -744,7 +752,7 @@ TEST_F(SegCompactionMoWTest, SegCompactionInterleaveWithBig_OoOoO) { rows_mark_deleted++; } } - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -772,7 +780,7 @@ TEST_F(SegCompactionMoWTest, SegCompactionInterleaveWithBig_OoOoO) { rows_mark_deleted++; } } - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -800,7 +808,7 @@ 
TEST_F(SegCompactionMoWTest, SegCompactionInterleaveWithBig_OoOoO) { rows_mark_deleted++; } } - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -873,7 +881,7 @@ TEST_F(SegCompactionMoWTest, SegCompactionNotTrigger) { rows_mark_deleted++; } } - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); diff --git a/be/test/storage/compaction/segcompaction_test.cpp b/be/test/storage/compaction/segcompaction_test.cpp index 6c43fea684cb43..15dc86c89d74b2 100644 --- a/be/test/storage/compaction/segcompaction_test.cpp +++ b/be/test/storage/compaction/segcompaction_test.cpp @@ -124,6 +124,14 @@ class SegCompactionTest : public testing::Test { protected: OlapReaderStatistics _stats; + Status add_block_with_columns(RowsetWriter* rowset_writer, Block* block, + MutableColumns* columns) { + block->set_columns(std::move(*columns)); + auto st = rowset_writer->add_block(block); + *columns = block->mutate_columns(); + return st; + } + bool check_dir(std::vector& vec) { std::vector result; for (const auto& entry : std::filesystem::directory_iterator(lTestDir)) { @@ -316,7 +324,7 @@ TEST_F(SegCompactionTest, SegCompactionThenRead) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); columns[2]->insert_data((const char*)&k3, sizeof(k3)); } - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -437,7 +445,7 @@ TEST_F(SegCompactionTest, SegCompactionInterleaveWithBig_ooooOOoOooooooooO) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); columns[2]->insert_data((const char*)&k3, sizeof(k3)); } - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), 
&block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -455,7 +463,7 @@ TEST_F(SegCompactionTest, SegCompactionInterleaveWithBig_ooooOOoOooooooooO) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); columns[2]->insert_data((const char*)&k3, sizeof(k3)); } - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -473,7 +481,7 @@ TEST_F(SegCompactionTest, SegCompactionInterleaveWithBig_ooooOOoOooooooooO) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); columns[2]->insert_data((const char*)&k3, sizeof(k3)); } - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -491,7 +499,7 @@ TEST_F(SegCompactionTest, SegCompactionInterleaveWithBig_ooooOOoOooooooooO) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); columns[2]->insert_data((const char*)&k3, sizeof(k3)); } - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -509,7 +517,7 @@ TEST_F(SegCompactionTest, SegCompactionInterleaveWithBig_ooooOOoOooooooooO) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); columns[2]->insert_data((const char*)&k3, sizeof(k3)); } - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -528,7 +536,7 @@ TEST_F(SegCompactionTest, SegCompactionInterleaveWithBig_ooooOOoOooooooooO) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); columns[2]->insert_data((const char*)&k3, sizeof(k3)); } - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); 
EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -591,7 +599,7 @@ TEST_F(SegCompactionTest, SegCompactionInterleaveWithBig_OoOoO) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); columns[2]->insert_data((const char*)&k3, sizeof(k3)); } - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -609,7 +617,7 @@ TEST_F(SegCompactionTest, SegCompactionInterleaveWithBig_OoOoO) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); columns[2]->insert_data((const char*)&k3, sizeof(k3)); } - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -627,7 +635,7 @@ TEST_F(SegCompactionTest, SegCompactionInterleaveWithBig_OoOoO) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); columns[2]->insert_data((const char*)&k3, sizeof(k3)); } - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -645,7 +653,7 @@ TEST_F(SegCompactionTest, SegCompactionInterleaveWithBig_OoOoO) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); columns[2]->insert_data((const char*)&k3, sizeof(k3)); } - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -663,7 +671,7 @@ TEST_F(SegCompactionTest, SegCompactionInterleaveWithBig_OoOoO) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); columns[2]->insert_data((const char*)&k3, sizeof(k3)); } - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ 
-730,7 +738,7 @@ TEST_F(SegCompactionTest, SegCompactionThenReadUniqueTableSmall) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); columns[2]->insert_data((const char*)&k3, sizeof(k3)); - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -754,7 +762,7 @@ TEST_F(SegCompactionTest, SegCompactionThenReadUniqueTableSmall) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); columns[2]->insert_data((const char*)&k3, sizeof(k3)); - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -779,7 +787,7 @@ TEST_F(SegCompactionTest, SegCompactionThenReadUniqueTableSmall) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); columns[2]->insert_data((const char*)&k3, sizeof(k3)); - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -804,7 +812,7 @@ TEST_F(SegCompactionTest, SegCompactionThenReadUniqueTableSmall) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); columns[2]->insert_data((const char*)&k3, sizeof(k3)); - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -817,7 +825,7 @@ TEST_F(SegCompactionTest, SegCompactionThenReadUniqueTableSmall) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); columns[2]->insert_data((const char*)&k3, sizeof(k3)); - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -830,7 +838,7 @@ TEST_F(SegCompactionTest, SegCompactionThenReadUniqueTableSmall) { 
columns[1]->insert_data((const char*)&k2, sizeof(k2)); columns[2]->insert_data((const char*)&k3, sizeof(k3)); - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -997,7 +1005,7 @@ TEST_F(SegCompactionTest, SegCompactionThenReadAggTableSmall) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); columns[2]->insert_data((const char*)&k3, sizeof(k3)); - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -1021,7 +1029,7 @@ TEST_F(SegCompactionTest, SegCompactionThenReadAggTableSmall) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); columns[2]->insert_data((const char*)&k3, sizeof(k3)); - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -1046,7 +1054,7 @@ TEST_F(SegCompactionTest, SegCompactionThenReadAggTableSmall) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); columns[2]->insert_data((const char*)&k3, sizeof(k3)); - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -1071,7 +1079,7 @@ TEST_F(SegCompactionTest, SegCompactionThenReadAggTableSmall) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); columns[2]->insert_data((const char*)&k3, sizeof(k3)); - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -1084,7 +1092,7 @@ TEST_F(SegCompactionTest, SegCompactionThenReadAggTableSmall) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); columns[2]->insert_data((const 
char*)&k3, sizeof(k3)); - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); @@ -1097,7 +1105,7 @@ TEST_F(SegCompactionTest, SegCompactionThenReadAggTableSmall) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); columns[2]->insert_data((const char*)&k3, sizeof(k3)); - s = rowset_writer->add_block(&block); + s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_EQ(Status::OK(), s); diff --git a/be/test/storage/compaction/vertical_compaction_test.cpp b/be/test/storage/compaction/vertical_compaction_test.cpp index 58e0cec0762753..1b4abe72257e76 100644 --- a/be/test/storage/compaction/vertical_compaction_test.cpp +++ b/be/test/storage/compaction/vertical_compaction_test.cpp @@ -106,6 +106,14 @@ class VerticalCompactionTest : public ::testing::Test { ExecEnv::GetInstance()->set_storage_engine(nullptr); } + Status add_block_with_columns(RowsetWriter* rowset_writer, Block* block, + MutableColumns* columns) { + block->set_columns(std::move(*columns)); + auto st = rowset_writer->add_block(block); + *columns = block->mutate_columns(); + return st; + } + TabletSchemaSPtr create_schema(KeysType keys_type = DUP_KEYS, bool without_key = false) { TabletSchemaSPtr tablet_schema = std::make_shared(); TabletSchemaPB tablet_schema_pb; @@ -254,7 +262,7 @@ class VerticalCompactionTest : public ::testing::Test { } num_rows++; } - auto s = rowset_writer->add_block(&block); + auto s = add_block_with_columns(rowset_writer.get(), &block, &columns); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); EXPECT_TRUE(s.ok()); @@ -1222,7 +1230,7 @@ TEST_F(VerticalCompactionTest, TestUniqueKeyVerticalMergeWithNullableSparseColum columns[2]->insert_data((const char*)&delete_sign, sizeof(delete_sign)); } - auto s = rowset_writer->add_block(&block); + auto s = 
add_block_with_columns(rowset_writer.get(), &block, &columns); ASSERT_TRUE(s.ok()) << s; s = rowset_writer->flush(); ASSERT_TRUE(s.ok()) << s; @@ -1387,7 +1395,7 @@ TEST_F(VerticalCompactionTest, TestFooterRawDataBytesAccuracy) { columns[0]->insert_data(reinterpret_cast(&int_val), sizeof(int_val)); columns[1]->insert_data(fixed_string.data(), fixed_string.size()); } - ASSERT_TRUE(rowset_writer->add_block(&block).ok()); + ASSERT_TRUE(add_block_with_columns(rowset_writer.get(), &block, &columns).ok()); ASSERT_TRUE(rowset_writer->flush().ok()); RowsetSharedPtr rowset; @@ -1488,7 +1496,7 @@ TEST_F(VerticalCompactionTest, TestFooterRawDataBytesNullableSparse) { columns[1]->insert_default(); // ColumnNullable default is null } } - ASSERT_TRUE(rowset_writer->add_block(&block).ok()); + ASSERT_TRUE(add_block_with_columns(rowset_writer.get(), &block, &columns).ok()); ASSERT_TRUE(rowset_writer->flush().ok()); RowsetSharedPtr rowset; diff --git a/be/test/storage/index/date_bloom_filter_test.cpp b/be/test/storage/index/date_bloom_filter_test.cpp index 636e7a6848d39b..261c49a92d6595 100644 --- a/be/test/storage/index/date_bloom_filter_test.cpp +++ b/be/test/storage/index/date_bloom_filter_test.cpp @@ -146,6 +146,7 @@ TEST_F(DateBloomFilterTest, query_index_test) { olap_datetime_value = datetime.to_olap_datetime(); columns[0]->insert_many_fix_len_data(reinterpret_cast(&olap_date_value), 1); columns[1]->insert_many_fix_len_data(reinterpret_cast(&olap_datetime_value), 1); + block.set_columns(std::move(columns)); Status st; st = rowset_writer->add_block(&block); @@ -240,6 +241,7 @@ TEST_F(DateBloomFilterTest, in_list_predicate_test) { olap_datetime_value = datetime.to_olap_datetime(); columns[0]->insert_many_fix_len_data(reinterpret_cast(&olap_date_value), 1); columns[1]->insert_many_fix_len_data(reinterpret_cast(&olap_datetime_value), 1); + block.set_columns(std::move(columns)); EXPECT_TRUE(rowset_writer->add_block(&block).ok()); EXPECT_TRUE(rowset_writer->flush().ok()); diff --git 
a/be/test/storage/index/index_builder_test.cpp b/be/test/storage/index/index_builder_test.cpp index c281fd511477fb..96cc6839390e3a 100644 --- a/be/test/storage/index/index_builder_test.cpp +++ b/be/test/storage/index/index_builder_test.cpp @@ -257,6 +257,8 @@ TEST_F(IndexBuilderTest, DropInvertedIndexTest) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); } + block.set_columns(std::move(columns)); + // Add the block to the rowset Status s = rowset_writer->add_block(&block); ASSERT_TRUE(s.ok()) << s.to_string(); @@ -545,6 +547,8 @@ TEST_F(IndexBuilderTest, BuildInvertedIndexAfterWritingDataTest) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); } + block.set_columns(std::move(columns)); + // Add the block to the rowset Status s = rowset_writer->add_block(&block); ASSERT_TRUE(s.ok()) << s.to_string(); @@ -874,6 +878,8 @@ TEST_F(IndexBuilderTest, AddIndexWhenOneExistsTest) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); } + block.set_columns(std::move(columns)); + // Add block to rowset Status s = rowset_writer->add_block(&block); ASSERT_TRUE(s.ok()) << s.to_string(); @@ -1042,6 +1048,8 @@ TEST_F(IndexBuilderTest, AddIndexWhenOneExistsTestV1) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); } + block.set_columns(std::move(columns)); + // Add block to rowset Status s = rowset_writer->add_block(&block); ASSERT_TRUE(s.ok()) << s.to_string(); @@ -1191,6 +1199,8 @@ TEST_F(IndexBuilderTest, MultiSegmentBuildIndexTest) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); } + block.set_columns(std::move(columns)); + // Add the block to the rowset Status s = rowset_writer->add_block(&block); ASSERT_TRUE(s.ok()) << s.to_string(); @@ -1341,6 +1351,8 @@ TEST_F(IndexBuilderTest, NonExistentColumnIndexTest) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); } + block.set_columns(std::move(columns)); + // Add the block to the rowset Status s = rowset_writer->add_block(&block); ASSERT_TRUE(s.ok()) << s.to_string(); @@ -1515,6 +1527,8 @@ 
TEST_F(IndexBuilderTest, RenameColumnIndexTest) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); } + block.set_columns(std::move(columns)); + // Add the block to the rowset Status s = rowset_writer->add_block(&block); ASSERT_TRUE(s.ok()) << s.to_string(); @@ -1669,6 +1683,8 @@ TEST_F(IndexBuilderTest, AddNonExistentColumnIndexWhenOneExistsTest) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); } + block.set_columns(std::move(columns)); + // Add the block to the rowset Status s = rowset_writer->add_block(&block); ASSERT_TRUE(s.ok()) << s.to_string(); @@ -1841,6 +1857,8 @@ TEST_F(IndexBuilderTest, AddNonExistentColumnIndexWhenOneExistsTestV1) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); } + block.set_columns(std::move(columns)); + // Add block to rowset Status s = rowset_writer->add_block(&block); ASSERT_TRUE(s.ok()) << s.to_string(); @@ -1991,6 +2009,8 @@ TEST_F(IndexBuilderTest, NonNullIndexDataTest) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); } + block.set_columns(std::move(columns)); + // Add the block to the rowset Status s = rowset_writer->add_block(&block); ASSERT_TRUE(s.ok()) << s.to_string(); @@ -2115,6 +2135,8 @@ TEST_F(IndexBuilderTest, NonExistentColumnUniqueIdTest) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); } + block.set_columns(std::move(columns)); + // Add the block to the rowset Status s = rowset_writer->add_block(&block); ASSERT_TRUE(s.ok()) << s.to_string(); @@ -2246,6 +2268,8 @@ TEST_F(IndexBuilderTest, DropIndexV1FormatTest) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); } + block.set_columns(std::move(columns)); + // Add the block to the rowset Status s = rowset_writer->add_block(&block); ASSERT_TRUE(s.ok()) << s.to_string(); @@ -2370,6 +2394,8 @@ TEST_F(IndexBuilderTest, ResourceCleanupTest) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); } + block.set_columns(std::move(columns)); + // Add the block to the rowset Status s = rowset_writer->add_block(&block); 
ASSERT_TRUE(s.ok()) << s.to_string(); @@ -2535,6 +2561,8 @@ TEST_F(IndexBuilderTest, ArrayTypeIndexTest) { array_col.insert(Field::create_field(arr)); } + block.set_columns(std::move(columns)); + // Add block to rowset Status s = rowset_writer->add_block(&block); ASSERT_TRUE(s.ok()) << s.to_string(); @@ -2631,6 +2659,8 @@ TEST_F(IndexBuilderTest, UniqueKeysTableIndexTest) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); } + block.set_columns(std::move(columns)); + // Add the block to the rowset Status s = rowset_writer->add_block(&block); ASSERT_TRUE(s.ok()) << s.to_string(); @@ -2789,6 +2819,8 @@ TEST_F(IndexBuilderTest, HandleSingleRowsetErrorTest) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); } + block.set_columns(std::move(columns)); + // Add the block to the rowset Status s = rowset_writer->add_block(&block); ASSERT_TRUE(s.ok()) << s.to_string(); @@ -2909,6 +2941,8 @@ TEST_F(IndexBuilderTest, UpdateInvertedIndexInfoErrorTest) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); } + block.set_columns(std::move(columns)); + // Add the block to the rowset Status s = rowset_writer->add_block(&block); ASSERT_TRUE(s.ok()) << s.to_string(); @@ -3024,6 +3058,8 @@ TEST_F(IndexBuilderTest, DropOneIndexNotAffectOtherIndexesOnSameColumnTest) { columns[1]->insert_data((const char*)&k2, sizeof(k2)); } + block.set_columns(std::move(columns)); + // Add the block to the rowset Status s = rowset_writer->add_block(&block); ASSERT_TRUE(s.ok()) << s.to_string(); diff --git a/be/test/storage/index/inverted/common/inverted_index_gc_binlogs_test.cpp b/be/test/storage/index/inverted/common/inverted_index_gc_binlogs_test.cpp index 1512212d6f24d0..c9856eeaa53ec1 100644 --- a/be/test/storage/index/inverted/common/inverted_index_gc_binlogs_test.cpp +++ b/be/test/storage/index/inverted/common/inverted_index_gc_binlogs_test.cpp @@ -154,6 +154,7 @@ TEST_F(IndexGcBinglogsTest, gc_binlogs_test) { Field v1 = Field::create_field("v1"); columns[0]->insert(key); 
columns[1]->insert(v1); + block.set_columns(std::move(columns)); EXPECT_TRUE(rowset_writer->add_block(&block).ok()); EXPECT_TRUE(rowset_writer->flush().ok()); diff --git a/be/test/storage/index/inverted/compaction/util/index_compaction_utils.cpp b/be/test/storage/index/inverted/compaction/util/index_compaction_utils.cpp index 2a59fb86acc5e8..b253a7f8d2d985 100644 --- a/be/test/storage/index/inverted/compaction/util/index_compaction_utils.cpp +++ b/be/test/storage/index/inverted/compaction/util/index_compaction_utils.cpp @@ -697,6 +697,8 @@ class IndexCompactionUtils { } } + block.set_columns(std::move(columns)); + Status st = rowset_writer->add_block(&block); EXPECT_TRUE(st.ok()) << st.to_string(); st = rowset_writer->flush(); @@ -758,4 +760,4 @@ class IndexCompactionUtils { } }; -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/test/storage/rowid_conversion_test.cpp b/be/test/storage/rowid_conversion_test.cpp index 3ec611ca430cf6..0d470d1c7a7e74 100644 --- a/be/test/storage/rowid_conversion_test.cpp +++ b/be/test/storage/rowid_conversion_test.cpp @@ -205,6 +205,7 @@ class TestRowIdConversion : public testing::TestWithParamadd_block(&block); EXPECT_TRUE(s.ok()); s = rowset_writer->flush(); diff --git a/be/test/storage/segment/segment_cache_test.cpp b/be/test/storage/segment/segment_cache_test.cpp index 82bfe8242411e2..04b395f0089c89 100644 --- a/be/test/storage/segment/segment_cache_test.cpp +++ b/be/test/storage/segment/segment_cache_test.cpp @@ -198,15 +198,16 @@ static void generate_data(Block* block, int8_t k1, int16_t k2, int32_t seq) { {"2020-07-16 19:39:43", 19}, c3, nullptr, p); } int64_t c3_int = c3.to_int64(); - columns[2]->insert_data((const char*)&c3_int, sizeof(c3)); + columns[2]->insert_data((const char*)&c3_int, sizeof(c3_int)); DateV2Value c4; c4.unchecked_set_time(2022, 6, 6, 0, 0, 0, 0); uint32_t c4_int = c4.to_date_int_val(); - columns[3]->insert_data((const char*)&c4_int, sizeof(c4)); + 
columns[3]->insert_data((const char*)&c4_int, sizeof(c4_int)); int32_t c5 = seq; - columns[4]->insert_data((const char*)&c5, sizeof(c2)); + columns[4]->insert_data((const char*)&c5, sizeof(c5)); + block->set_columns(std::move(columns)); } class SegmentCacheTest : public ::testing::Test { diff --git a/be/test/storage/segment/segments_key_bounds_truncation_test.cpp b/be/test/storage/segment/segments_key_bounds_truncation_test.cpp index 0236ac98c9f7be..b9cad3c63b3eb7 100644 --- a/be/test/storage/segment/segments_key_bounds_truncation_test.cpp +++ b/be/test/storage/segment/segments_key_bounds_truncation_test.cpp @@ -188,6 +188,7 @@ class SegmentsKeyBoundsTruncationTest : public testing::Test { columns[1]->insert_data(reinterpret_cast(&const_value), sizeof(const_value)); } + block.set_columns(std::move(columns)); ret.emplace_back(std::move(block)); } return ret; diff --git a/be/test/storage/segment/variant_column_writer_reader_test.cpp b/be/test/storage/segment/variant_column_writer_reader_test.cpp index 3a644cc373f896..37b6887ab54500 100644 --- a/be/test/storage/segment/variant_column_writer_reader_test.cpp +++ b/be/test/storage/segment/variant_column_writer_reader_test.cpp @@ -82,6 +82,38 @@ static void construct_tablet_index(TabletIndexPB* tablet_index, int64_t index_id tablet_index->add_col_unique_id(col_unique_id); } +static void fill_nullable_variant_block(Block* block, + std::unordered_map* inserted_jsonstr, + variant_util::PathToNoneNullValues* path_with_size) { + MutableColumnPtr column = IColumn::mutate(block->get_by_position(0).column); + auto* nullable_object = assert_cast(column.get()); + for (int idx = 0; idx < 10; idx++) { + nullable_object->insert_default(); // insert null + { + auto column_object = nullable_object->get_nested_column_ptr(); + auto res = VariantUtil::fill_object_column_with_test_data(column_object, 80, + inserted_jsonstr); + path_with_size->insert(res.begin(), res.end()); + } + for (int j = 0; j < 80; ++j) { + Field f = 
Field::create_field(UInt8(0)); + nullable_object->get_null_map_column().insert(f); + } + nullable_object->insert_many_defaults(17); + { + auto column_object = nullable_object->get_nested_column_ptr(); + auto res = VariantUtil::fill_object_column_with_test_data(column_object, 2, + inserted_jsonstr); + path_with_size->insert(res.begin(), res.end()); + } + for (int j = 0; j < 2; ++j) { + Field f = Field::create_field(UInt8(0)); + nullable_object->get_null_map_column().insert(f); + } + } + block->replace_by_position(0, std::move(column)); +} + // MockColumnReaderCache class for testing class MockColumnReaderCache : public segment_v2::ColumnReaderCache { public: @@ -2627,28 +2659,9 @@ TEST_F(VariantColumnWriterReaderTest, test_write_data_nullable) { auto olap_data_convertor = std::make_unique(); // here is nullable variant auto block = _tablet_schema->create_block(); - auto nullable_object = assert_cast( - (*std::move(block.get_by_position(0).column)).mutate().get()); std::unordered_map inserted_jsonstr; - auto column_object = nullable_object->get_nested_column_ptr(); variant_util::PathToNoneNullValues path_with_size; - for (int idx = 0; idx < 10; idx++) { - nullable_object->insert_default(); // insert null - auto res = VariantUtil::fill_object_column_with_test_data(column_object, 80, - &inserted_jsonstr); - path_with_size.insert(res.begin(), res.end()); - for (int j = 0; j < 80; ++j) { - Field f = Field::create_field(UInt8(0)); - nullable_object->get_null_map_column_ptr()->insert(f); - } - nullable_object->insert_many_defaults(17); - res = VariantUtil::fill_object_column_with_test_data(column_object, 2, &inserted_jsonstr); - path_with_size.insert(res.begin(), res.end()); - for (int j = 0; j < 2; ++j) { - Field f = Field::create_field(UInt8(0)); - nullable_object->get_null_map_column_ptr()->insert(f); - } - } + fill_nullable_variant_block(&block, &inserted_jsonstr, &path_with_size); // sort path_with_size with value 
olap_data_convertor->add_column_data_convertor(column); olap_data_convertor->set_source_content(&block, 0, 1000); @@ -2780,28 +2793,9 @@ TEST_F(VariantColumnWriterReaderTest, test_write_data_nullable_without_finalize) auto olap_data_convertor = std::make_unique(); // here is nullable variant auto block = _tablet_schema->create_block(); - auto nullable_object = assert_cast( - (*std::move(block.get_by_position(0).column)).mutate().get()); std::unordered_map inserted_jsonstr; - auto column_object = nullable_object->get_nested_column_ptr(); variant_util::PathToNoneNullValues path_with_size; - for (int idx = 0; idx < 10; idx++) { - nullable_object->insert_default(); // insert null - auto res = VariantUtil::fill_object_column_with_test_data(column_object, 80, - &inserted_jsonstr); - path_with_size.insert(res.begin(), res.end()); - for (int j = 0; j < 80; ++j) { - Field f = Field::create_field(UInt8(0)); - nullable_object->get_null_map_column_ptr()->insert(f); - } - nullable_object->insert_many_defaults(17); - res = VariantUtil::fill_object_column_with_test_data(column_object, 2, &inserted_jsonstr); - path_with_size.insert(res.begin(), res.end()); - for (int j = 0; j < 2; ++j) { - Field f = Field::create_field(UInt8(0)); - nullable_object->get_null_map_column_ptr()->insert(f); - } - } + fill_nullable_variant_block(&block, &inserted_jsonstr, &path_with_size); // sort path_with_size with value olap_data_convertor->add_column_data_convertor(column); olap_data_convertor->set_source_content(&block, 0, 1000); @@ -2875,28 +2869,9 @@ TEST_F(VariantColumnWriterReaderTest, test_write_bm_with_finalize) { auto olap_data_convertor = std::make_unique(); // here is nullable variant auto block = _tablet_schema->create_block(); - auto nullable_object = assert_cast( - (*std::move(block.get_by_position(0).column)).mutate().get()); std::unordered_map inserted_jsonstr; - auto column_object = nullable_object->get_nested_column_ptr(); variant_util::PathToNoneNullValues path_with_size; - for 
(int idx = 0; idx < 10; idx++) { - nullable_object->insert_default(); // insert null - auto res = VariantUtil::fill_object_column_with_test_data(column_object, 80, - &inserted_jsonstr); - path_with_size.insert(res.begin(), res.end()); - for (int j = 0; j < 80; ++j) { - Field f = Field::create_field(UInt8(0)); - nullable_object->get_null_map_column_ptr()->insert(f); - } - nullable_object->insert_many_defaults(17); - res = VariantUtil::fill_object_column_with_test_data(column_object, 2, &inserted_jsonstr); - path_with_size.insert(res.begin(), res.end()); - for (int j = 0; j < 2; ++j) { - Field f = Field::create_field(UInt8(0)); - nullable_object->get_null_map_column_ptr()->insert(f); - } - } + fill_nullable_variant_block(&block, &inserted_jsonstr, &path_with_size); // sort path_with_size with value olap_data_convertor->add_column_data_convertor(column); olap_data_convertor->set_source_content(&block, 0, 1000); @@ -2970,28 +2945,9 @@ TEST_F(VariantColumnWriterReaderTest, test_write_bf_with_finalize) { auto olap_data_convertor = std::make_unique(); // here is nullable variant auto block = _tablet_schema->create_block(); - auto nullable_object = assert_cast( - (*std::move(block.get_by_position(0).column)).mutate().get()); std::unordered_map inserted_jsonstr; - auto column_object = nullable_object->get_nested_column_ptr(); variant_util::PathToNoneNullValues path_with_size; - for (int idx = 0; idx < 10; idx++) { - nullable_object->insert_default(); // insert null - auto res = VariantUtil::fill_object_column_with_test_data(column_object, 80, - &inserted_jsonstr); - path_with_size.insert(res.begin(), res.end()); - for (int j = 0; j < 80; ++j) { - Field f = Field::create_field(UInt8(0)); - nullable_object->get_null_map_column_ptr()->insert(f); - } - nullable_object->insert_many_defaults(17); - res = VariantUtil::fill_object_column_with_test_data(column_object, 2, &inserted_jsonstr); - path_with_size.insert(res.begin(), res.end()); - for (int j = 0; j < 2; ++j) { - Field f = 
Field::create_field(UInt8(0)); - nullable_object->get_null_map_column_ptr()->insert(f); - } - } + fill_nullable_variant_block(&block, &inserted_jsonstr, &path_with_size); // sort path_with_size with value olap_data_convertor->add_column_data_convertor(column); olap_data_convertor->set_source_content(&block, 0, 1000); @@ -3067,28 +3023,9 @@ TEST_F(VariantColumnWriterReaderTest, test_write_zm_with_finalize) { auto olap_data_convertor = std::make_unique(); // here is nullable variant auto block = _tablet_schema->create_block(); - auto nullable_object = assert_cast( - (*std::move(block.get_by_position(0).column)).mutate().get()); std::unordered_map inserted_jsonstr; - auto column_object = nullable_object->get_nested_column_ptr(); variant_util::PathToNoneNullValues path_with_size; - for (int idx = 0; idx < 10; idx++) { - nullable_object->insert_default(); // insert null - auto res = VariantUtil::fill_object_column_with_test_data(column_object, 80, - &inserted_jsonstr); - path_with_size.insert(res.begin(), res.end()); - for (int j = 0; j < 80; ++j) { - Field f = Field::create_field(UInt8(0)); - nullable_object->get_null_map_column_ptr()->insert(f); - } - nullable_object->insert_many_defaults(17); - res = VariantUtil::fill_object_column_with_test_data(column_object, 2, &inserted_jsonstr); - path_with_size.insert(res.begin(), res.end()); - for (int j = 0; j < 2; ++j) { - Field f = Field::create_field(UInt8(0)); - nullable_object->get_null_map_column_ptr()->insert(f); - } - } + fill_nullable_variant_block(&block, &inserted_jsonstr, &path_with_size); // sort path_with_size with value olap_data_convertor->add_column_data_convertor(column); olap_data_convertor->set_source_content(&block, 0, 1000); @@ -3164,28 +3101,9 @@ TEST_F(VariantColumnWriterReaderTest, test_write_inverted_with_finalize) { auto olap_data_convertor = std::make_unique(); // here is nullable variant auto block = _tablet_schema->create_block(); - auto nullable_object = assert_cast( - 
(*std::move(block.get_by_position(0).column)).mutate().get()); std::unordered_map inserted_jsonstr; - auto column_object = nullable_object->get_nested_column_ptr(); variant_util::PathToNoneNullValues path_with_size; - for (int idx = 0; idx < 10; idx++) { - nullable_object->insert_default(); // insert null - auto res = VariantUtil::fill_object_column_with_test_data(column_object, 80, - &inserted_jsonstr); - path_with_size.insert(res.begin(), res.end()); - for (int j = 0; j < 80; ++j) { - Field f = Field::create_field(UInt8(0)); - nullable_object->get_null_map_column_ptr()->insert(f); - } - nullable_object->insert_many_defaults(17); - res = VariantUtil::fill_object_column_with_test_data(column_object, 2, &inserted_jsonstr); - path_with_size.insert(res.begin(), res.end()); - for (int j = 0; j < 2; ++j) { - Field f = Field::create_field(UInt8(0)); - nullable_object->get_null_map_column_ptr()->insert(f); - } - } + fill_nullable_variant_block(&block, &inserted_jsonstr, &path_with_size); // sort path_with_size with value olap_data_convertor->add_column_data_convertor(column); olap_data_convertor->set_source_content(&block, 0, 1000); @@ -3712,7 +3630,7 @@ TEST_F(VariantColumnWriterReaderTest, test_nested_iter) { // fill with nullable ColumnVariant target MutableColumnPtr new_column_object1 = ColumnVariant::create(3, false); MutableColumnPtr null_object = - ColumnNullable::create(new_column_object1->assume_mutable(), ColumnUInt8::create()); + ColumnNullable::create(std::move(new_column_object1), ColumnUInt8::create()); size_t n = 1000; st = nested_iter->seek_to_ordinal(0); EXPECT_TRUE(st.ok()) << st.msg(); @@ -3723,8 +3641,8 @@ TEST_F(VariantColumnWriterReaderTest, test_nested_iter) { { // fill with nullable ColumnVariant target MutableColumnPtr new_column_object12 = ColumnVariant::create(3, false); - MutableColumnPtr null_object12 = ColumnNullable::create( - new_column_object12->assume_mutable(), ColumnUInt8::create()); + MutableColumnPtr null_object12 = + 
ColumnNullable::create(std::move(new_column_object12), ColumnUInt8::create()); st = nested_iter->seek_to_ordinal(0); EXPECT_TRUE(st.ok()) << st.msg(); st = nested_iter->next_batch(&n, null_object12, &has_null); @@ -3756,7 +3674,7 @@ TEST_F(VariantColumnWriterReaderTest, test_nested_iter) { // fill with nullable ColumnVariant target MutableColumnPtr new_column_object2 = ColumnVariant::create(3, false); MutableColumnPtr null_object2 = - ColumnNullable::create(new_column_object2->assume_mutable(), ColumnUInt8::create()); + ColumnNullable::create(std::move(new_column_object2), ColumnUInt8::create()); size_t nrows = 1000; st = nested_iter2->seek_to_ordinal(0); EXPECT_TRUE(st.ok()) << st.msg(); @@ -3866,7 +3784,7 @@ TEST_F(VariantColumnWriterReaderTest, test_nested_iter_nullable) { // fill with nullable ColumnVariant target MutableColumnPtr new_column_object1 = ColumnVariant::create(3, false); MutableColumnPtr null_object = - ColumnNullable::create(new_column_object1->assume_mutable(), ColumnUInt8::create()); + ColumnNullable::create(std::move(new_column_object1), ColumnUInt8::create()); size_t nrows = 1000; st = nested_iter->seek_to_ordinal(0); EXPECT_TRUE(st.ok()) << st.msg(); diff --git a/be/test/storage/segment/variant_util_test.cpp b/be/test/storage/segment/variant_util_test.cpp index 902bf9c843b115..9f09addabff29a 100644 --- a/be/test/storage/segment/variant_util_test.cpp +++ b/be/test/storage/segment/variant_util_test.cpp @@ -25,8 +25,11 @@ #include "common/config.h" #include "core/block/block.h" +#include "core/column/column_nullable.h" #include "core/column/column_string.h" #include "core/column/column_variant.h" +#include "core/column/column_vector.h" +#include "core/data_type/data_type_nullable.h" #include "core/data_type/data_type_variant.h" #include "core/field.h" #include "core/value/jsonb_value.h" @@ -473,6 +476,37 @@ TEST(VariantUtilTest, ParseVariantColumns_ScalarJsonStringToSubcolumns) { EXPECT_EQ(f.field.get(), 2); } +TEST(VariantUtilTest, 
ParseNullableScalarVariantDetachesNestedAlias) { + auto variant = ColumnVariant::create(0, false); + doris::VariantUtil::insert_root_scalar_field(*variant, Field::create_field(123)); + ColumnPtr variant_ptr = std::move(variant); + + auto null_map = ColumnUInt8::create(); + null_map->insert_value(0); + ColumnPtr nullable_variant = ColumnNullable::create(variant_ptr, null_map->get_ptr()); + variant_ptr.reset(); + ColumnPtr nullable_alias = nullable_variant; + + Block block; + block.insert( + {nullable_variant, make_nullable(std::make_shared(0, false)), "v"}); + + ParseConfig parse_cfg; + parse_cfg.deprecated_enable_flatten_nested = false; + parse_cfg.parse_to = ParseConfig::ParseTo::OnlySubcolumns; + Status st = + parse_and_materialize_variant_columns(block, std::vector {0}, {parse_cfg}); + EXPECT_TRUE(st.ok()) << st.to_string(); + + const auto& alias_nullable = assert_cast(*nullable_alias); + const auto& alias_variant = + assert_cast(alias_nullable.get_nested_column()); + EXPECT_TRUE(alias_variant.is_scalar_variant()); + EXPECT_EQ(alias_variant.get_root_type()->get_primitive_type(), PrimitiveType::TYPE_INT); + + EXPECT_TRUE(block.get_by_position(0).column->is_nullable()); +} + TEST(VariantUtilTest, ParseVariantColumns_DocModeBinaryToSubcolumns) { const std::vector jsons = { R"({"a":1,"b":"x"})", // diff --git a/be/test/storage/tablet/tablet_cooldown_test.cpp b/be/test/storage/tablet/tablet_cooldown_test.cpp index b919aa887834e7..acf16442537fbd 100644 --- a/be/test/storage/tablet/tablet_cooldown_test.cpp +++ b/be/test/storage/tablet/tablet_cooldown_test.cpp @@ -344,7 +344,7 @@ static void write_rowset(TabletSharedPtr* tablet, PUniqueId load_id, int64_t rep columns[1]->insert_data((const char*)&c2, sizeof(c2)); int32_t c3 = 1; - columns[2]->insert_data((const char*)&c3, sizeof(c2)); + columns[2]->insert_data((const char*)&c3, sizeof(c3)); VecDateTimeValue c4; { @@ -354,8 +354,9 @@ static void write_rowset(TabletSharedPtr* tablet, PUniqueId load_id, int64_t rep 
{"2020-07-16 19:39:43", 19}, c4, nullptr, p); } int64_t c4_int = c4.to_int64(); - columns[3]->insert_data((const char*)&c4_int, sizeof(c4)); + columns[3]->insert_data((const char*)&c4_int, sizeof(c4_int)); + block.set_columns(std::move(columns)); st = delta_writer->write(&block, {0}); ASSERT_EQ(Status::OK(), st); } diff --git a/be/test/util/bit_util_test.cpp b/be/test/util/bit_util_test.cpp index af106e3a26011c..6bebd7289182ec 100644 --- a/be/test/util/bit_util_test.cpp +++ b/be/test/util/bit_util_test.cpp @@ -63,14 +63,14 @@ TEST(BitUtil, BigEndianToHost) { void insert_true(ColumnNullable* column, size_t num = 1) { for (int i = 0; i < num; i++) { - assert_cast(column->get_nested_column_ptr().get())->insert_value(1); + assert_cast(column->get_nested_column()).insert_value(1); column->push_false_to_nullmap(1); } } void insert_false(ColumnNullable* column, size_t num = 1) { for (int i = 0; i < num; i++) { - assert_cast(column->get_nested_column_ptr().get())->insert_value(0); + assert_cast(column->get_nested_column()).insert_value(0); column->push_false_to_nullmap(1); } } @@ -102,16 +102,12 @@ TEST(BitUtil, CountZero) { insert_false(column.get(), 54); insert_true(column.get(), 1); insert_false(column.get(), 14); + const auto& nested_data = + assert_cast(column->get_nested_column()).get_data(); ASSERT_EQ( - brute_force_count_zero_num( - assert_cast(column->get_nested_column_ptr().get()) - ->get_data() - .data(), - column->get_null_map_data().data(), column->size()), - simd::count_zero_num((int8_t*)assert_cast( - column->get_nested_column_ptr().get()) - ->get_data() - .data(), + brute_force_count_zero_num(nested_data.data(), column->get_null_map_data().data(), + column->size()), + simd::count_zero_num((int8_t*)nested_data.data(), column->get_null_map_data().data(), (uint32_t)column->size())); } @@ -131,16 +127,12 @@ TEST(BitUtil, CountZero) { } } } + const auto& nested_data = + assert_cast(column->get_nested_column()).get_data(); ASSERT_EQ( - brute_force_count_zero_num( 
- assert_cast(column->get_nested_column_ptr().get()) - ->get_data() - .data(), - column->get_null_map_data().data(), column->size()), - simd::count_zero_num((int8_t*)assert_cast( - column->get_nested_column_ptr().get()) - ->get_data() - .data(), + brute_force_count_zero_num(nested_data.data(), column->get_null_map_data().data(), + column->size()), + simd::count_zero_num((int8_t*)nested_data.data(), column->get_null_map_data().data(), (uint32_t)column->size())); } } diff --git a/docs/dev/be-cow-assume-mutable-audit.md b/docs/dev/be-cow-assume-mutable-audit.md new file mode 100644 index 00000000000000..bd7e01267b5a7f --- /dev/null +++ b/docs/dev/be-cow-assume-mutable-audit.md @@ -0,0 +1,397 @@ +# BE COW assume_mutable audit + +Snapshot: 2026-05-09, branch `cow`. + +`assume_mutable()` is now an ownership assertion. A valid call must have a +nearby proof that the referenced `ColumnPtr` is exclusive. If exclusivity is not +local and obvious, mutate the owning handle and write it back: + +```cpp +auto column = IColumn::mutate(std::move(block.get_by_position(i).column)); +// mutate column +block.replace_by_position(i, std::move(column)); +``` + +For hot paths that append on every row, do not call `mutate()` per row. Keep a +real mutable owner such as `MutableBlock`/`MutableColumnPtr`, then materialize a +`Block` view only at the boundary. + +## Scan Commands + +Active call-site inventory: + +```bash +rg -n '\bassume_mutable(_ref)?\s*\(' be/src -S +``` + +Current result: 127 raw matches. These include API definitions, comments, and +the active call sites listed below. The current unique path list from the command +is fully covered by the table in this document. + +High-risk alias scan: + +```bash +rg -n 'get_columns\(\).*assume_mutable|assume_mutable\(\).*get_columns|const_cast<.*IColumn.*>\([^\n]*get_columns|get_columns\(\)\[[^\n]*\]\.get\(\)' be/src be/test -S +``` + +Current result: no remaining direct hit. 
This pattern matters because +`Block::get_columns()` copies `ColumnPtr`s and can introduce a temporary alias in +the same function before the mutable assertion. + +The direct owner-slot pattern was also scanned: + +```bash +rg -n '\.get_by_position\(.*\)\.column->assume_mutable' be/src -S +``` + +Those calls are not automatically safe; each one is classified below by its +real ownership evidence. + +## Lessons From The 2026-05-09 Recheck + +- The earlier file-level `OWNED_BLOCK` classification was too coarse. A current + scanner output block is not itself a proof if the same function first copies + its `ColumnPtr`s. +- `be/src/format/json/new_json_reader.cpp` had exactly that bug, confirmed by + the `test_hive_openx_json` external regression on the OpenX + `ignore.malformed.json=true` table: + the malformed-json helper iterated `block.get_columns()`, creating aliases, + then called `assume_mutable()`. It is now `append_null_for_malformed_json()`, + which mutates each owner slot and writes it back. Rollback paths now use + `truncate_block_to_rows()` with the same owner-writeback pattern. +- `be/src/exec/rowid_fetcher.cpp` had two unsafe patterns: appending into an + externally supplied output block with `assume_mutable()`, and writing into + `result_block.get_columns()` through `const_cast`. Both were changed to mutate + the owning slot and write back. Source scan-block columns are copied only into + stable read-only vectors. +- `be/src/core/block/block.cpp` should not be described as `OWNED_BLOCK`. + These helpers are safe because they branch on `is_exclusive()` or explicitly + clone/mutate before writeback. + +## Classification Legend + +- `LOCAL_RESULT`: the column is created locally or freshly cloned before the + call and is not published before mutation. +- `MUTATED_FIRST`: the owning `ColumnPtr` was moved through `IColumn::mutate()` + or another COW-safe producer before typed mutable access. 
+- `OWNED_OUTPUT`: the call writes into a scanner/operator/internal output block + whose caller contract is exclusive ownership, and this function does not + introduce another local alias before the call. +- `SUBCOLUMN_EXCLUSIVE`: the parent complex column is already exclusive, so its + nested columns are exclusive as part of the parent mutation path. +- `EXCLUSIVE_BRANCH`: the call is guarded by `is_exclusive()` or `use_count()==1` + and has a clone/mutate fallback for the shared case. +- `HELPER_CONTRACT`: API/helper accessors. Callers must prove ownership or use a + COW-safe mutate/writeback path. +- `COMMENT_ONLY`: not an active call site. +- `CHECKLIST_ONLY`: local checklist text, not code. + +## Active Call-Site Audit + +| File | Lines | Classification | Evidence / action | +| --- | --- | --- | --- | +| `be/src/core/AGENTS.md` | 18 | CHECKLIST_ONLY | Local review checklist. | +| `be/src/core/cow.h` | 312, 319, 326, 349, 355 | HELPER_CONTRACT | COW primitive API definitions and proxy operators. | +| `be/src/core/column/column_nullable.h` | 283, 391 | HELPER_CONTRACT | Mutable nested/null-map accessors assert subcolumn exclusivity. Callers that only own the parent through a shared `ColumnPtr` must mutate/write back first. | +| `be/src/core/column/column_array.cpp` | 66 | COMMENT_ONLY | Documents const access to avoid mutable assertion during construction. | +| `be/src/core/column/column_map.cpp` | 553 | COMMENT_ONLY | Documents const access after offsets writeback. | +| `be/src/core/column/column_nullable.cpp` | 380 | COMMENT_ONLY | Documents const nested access to avoid mutable assertion. | +| `be/src/core/block/block.cpp` | 659, 674, 737, 745, 775, 804, 823, 1106 | EXCLUSIVE_BRANCH | In-place block helpers clear/filter/shrink in place only when the column is exclusive; otherwise they clone, filter-return, or mutate/write back. This is the correct pattern for shared blocks. 
| + +| File | Lines | Classification | Evidence / action | +| --- | --- | --- | --- | +| `be/src/exprs/vruntimefilter_wrapper.cpp` | 126 | LOCAL_RESULT | `filter_column` is the runtime-filter result column passed to `change_null_to_true`. | +| `be/src/exprs/vtopn_pred.h` | 121 | LOCAL_RESULT | `result_column` is a freshly produced predicate column. | +| `be/src/exprs/vexpr_context.cpp` | 332, 375 | LOCAL_RESULT | Temporary expression result columns are cleared before reuse in the expression context. | +| `be/src/exprs/lambda_function/varray_sort_function.cpp` | 145 | OWNED_OUTPUT | The lambda block is built for the current lambda evaluation and no local `ColumnPtr` alias is introduced before mutation. | +| `be/src/exprs/lambda_function/varray_map_function.cpp` | 233, 242 | OWNED_OUTPUT | Lambda evaluation columns are local to the lambda block for this call. | +| `be/src/exprs/function/function_other_types_to_date.cpp` | 150, 154, 164, 168, 301, 305, 545, 615, 620, 1058, 1133 | LOCAL_RESULT | Function result columns are created by the execute path and filled before being returned. | +| `be/src/exprs/function/cast/cast_to_variant.h` | 97, 109, 121, 124 | LOCAL_RESULT | `col_to` is created by this cast path before defaults/null wrapping are inserted. | +| `be/src/exprs/function/function_variant_element.cpp` | 266, 268, 293 | LOCAL_RESULT | `result` is the newly created variant extraction output. | +| `be/src/exprs/function/function_variadic_arguments.h` | 64, 68, 74 | LOCAL_RESULT | `column` is created locally and assigned to the result only after writes complete. | +| `be/src/exprs/function/dictionary_util.h` | 63 | EXCLUSIVE_BRANCH | In-place filter is used only after `column->is_exclusive()`; otherwise the function replaces `column` with the clone-returning `filter()` result. | +| `be/src/exprs/function/array/function_array_with_constant.cpp` | 102 | LOCAL_RESULT | `clone` comes from `value->clone_empty()` and is filled before publication. 
| +| `be/src/exprs/function/array/function_array_aggregation.cpp` | 219, 231, 237, 443, 456, 462 | LOCAL_RESULT | Aggregate result column is the local destination passed by the array aggregation function. | +| `be/src/exprs/aggregate/aggregate_function_null_v2.h` | 217, 276 | LOCAL_RESULT | Destination nullable columns are newly created serialize/aggregate output. A former read-only source nested-column assertion was removed in this branch. | + +| File | Lines | Classification | Evidence / action | +| --- | --- | --- | --- | +| `be/src/exec/operator/operator.cpp` | 351, 358 | EXCLUSIVE_BRANCH | Projection helper steals/mutates input columns only after checking the source is exclusive; otherwise it materializes a replacement. | +| `be/src/exec/operator/aggregation_source_operator.cpp` | 549 | LOCAL_RESULT | `ptr = make_nullable(ptr, ...)` creates the nullable result before moving it into the output column list. | +| `be/src/exec/operator/distinct_streaming_aggregation_operator.cpp` | 210, 243 | LOCAL_RESULT | Key columns are locally materialized aggregate output columns. | +| `be/src/exec/operator/hashjoin_build_sink.cpp` | 189, 591 | LOCAL_RESULT | Join build helper constructs nullable columns locally before null-map mutation. | +| `be/src/exec/operator/join/process_hash_table_probe_impl.h` | 883 | OWNED_OUTPUT | Hash-join probe writes into its current output block. No `get_columns()` alias is introduced in this mutation path. | +| `be/src/exec/operator/nested_loop_join_probe_operator.h` | 52 | OWNED_OUTPUT | Macro clears local nested-loop join probe output columns. | +| `be/src/exec/operator/assert_num_rows_operator.cpp` | 94 | LOCAL_RESULT | Assertion operator creates and fills its output column locally. | +| `be/src/exec/sort/sorter.cpp` | 235, 240 | OWNED_OUTPUT | Sorter appends into its internal unsorted block. This is a long-lived sorter-owned block, not a per-row mutate path. 
| +| `be/src/exec/common/util.hpp` | 248 | SUBCOLUMN_EXCLUSIVE | Recursive helper receives a `MutableColumnPtr`; the `ColumnConst` wrapper is already mutable, so its data column is part of the same exclusive path. | +| `be/src/exec/rowid_fetcher.cpp` | 464 | LOCAL_RESULT | `result_block` is built locally from `Block(slots, request.row_locs().size())` before rowid reads. Former shared-output merge and `get_columns()`/`const_cast` paths were fixed to mutate/write back. | + +| File | Lines | Classification | Evidence / action | +| --- | --- | --- | --- | +| `be/src/format/json/new_json_reader.cpp` | 1016, 1022, 1062, 1500, 1610, 1625 | OWNED_OUTPUT | Normal JSON write/skip-bitmap paths append to the current scanner output block. The function does not copy the owner column before these calls. The previous malformed/rollback paths were changed to owner mutate/writeback helpers. | +| `be/src/format/json/new_json_reader.cpp` | 1270, 1278 | SUBCOLUMN_EXCLUSIVE | Map keys/values are subcolumns of the current exclusive map column passed into `_simdjson_write_data_to_column()`. | +| `be/src/format/column_type_convert.cpp` | 117 | LOCAL_RESULT | `_cached_src_column` is converter-owned cache state and is cleared before reuse. | +| `be/src/format/parquet/vparquet_column_reader.cpp` | 334, 377, 416, 424, 665, 673, 721, 729, 798, 806, 995 | MUTATED_FIRST | Parquet column readers mutate the destination handle or operate on reader-owned destination columns before typed mutable access. | +| `be/src/format/parquet/vparquet_column_reader.h` | 486 | OWNED_OUTPUT | Nested map reader destination column is owned by the active parquet read path. | +| `be/src/format/parquet/vparquet_group_reader.cpp` | 1061, 1071 | LOCAL_RESULT | Dictionary temporary block is local to the group reader. | +| `be/src/format/parquet/vparquet_reader.h` | 190 | OWNED_OUTPUT | TopN rowid synthesized column writes into the current parquet reader output block. 
| +| `be/src/format/parquet/parquet_column_convert.h` | 199, 239, 350 | MUTATED_FIRST | Conversion helpers mutate owning destination handles or converter-owned columns before typed access. The nullable null-map slice bug was fixed separately by copying from the appended source slice. | +| `be/src/format/parquet/parquet_column_convert.cpp` | 121 | LOCAL_RESULT | `_cached_src_physical_column` is converter-owned cache state. | +| `be/src/format/orc/vorc_reader.h` | 229 | OWNED_OUTPUT | TopN rowid synthesized column writes into the current ORC reader output block. | +| `be/src/format/orc/vorc_reader.cpp` | 2254 | MUTATED_FIRST | Schema-change conversion operates on the destination column for the current conversion path. | +| `be/src/format/orc/vorc_reader.cpp` | 3092, 3100 | LOCAL_RESULT | Dictionary temporary block is local to the ORC reader. | +| `be/src/format/table/table_format_reader.h` | 71, 106 | OWNED_OUTPUT | Partition/missing columns are filled into the current scanner output block. No local `get_columns()` alias is introduced. | +| `be/src/format/table/table_format_reader.h` | 113 | EXCLUSIVE_BRANCH | Default expression result is mutated only when `use_count()==1`; the shared case is not passed to `assume_mutable()`. | +| `be/src/format/table/es/es_http_reader.cpp` | 153 | OWNED_OUTPUT | ES reader materializes directly into the current output block column slots. | +| `be/src/format/table/iceberg_reader_mixin.h` | 162, 183 | OWNED_OUTPUT | Position delete helper writes synthesized columns into the current delete/output block. The old equality-delete `MutableBlock(&block)` missing-writeback bug has no remaining `assume_mutable()` call and is handled by `to_block()` writeback. | + +| File | Lines | Classification | Evidence / action | +| --- | --- | --- | --- | +| `be/src/storage/schema_change/schema_change.cpp` | 176, 183, 233, 383, 408 | LOCAL_RESULT | New schema-change columns/blocks are created locally before insertion or conversion. 
| +| `be/src/storage/partial_update_info.cpp` | 1010 | LOCAL_RESULT | `tmp_block` is freshly created for the sequence column before moving it into the target block. | +| `be/src/storage/segment/vertical_segment_writer.cpp` | 887, 890 | LOCAL_RESULT | Encoded default sequence value block is created locally and filled immediately. | +| `be/src/storage/segment/segment_iterator.h` | 269 | OWNED_OUTPUT | Segment iterator writes selected rows into the caller-provided read output block. This path does not create a local `ColumnPtr` alias before mutation; type-cast branch writes through `cast_column()` replacement instead. | +| `be/src/storage/segment/segment_iterator.cpp` | 2647, 2650 | LOCAL_RESULT | `_current_return_columns` is iterator-owned current batch state; converted columns replace that state after `cast_column()`. | +| `be/src/storage/segment/virtual_column_iterator.cpp` | 157 | LOCAL_RESULT | `res_col` is produced by `filter()` and immediately becomes the destination column. | +| `be/src/storage/iterator/vcollect_iterator.cpp` | 422 | LOCAL_RESULT | A clone-empty column is created from the source block before being pushed into the local mutable block. | +| `be/src/storage/iterator/vgeneric_iterators.cpp` | 174 | OWNED_OUTPUT | Merge iterator appends into its destination block for the current read. No local alias is introduced. | +| `be/src/storage/iterator/vertical_merge_iterator.cpp` | 330, 353 | OWNED_OUTPUT | Vertical merge appends into its destination block for the current read. No local alias is introduced. | +| `be/src/storage/iterator/olap_data_convertor.h` | 184 | LOCAL_RESULT | Padding column is the local conversion destination. | +| `be/src/storage/segment/variant/variant_column_writer_impl.cpp` | 1224 | COMMENT_ONLY | Documents avoiding a repeated mutable assertion. | +| `be/src/storage/segment/variant/variant_column_reader.cpp` | 1535 | COMMENT_ONLY | Commented-out code only. 
| + +## Mutate / MutableBlock Recheck + +Baseline command: + +```bash +rg -n -C 3 'MutableBlock::build_mutable_block\(|MutableBlock\s+\w+\s*\(&' be/src -S +``` + +Current conclusions: + +- `be/src/exprs/aggregate/aggregate_function_sort.h`: aggregate sort state owns + a long-lived `MutableBlock`. `add()` and `merge()` append directly to mutable + columns; `Block` is materialized only for `serialize()` and `sort_block()`. + This is the correct hot-path pattern. +- `be/src/format/table/iceberg_reader_mixin.h`: equality-delete cache merge now + writes back with `eq_file_block = mutable_block.to_block()`. +- `be/src/information_schema/*_scanner.cpp`, scanner helpers, group commit, + partial update, and tablet helper paths write back with `set_columns()`, + `swap(...to_block())`, or equivalent owner replacement. +- `be/src/load/memtable/memtable.cpp`, iterator setup, and hash/set build paths + use local clones or long-lived mutable owners; moved-from local blocks are not + read afterward. + +No second definite `MutableBlock(&block)` missing-writeback bug was found in the +current scan. + +Owner-slot mutate command: + +```bash +rg -n -C 2 'IColumn::mutate\(std::move\([^\n]*(block|Block|_result_block|result_block|out_block|output_block|in_block|tmp_block).*get_by_position' be/src -S +``` + +Current conclusions: + +- Direct helper cases such as `schema_scanner_helper.cpp`, + `schema_scanner.cpp`, skip-bitmap helpers, rowid fetcher, point query, and + file-reader resize paths mutate the owner slot and then write it back with + `replace_by_position()` or direct owner assignment. +- Mem-reuse operator paths that move several columns out at once collect them + into `MutableColumns` and then restore the block with `set_columns()` or + `swap(Block(...))`. +- Long-lived mutable-owner paths such as set-source local state deliberately + move the block columns into state-owned mutable columns; the moved-from block + is not read as the owner afterward in that function. 
+- No definite `IColumn::mutate(std::move(block->get_by_position(...).column))` + site was found where the mutated owner is later forgotten. + +## Hot-Path Mutate Audit + +Baseline commands: + +```bash +rg -n '\bIColumn::mutate\s*\(|\.mutate\s*\(\)' be/src -S +rg -n '(->|\.)mutate_columns\s*\(' be/src -S +rg -n '\bIColumn::mutate\s*\(|\.mutate\s*\(\)|(->|\.)mutate_columns\s*\(' be/src -S +``` + +Current result: 171 `IColumn::mutate()` / `std::move(*column).mutate()` +matches, plus 47 `Block::mutate_columns()` matches. `be/src/exprs/aggregate` +has no remaining COW `mutate()` matches; aggregate `add()` hot paths therefore +do not mutate per row. `AggregateFunctionSortData` remains the intended model: +keep a long-lived `MutableBlock`, append directly in `add()`, and materialize a +`Block` view only when sorting or serializing. + +The only definite hot-path issue found in this pass was JSONB row-store +deserialization: + +- `JsonbSerializeUtil::jsonb_to_block(char*)` was a single-row helper that + moved/mutated destination block columns per JSONB field. +- It is called from point query and rowid fetcher row loops. +- It now exposes `jsonb_to_columns(...)`, so hot callers mutate the destination + block once into `MutableColumns` outside the row loop and append rows through + those mutable owners. +- The old single-row `jsonb_to_block(char*)` wrapper remains for non-hot callers, + but current hot call sites use `jsonb_to_columns(...)`. + +`be/src/format/json/new_json_reader.cpp` also keeps Hive duplicate-key rollback +on an owner-slot mutate/writeback helper. This branch only executes after the +same key has already been written once in the current JSON object, so it is not +part of the normal per-column write path, and it avoids relying on a scanner +output-block ownership assumption across a rollback/rewrite operation. 
+ +Detailed grouped audit: + +| File | Lines | Hot-path conclusion | +| --- | --- | --- | +| `be/src/core/block/block.cpp` | 584, 1108 | Block-level helper. `mutate_columns()` is the API boundary; shrink path mutates only after exclusivity branch. Not row-by-row. | +| `be/src/core/block/block.h` | 392, 396, 529 | `MutableBlock` constructors and const-column recursion helper. Block-level ownership transfer, not row-by-row. | +| `be/src/core/block/column_with_type_and_name.cpp` | 131 | Column wrapper conversion helper. Not row-by-row. | +| `be/src/core/column/column.h` | 191, 586, 596 | Column COW helper implementations. API boundary. | +| `be/src/core/column/column_const.cpp` | 113 | Const-column internal mutation of owned data. Not row-by-row. | +| `be/src/core/column/column_const.h` | 298, 326 | Const-column internal materialization helpers. Not row-by-row. | +| `be/src/core/column/column_map.cpp` | 522, 523, 526, 529, 539, 540, 541, 560, 563, 566, 662, 665 | Map internal filter helpers mutate subcolumns once per column operation. Recursive value-map dedup detaches the value owner before mutation. Not row-by-row. | +| `be/src/core/column/column_map.h` | 65, 66 | Map shared-column factory preserves immutable subcolumns and validates through const access. Not row-by-row. | +| `be/src/core/column/column_nullable.cpp` | 118 | Nullable constructor internal subcolumn ownership. Not row-by-row. | +| `be/src/core/column/column_nullable.h` | 68, 70 | Nullable shared-column factory preserves immutable subcolumns and validates through const access. Not row-by-row. | +| `be/src/core/column/column_variant.cpp` | 319, 487, 495, 502, 504, 2071, 2129, 2348, 2356, 2816, 2836, 2837 | Variant internal finalize/filter/serialization helpers mutate subcolumns during a column operation. Not row-by-row COW. | +| `be/src/core/column/column_variant.h` | 328, 444 | Variant helper/finalize path. Not row-by-row. | +| `be/src/core/cow.h` | 71 | COW primitive example/comment path. API boundary. 
| +| `be/src/core/data_type/data_type_array.cpp` | 123 | Array type helper mutates nested column once. Not row-by-row. | +| `be/src/core/data_type/data_type_map.cpp` | 138, 139 | Map type helper mutates key/value nested columns once. Not row-by-row. | +| `be/src/core/data_type/data_type_struct.cpp` | 217 | Struct type helper mutates children once per column operation. Not row-by-row. | +| `be/src/exprs/function/array/function_array_utils.cpp` | 64 | Function execution block-level variant/nullable conversion. Not row-by-row. | +| `be/src/exprs/function/cast/cast_to_variant.h` | 41, 44, 170 | Cast execution mutates result/source once per vectorized block. Not aggregate-row hot. | +| `be/src/exprs/function/comparison_equal_for_null.cpp` | 194, 233 | Temporary block result extraction. Not row-by-row. | +| `be/src/exprs/function/function.cpp` | 70 | Function null-map merge mutates result null-map once per execute. Not row-by-row. | +| `be/src/exprs/function/function_bitmap.cpp` | 684 | Bitmap function mutates a local/source column once per vectorized execute. Not row-by-row. | +| `be/src/exprs/function/function_variant_element.cpp` | 325 | Variant element execution creates mutable root once. Not row-by-row. | +| `be/src/exprs/function/if.cpp` | 252, 283 | IF function reuses one selected result column per vectorized execute. Not row-by-row. | +| `be/src/exprs/table_function/udf_table_function.cpp` | 127 | UDF table-function result column ownership transfer once per produced block. Not row-by-row. | +| `be/src/exprs/table_function/vexplode.cpp` | 48 | Table-function variant column conversion once per input column. Not row-by-row. | +| `be/src/exprs/table_function/vexplode_v2.cpp` | 54 | Same as `vexplode.cpp`. | +| `be/src/exprs/vcompound_pred.h` | 212, 233, 237 | Compound predicate reuses one input/result column per vectorized execute. Not row-by-row. 
| +| `be/src/exprs/vcondition_expr.cpp` | 206, 235 | CASE/condition expression reuses selected result column per vectorized execute. Not row-by-row. | +| `be/src/exec/common/arrow_column_to_doris_column.cpp` | 103 | Arrow conversion mutates the destination column once per converted column. Not row-by-row. | +| `be/src/exec/common/data_gen_functions/vnumbers_tvf.cpp` | 52 | Data-gen source mutates output columns once per output block when memory is reused. Not row-by-row. | +| `be/src/exec/common/partition_sort_utils.cpp` | 32 | Partition-sort utility converts one stored block to mutable columns before appending. Not row-by-row. | +| `be/src/exec/common/variant_util.cpp` | 438, 2157, 2219, 2222 | Variant utility column conversion/finalization paths. Not row-by-row COW. | +| `be/src/exec/exchange/vdata_stream_sender.cpp` | 332 | Exchange sender keeps a mutable block owner while serializing/sending one block. Not row-by-row. | +| `be/src/exec/operator/aggregation_sink_operator.cpp` | 311, 500 | Aggregation sink normalizes key float values once per input block/key column. Not aggregate `add()`. | +| `be/src/exec/operator/aggregation_source_operator.cpp` | 116, 141, 246, 304, 313 | Aggregation source output path mutates reused output columns once per output batch. Not row-by-row. | +| `be/src/exec/operator/bucketed_aggregation_sink_operator.cpp` | 179 | Bucketed aggregation sink normalizes key float values once per input block/key column. Not aggregate `add()`. | +| `be/src/exec/operator/bucketed_aggregation_source_operator.cpp` | 332, 387, 475, 558 | Bucketed aggregation source mutates reused output columns once per output batch. Not row-by-row. | +| `be/src/exec/operator/dict_sink_operator.cpp` | 47 | Sink block column overflow conversion once per column. Not row-by-row. | +| `be/src/exec/operator/distinct_streaming_aggregation_operator.cpp` | 166, 220, 225, 232 | Distinct streaming aggregation mutates expression/output/cache columns once per processed block/split. 
Not row-by-row. | +| `be/src/exec/operator/exchange_sink_operator.cpp` | 513 | Exchange sink transfers current block into mutable block for serialization. Not row-by-row. | +| `be/src/exec/operator/hashjoin_build_sink.cpp` | 575, 577 | Hash-join build converts/finalizes one block column before build. Not row-by-row. | +| `be/src/exec/operator/join/process_hash_table_probe_impl.h` | 168, 657, 726 | Hash-join probe mutates output/lazy materialized columns once per output block. Not row-by-row. | +| `be/src/exec/operator/nested_loop_join_probe_operator.cpp` | 82, 104, 145, 401, 515 | Nested-loop join probe creates mutable output columns once per output block/batch section. Not row-by-row. | +| `be/src/exec/operator/partitioned_aggregation_sink_operator.cpp` | 515, 516 | Partitioned aggregation state owns key/value mutable blocks for later appends. Correct long-lived owner pattern. | +| `be/src/exec/operator/schema_scan_operator.cpp` | 261 | Schema scan copies source columns into output columns once per block. Low-volume metadata path. | +| `be/src/exec/operator/set_sink_operator.cpp` | 134 | Set sink overflow conversion once per block column. Not row-by-row. | +| `be/src/exec/operator/set_source_operator.cpp` | 117 | Set source transfers output block columns into local mutable columns once per block. Not row-by-row. | +| `be/src/exec/operator/streaming_aggregation_operator.cpp` | 334, 376, 405, 472, 496, 599 | Streaming aggregation source/sink output paths mutate reused block columns once per block/output batch. Not aggregate `add()`. | +| `be/src/exec/rowid_fetcher.cpp` | 167, 1082 | Fixed in this pass: row-store JSONB loops now mutate result columns once outside the row loop and append through `jsonb_to_columns(...)`. | +| `be/src/exec/rowid_fetcher.cpp` | 196, 943, 1103 | Non-row-store merge/read paths mutate once per destination column, then append many rows. Not row-by-row. 
| +| `be/src/exec/scan/file_scanner.cpp` | 441, 785 | File scanner mutates partition-prune/skip-bitmap helper columns once per block. Not row-by-row. | +| `be/src/exec/scan/meta_scanner.cpp` | 115 | Meta scanner mutates output columns once per block. Low-volume metadata path. | +| `be/src/exec/scan/scanner.cpp` | 219 | Scanner materialization mutates a prepared column pointer once per projection column. Not row-by-row. | +| `be/src/exec/sink/vtablet_block_convertor.cpp` | 285, 289 | Tablet sink conversion mutates temporary/result columns once per block. Not row-by-row. | +| `be/src/exec/sink/writer/vtablet_writer.cpp` | 1765 | Restores a temporary block after merge. Not row-by-row. | +| `be/src/exec/sink/writer/vtablet_writer_v2.cpp` | 625 | Same as v1 writer. | +| `be/src/format/arrow/arrow_stream_reader.cpp` | 97 | Arrow stream reader mutates output block once per batch. Not row-by-row. | +| `be/src/format/count_reader.h` | 61 | Count reader creates/mutates output columns once per batch. Not row-by-row. | +| `be/src/format/csv/csv_reader.cpp` | 446, 452 | CSV reader mutates output columns once per batch when filling rows. Not row-by-row. | +| `be/src/format/jni/jni_data_bridge.cpp` | 108 | JNI bridge mutates one destination column before writing a batch. Not row-by-row. | +| `be/src/format/json/new_json_reader.cpp` | 462, 472, 482 | Malformed-row append, row rollback, and Hive duplicate-key rollback use owner-slot mutate/writeback helpers. The duplicate-key helper is a rare rollback path, not the normal per-field write path. | +| `be/src/format/lance/lance_rust_reader.cpp` | 233 | Lance reader mutates output block once per batch. Not row-by-row. | +| `be/src/format/orc/vorc_reader.cpp` | 2055, 2073, 2145, 2216, 2218, 2253, 2263, 2860 | ORC complex/schema conversion and block resize paths mutate once per column/batch. Not row-by-row. 
| +| `be/src/format/parquet/parquet_column_convert.h` | 198, 238, 346 | Parquet conversion helpers mutate destination handles once per converted column. Not row-by-row. | +| `be/src/format/parquet/vparquet_column_reader.cpp` | 331, 413, 663, 719, 796, 994 | Parquet column reader mutates destination handles once per column chunk/batch. Not row-by-row. | +| `be/src/format/parquet/vparquet_column_reader.h` | 485 | Nested parquet column reader mutates destination once per read call. Not row-by-row. | +| `be/src/format/parquet/vparquet_group_reader.cpp` | 668 | Parquet group reader temporary resize once per block. Not row-by-row. | +| `be/src/format/table/paimon_cpp_reader.cpp` | 77, 120 | Paimon reader mutates output columns once per batch. Not row-by-row. | +| `be/src/format/table/paimon_jni_reader.cpp` | 108 | Paimon JNI reader mutates output columns once per batch. Not row-by-row. | +| `be/src/format/table/parquet_metadata_reader.cpp` | 815 | Metadata reader output reuse once per batch. Low-volume metadata path. | +| `be/src/format/table/remote_doris_reader.cpp` | 75 | Remote Doris reader mutates output columns once per batch. Not row-by-row. | +| `be/src/format/transformer/merge_partitioner.cpp` | 213 | Merge partitioner mutates a block once before partitioning. Not row-by-row. | +| `be/src/information_schema/schema_scanner.cpp` | 104, 314, 473 | Information-schema insertion helpers can run per cell, but this is a metadata path, not BE data hot path. They write back correctly. | +| `be/src/information_schema/schema_scanner_helper.cpp` | 36, 47, 59, 75, 85, 95, 105 | Same metadata-path per-cell helper pattern as `schema_scanner.cpp`; writeback is present. | +| `be/src/runtime/result_block_buffer.cpp` | 217 | Result buffer merges/mutates one block when appending query results. Not row-by-row. 
| +| `be/src/service/point_query_executor.cpp` | 503 | Fixed in this pass: point query now mutates result columns once outside the row loop and appends row-store/missing-column values through those mutable owners. | +| `be/src/storage/iterator/block_reader.cpp` | 171, 347, 480, 537, 587 | Storage reader prepares target/delete-filter columns per batch. Not row-by-row. | +| `be/src/storage/iterator/olap_data_convertor.h` | 310, 314 | OLAP convertor captures column data once per conversion batch. Not row-by-row. | +| `be/src/storage/iterator/vcollect_iterator.cpp` | 881 | Collect iterator mutates target columns once per batch. Not row-by-row. | +| `be/src/storage/iterator/vertical_block_reader.cpp` | 190, 401, 487, 488, 555 | Vertical reader prepares target/delete-filter columns per batch. Not row-by-row. | +| `be/src/storage/iterator/vgeneric_iterators.cpp` | 67 | Generic iterator mutates output columns once per batch. Not row-by-row. | +| `be/src/storage/partial_update_info.cpp` | 45, 342, 389, 418, 497, 565 | Partial-update block construction/merge mutates columns once per block. Not row-by-row. | +| `be/src/storage/segment/column_reader.cpp` | 997, 1012, 1013, 1084, 1169, 1170, 1417, 1782, 1794 | Segment complex-column readers mutate offsets/items/subcolumns once per read batch. Not row-by-row. | +| `be/src/storage/segment/segment_iterator.cpp` | 2185, 2907 | Segment iterator mutates current return columns / temporary mock column once per batch. Not row-by-row. | +| `be/src/storage/segment/variant/hierarchical_data_iterator.cpp` | 206, 228, 249, 290, 546 | Variant hierarchical reader mutates subcolumns once per read/finalize batch. Not row-by-row. | +| `be/src/storage/segment/variant/hierarchical_data_iterator.h` | 141 | Variant iterator helper mutates destination once per helper call. Not row-by-row. | +| `be/src/storage/segment/variant/variant_column_writer_impl.cpp` | 1229 | Variant writer finalization helper. Not row-by-row. 
| +| `be/src/storage/segment/variant/variant_streaming_compaction_writer.cpp` | 146 | Variant streaming compaction finalization helper. Not row-by-row. | +| `be/src/storage/segment/vertical_segment_writer.cpp` | 96 | Skip-bitmap helper mutates one block column then writes back. Not row-by-row. | +| `be/src/storage/tablet/base_tablet.cpp` | 962, 986, 1194 | Tablet row reconstruction/partial update mutates full output columns once per block. Not row-by-row. | +| `be/src/storage/tablet_info.cpp` | 563 | Partition-key helper mutates a temporary column once. Not row-by-row. | +| `be/src/util/jsonb/serialize.cpp` | 83, 159 | Wrapper paths now mutate destination columns once, call `jsonb_to_columns(...)`, and restore with `set_columns()`. Known hot callers bypass the single-row wrapper and hold `MutableColumns` across the row loop. | + +## Tests Added For Real COW Violations + +- `NewJsonReaderCowTest.AppendNullForMalformedJsonMutatesOwnerColumn` builds a + nullable column with an extra `ColumnPtr` alias, calls the malformed-json + helper, and verifies the block receives a new mutated owner while the original + shared column remains unchanged. +- `NewJsonReaderCowTest.TruncateBlockToRowsMutatesOwnerColumn` builds a shared + two-row nullable column, truncates the block, and verifies the original alias + still has two rows. +- `NewJsonReaderCowTest.PopBackLastInsertedValueMutatesOwnerColumn` builds a + shared destination column, removes the last inserted value through the JSON + rollback helper, and verifies the block gets a mutated owner while the + original alias still has both rows. +- `BlockSerializeCowTest.JsonbToBlockMutatesDestinationOwnerColumn` builds a + shared destination column, decodes JSONB rows, and verifies the destination + block gets its own mutated owner while the original shared column remains + empty. 
| + +These tests cover the exact alias mode that the external JSON regression exposed: +mutating a block-owned column while another `ColumnPtr` reference to the same +column still exists. + +## Fixed During This Audit Series + +- `AggregateFunctionSortData::{add,merge}`: removed hot-path + `assume_mutable()` calls by making the aggregate state own a `MutableBlock`. +- `AggregateFunctionNullUnary::streaming_agg_serialize_to_column`: replaced a + read-only source nested-column mutable assertion with const access. +- ORC schema-change nullable converter: `align_orc_null_map` now copies from the + appended source null-map slice instead of offset `0`. +- Parquet schema-change nullable converter: logical-source null maps with an old + destination prefix now copy from the appended logical-source slice. +- `ColumnArray::create(ColumnPtr...)`: shared-column construction keeps immutable + subcolumns shared but now reuses the same const-safe offset type and + nested-size validation as the mutable constructor. +- `ColumnNullable::create(ColumnPtr...)` and `ColumnMap::create(ColumnPtr...)`: + shared-column construction no longer deep-mutates input subcolumns, avoiding + unnecessary clones in block wrapping paths while keeping invariant checks. +- `ColumnMap::deduplicate_keys(true)`: recursive nested-map value dedup now + detaches and writes back the value owner instead of const-casting through a + shared nullable/value subcolumn. +- `ColumnMap::filter(const Filter&, ...)` and `ColumnMap::permute(...)`: these + return new columns and therefore keep input subcolumns shared/const instead of + pre-cloning whole key/value/offset columns. +- Variant materialization for nullable scalar variants: the nested variant is + taken from the already-detached nullable owner, so root finalization/conversion + cannot mutate aliases of the original nullable wrapper. 
+- JSON malformed/rollback paths: changed from `get_columns()` plus + `assume_mutable()` to owner-slot mutate/writeback helpers with focused BE UTs. +- Rowid fetcher merge and external-row readback: changed shared-output mutation + and `get_columns()`/`const_cast` destination writes to owner-slot + mutate/writeback. +- JSONB row-store decode: changed point-query and rowid-fetcher row loops from + per-row/per-field block mutation to a `MutableColumns` owner held across the + loop, with a COW unit test for shared destination columns. From 47a8eaac40bfb2d91e89f030f76de65a7250eb6b Mon Sep 17 00:00:00 2001 From: zhaochangle Date: Sun, 17 May 2026 23:08:54 +0800 Subject: [PATCH 02/11] [refactor](be) Strengthen scoped COW mutation APIs Issue Number: None Related PR: #63001 Problem Summary: The BE COW refactor makes assume_mutable an ownership assertion, but several live Block paths still expressed mutation by manually moving ColumnPtr or MutableBlock owners out and writing them back. That made it easy to leave a Block with moved-from columns on RETURN_IF_ERROR, or to reintroduce hot-path per-row mutate calls. This change codifies the new pattern: std::move(block).mutate_columns() is the stealing API for throwaway/rvalue Blocks; block.mutate_columns_scoped() is the RAII API for temporary whole-block mutation and restores on every exit path; block.mutate_column_scoped(pos) is the RAII owner-slot API for modifying one live Block column; VectorizedUtils::build_scoped_mutable_mem_reuse_block() returns a scoped guard, so callers first hold the guard and then borrow MutableBlock. The patch removes default ScopedMutableBlock construction, avoids copying schema in ScopedMutableColumns, migrates table/orc/parquet/es readers and executor paths to the scoped APIs, and keeps hot row loops on MutableBlock/MutableColumns instead of repeated mutate(). 
Tests exercise early-return restore, shared-column detach, null-slot recreation, live schema access, LocalExchanger error paths, and TableFormatReader partition/missing-column COW contracts. None - Test: - Unit Test: ./run-be-ut.sh --run --filter=BlockTest.ScopedMutableColumnsRestoreOnErrorAndDetachSharedColumn:BlockTest.ScopedMutableColumnsReadSchemaFromLiveBlock:BlockTest.ScopedMutableColumnRestoreOnErrorDetachSharedAndCreateMissingColumn:BlockTest.ScopedMutableBlockRestoreOnErrorAndDetachSharedColumn:LocalExchangerTest.ShuffleExchangerRestoreOutputBlockOnAddRowsError:TableFormatReaderTest.FillPartitionColumnRestoresSharedColumnOnDeserializeError:TableFormatReaderTest.FillMissingNullableColumnDetachesSharedBlockSlot -j 100 - Regression test: ./run-regression-test.sh --run -d external_table_p0/hive -s test_hive_openx_json - Manual test: ./build.sh --be -j 100 - Static check: git diff --check; build-support/check-format.sh; build-support/run-clang-tidy.sh --build-dir be/build_ASAN - Behavior changed: No - Does this need documentation: No --- be/src/core/block/block.cpp | 85 ++++++++++++- be/src/core/block/block.h | 120 ++++++++++++++++-- .../core/column/column_fixed_length_object.h | 4 + be/src/core/data_type/data_type_array.cpp | 4 +- be/src/core/data_type/data_type_map.cpp | 4 +- be/src/exec/common/partition_sort_utils.cpp | 14 +- be/src/exec/common/util.hpp | 16 ++- be/src/exec/exchange/local_exchanger.cpp | 45 +++---- be/src/exec/exchange/vdata_stream_sender.cpp | 2 +- .../exec/operator/cache_source_operator.cpp | 10 +- .../exec/operator/exchange_sink_operator.cpp | 2 +- .../group_commit_block_sink_operator.cpp | 2 +- be/src/exec/operator/hashjoin_build_sink.cpp | 2 +- .../exec/operator/hashjoin_probe_operator.cpp | 5 +- .../nested_loop_join_probe_operator.cpp | 16 +-- be/src/exec/operator/operator.cpp | 7 +- .../partitioned_aggregation_sink_operator.cpp | 6 +- be/src/exec/operator/repeat_operator.cpp | 12 +- be/src/exec/operator/set_sink_operator.cpp | 3 +- 
.../exec/operator/table_function_operator.cpp | 18 +-- be/src/exec/operator/union_sink_operator.h | 6 +- .../exec/operator/union_source_operator.cpp | 8 +- be/src/exec/rowid_fetcher.cpp | 8 +- be/src/exec/scan/file_scanner.cpp | 7 +- be/src/exec/scan/scanner.cpp | 7 +- be/src/exec/scan/scanner.h | 4 +- be/src/exec/sink/writer/vtablet_writer.cpp | 8 +- be/src/exec/sink/writer/vtablet_writer_v2.cpp | 8 +- be/src/exec/sort/partition_sorter.cpp | 17 +-- be/src/exec/sort/sorter.cpp | 5 +- be/src/exec/sort/vsorted_run_merger.cpp | 9 +- be/src/exprs/function/cast/cast_base.h | 8 ++ .../cast/cast_to_basic_number_common.h | 21 ++- be/src/exprs/function/cast/cast_to_boolean.h | 21 ++- be/src/exprs/function/cast/cast_to_date.h | 44 +++---- be/src/exprs/function/cast/cast_to_decimal.h | 21 ++- be/src/exprs/function/cast/cast_to_ip.h | 23 ++-- be/src/exprs/function/cast/cast_to_string.h | 9 +- be/src/format/arrow/arrow_stream_reader.cpp | 16 ++- be/src/format/column_type_convert.cpp | 16 +-- be/src/format/column_type_convert.h | 5 +- be/src/format/count_reader.h | 4 +- be/src/format/csv/csv_reader.cpp | 8 +- be/src/format/lance/lance_rust_reader.cpp | 12 +- be/src/format/orc/vorc_reader.cpp | 5 +- be/src/format/orc/vorc_reader.h | 3 +- .../format/parquet/fix_length_plain_decoder.h | 9 +- .../format/parquet/parquet_column_convert.cpp | 36 ++++-- .../format/parquet/parquet_column_convert.h | 62 +++++---- .../format/parquet/vparquet_group_reader.cpp | 8 +- be/src/format/parquet/vparquet_reader.cpp | 3 + be/src/format/parquet/vparquet_reader.h | 3 +- be/src/format/table/es/es_http_reader.cpp | 7 +- be/src/format/table/iceberg_reader_mixin.h | 14 +- be/src/format/table/paimon_cpp_reader.cpp | 16 ++- be/src/format/table/paimon_jni_reader.cpp | 4 +- .../format/table/parquet_metadata_reader.cpp | 29 ++--- be/src/format/table/remote_doris_reader.cpp | 14 +- be/src/format/table/table_format_reader.h | 11 +- .../format/transformer/merge_partitioner.cpp | 5 +- 
.../schema_active_queries_scanner.cpp | 4 +- ...ma_authentication_integrations_scanner.cpp | 4 +- .../schema_backend_active_tasks.cpp | 4 +- .../schema_backend_kerberos_ticket_cache.cpp | 4 +- ...chema_catalog_meta_cache_stats_scanner.cpp | 4 +- .../schema_database_properties_scanner.cpp | 4 +- .../schema_file_cache_statistics.cpp | 4 +- .../schema_partitions_scanner.cpp | 4 +- .../schema_role_mappings_scanner.cpp | 4 +- .../schema_sql_block_rule_status_scanner.cpp | 4 +- .../schema_table_options_scanner.cpp | 4 +- .../schema_table_properties_scanner.cpp | 4 +- ...chema_table_stream_consumption_scanner.cpp | 4 +- .../schema_table_streams_scanner.cpp | 4 +- .../schema_view_dependency_scanner.cpp | 4 +- .../schema_workload_group_privileges.cpp | 4 +- ..._workload_group_resource_usage_scanner.cpp | 4 +- .../schema_workload_groups_scanner.cpp | 4 +- .../schema_workload_sched_policy_scanner.cpp | 4 +- be/src/load/memtable/memtable.cpp | 15 ++- be/src/runtime/query_cache/query_cache.cpp | 4 +- be/src/runtime/result_block_buffer.cpp | 4 +- be/src/service/point_query_executor.cpp | 4 +- be/src/storage/iterator/block_reader.cpp | 21 ++- be/src/storage/iterator/vcollect_iterator.cpp | 12 +- .../iterator/vertical_block_reader.cpp | 22 ++-- .../storage/iterator/vgeneric_iterators.cpp | 4 +- be/src/storage/partial_update_info.cpp | 34 ++--- be/src/storage/tablet/base_tablet.cpp | 23 ++-- be/src/util/jsonb/serialize.cpp | 8 +- be/test/core/block/block_test.cpp | 104 ++++++++++++++- be/test/core/column/column_nullable_test.cpp | 11 +- be/test/core/column/common_column_test.h | 6 +- be/test/core/data_type/complex_type_test.cpp | 5 +- .../exec/common/schema_util_rowset_test.cpp | 2 +- .../exec/exchange/exchange_writer_test.cpp | 2 +- .../operator/table_function_operator_test.cpp | 6 +- .../exec/pipeline/local_exchanger_test.cpp | 87 +++++++++++++ .../aggregate/vec_count_by_enum_test.cpp | 66 +++------- .../cast/function_variant_cast_test.cpp | 51 +++++++- 
.../exprs/function/function_is_null_test.cpp | 4 +- .../native/native_reader_writer_test.cpp | 4 +- .../parquet/parquet_column_convert_test.cpp | 11 +- .../format/table/table_format_reader_test.cpp | 101 +++++++++++++++ .../delta_writer_cluster_key_test.cpp | 2 +- .../load/delta_writer/delta_writer_test.cpp | 4 +- .../memtable/memtable_memory_limiter_test.cpp | 2 +- be/test/runtime/snapshot_loader_test.cpp | 4 +- .../ordered_data_compaction_test.cpp | 4 +- .../compaction/segcompaction_mow_test.cpp | 28 ++-- .../storage/compaction/segcompaction_test.cpp | 30 ++--- .../variant_doc_mode_compaction_test.cpp | 2 +- .../compaction/vertical_compaction_test.cpp | 10 +- .../storage/index/date_bloom_filter_test.cpp | 4 +- be/test/storage/index/index_builder_test.cpp | 36 +++--- .../common/inverted_index_gc_binlogs_test.cpp | 2 +- .../util/index_compaction_utils.cpp | 2 +- .../iterator/block_reader_agg_flush_test.cpp | 3 +- be/test/storage/rowid_conversion_test.cpp | 2 +- .../storage/segment/segment_cache_test.cpp | 2 +- .../segments_key_bounds_truncation_test.cpp | 2 +- .../variant_column_writer_reader_test.cpp | 4 +- .../storage/tablet/tablet_cooldown_test.cpp | 2 +- 123 files changed, 1145 insertions(+), 588 deletions(-) diff --git a/be/src/core/block/block.cpp b/be/src/core/block/block.cpp index 9e3788a2adb040..887dd48a349679 100644 --- a/be/src/core/block/block.cpp +++ b/be/src/core/block/block.cpp @@ -576,7 +576,90 @@ Columns Block::get_columns_and_convert() { return columns; } -MutableColumns Block::mutate_columns() { +Block::ScopedMutableColumns::ScopedMutableColumns(Block& block) + : _block(&block), _columns(std::move(block).mutate_columns()) {} + +Block::ScopedMutableColumns::~ScopedMutableColumns() { + restore(); +} + +Block::ScopedMutableColumns::ScopedMutableColumns(ScopedMutableColumns&& other) noexcept + : _block(std::exchange(other._block, nullptr)), _columns(std::move(other._columns)) {} + +Block::ScopedMutableColumns& Block::ScopedMutableColumns::operator=( 
+ ScopedMutableColumns&& other) noexcept { + if (this != &other) { + restore(); + _block = std::exchange(other._block, nullptr); + _columns = std::move(other._columns); + } + return *this; +} + +const DataTypePtr& Block::ScopedMutableColumns::get_datatype_by_position(size_t position) const { + DCHECK(_block != nullptr); + return _block->get_by_position(position).type; +} + +const std::string& Block::ScopedMutableColumns::get_name_by_position(size_t position) const { + DCHECK(_block != nullptr); + return _block->get_by_position(position).name; +} + +void Block::ScopedMutableColumns::restore() { + if (_block != nullptr) { + _block->set_columns(std::move(_columns)); + _block = nullptr; + } +} + +Block::ScopedMutableColumn::ScopedMutableColumn(Block& block, size_t position) + : _block(&block), _position(position) { + DCHECK_LT(_position, _block->data.size()); + auto& column_with_type_and_name = _block->data[_position]; + DCHECK(column_with_type_and_name.type); + _column = column_with_type_and_name.column + ? 
IColumn::mutate(std::move(column_with_type_and_name.column)) + : column_with_type_and_name.type->create_column(); +} + +Block::ScopedMutableColumn::~ScopedMutableColumn() { + restore(); +} + +Block::ScopedMutableColumn::ScopedMutableColumn(ScopedMutableColumn&& other) noexcept + : _block(std::exchange(other._block, nullptr)), + _position(other._position), + _column(std::move(other._column)) {} + +Block::ScopedMutableColumn& Block::ScopedMutableColumn::operator=( + ScopedMutableColumn&& other) noexcept { + if (this != &other) { + restore(); + _block = std::exchange(other._block, nullptr); + _position = other._position; + _column = std::move(other._column); + } + return *this; +} + +void Block::ScopedMutableColumn::restore() { + if (_block != nullptr) { + DCHECK_LT(_position, _block->data.size()); + _block->data[_position].column = std::move(_column); + _block = nullptr; + } +} + +Block::ScopedMutableColumns Block::mutate_columns_scoped() & { + return ScopedMutableColumns(*this); +} + +Block::ScopedMutableColumn Block::mutate_column_scoped(size_t position) & { + return ScopedMutableColumn(*this, position); +} + +MutableColumns Block::mutate_columns() && { size_t num_columns = data.size(); MutableColumns columns(num_columns); for (size_t i = 0; i < num_columns; ++i) { diff --git a/be/src/core/block/block.h b/be/src/core/block/block.h index 535dc0ff286309..ef05274cf636a6 100644 --- a/be/src/core/block/block.h +++ b/be/src/core/block/block.h @@ -212,8 +212,60 @@ class Block { /** Get empty columns with the same types as in block. */ MutableColumns clone_empty_columns() const; - /** Get columns from block for mutation. Columns in block will be nullptr. 
*/ - MutableColumns mutate_columns(); + class ScopedMutableColumns { + public: + explicit ScopedMutableColumns(Block& block); + ~ScopedMutableColumns(); + + ScopedMutableColumns(const ScopedMutableColumns&) = delete; + ScopedMutableColumns& operator=(const ScopedMutableColumns&) = delete; + ScopedMutableColumns(ScopedMutableColumns&& other) noexcept; + ScopedMutableColumns& operator=(ScopedMutableColumns&& other) noexcept; + + MutableColumns& mutable_columns() { return _columns; } + const MutableColumns& mutable_columns() const { return _columns; } + const DataTypePtr& get_datatype_by_position(size_t position) const; + const std::string& get_name_by_position(size_t position) const; + + void restore(); + + private: + Block* _block = nullptr; + MutableColumns _columns; + }; + + class ScopedMutableColumn { + public: + ScopedMutableColumn(Block& block, size_t position); + ~ScopedMutableColumn(); + + ScopedMutableColumn(const ScopedMutableColumn&) = delete; + ScopedMutableColumn& operator=(const ScopedMutableColumn&) = delete; + ScopedMutableColumn(ScopedMutableColumn&& other) noexcept; + ScopedMutableColumn& operator=(ScopedMutableColumn&& other) noexcept; + + MutableColumnPtr& mutable_column() { return _column; } + const MutableColumnPtr& mutable_column() const { return _column; } + + void restore(); + + private: + Block* _block = nullptr; + size_t _position = 0; + MutableColumnPtr _column; + }; + + /** Get columns from a consumed block for mutation. Columns in block will be nullptr. */ + MutableColumns mutate_columns() &&; + MutableColumns mutate_columns() & = delete; + + /** Get columns from a live block for mutation and restore them on every exit path. */ + ScopedMutableColumns mutate_columns_scoped() &; + ScopedMutableColumns mutate_columns_scoped() && = delete; + + /** Get one column from a live block for mutation and restore it on every exit path. 
*/ + ScopedMutableColumn mutate_column_scoped(size_t position) &; + ScopedMutableColumn mutate_column_scoped(size_t position) && = delete; /** Replace columns in a block */ void set_columns(MutableColumns&& columns); @@ -382,25 +434,30 @@ class MutableBlock { std::vector _names; public: - static MutableBlock build_mutable_block(Block* block) { - return block == nullptr ? MutableBlock() : MutableBlock(block); + static MutableBlock build_mutable_block(Block&& block) { + return MutableBlock(std::move(block)); } + static MutableBlock build_mutable_block(std::nullptr_t) { return MutableBlock(); } + static MutableBlock build_mutable_block(Block* block) = delete; MutableBlock() = default; ~MutableBlock() = default; + MutableBlock(const MutableBlock&) = delete; + MutableBlock& operator=(const MutableBlock&) = delete; + MutableBlock(MutableBlock&& m_block) noexcept + : _columns(std::move(m_block._columns)), + _data_types(std::move(m_block._data_types)), + _names(std::move(m_block._names)) {} - MutableBlock(Block* block) - : _columns(block->mutate_columns()), - _data_types(block->get_data_types()), - _names(block->get_names()) {} MutableBlock(Block&& block) - : _columns(block.mutate_columns()), + : _columns(std::move(block).mutate_columns()), _data_types(block.get_data_types()), _names(block.get_names()) {} - void operator=(MutableBlock&& m_block) { + MutableBlock& operator=(MutableBlock&& m_block) noexcept { _columns = std::move(m_block._columns); _data_types = std::move(m_block._data_types); _names = std::move(m_block._names); + return *this; } size_t rows() const; @@ -409,6 +466,7 @@ class MutableBlock { bool empty() const { return rows() == 0; } MutableColumns& mutable_columns() { return _columns; } + const MutableColumns& mutable_columns() const { return _columns; } void set_mutable_columns(MutableColumns&& columns) { _columns = std::move(columns); } @@ -605,6 +663,48 @@ class MutableBlock { std::string dump_names() const; }; +class ScopedMutableBlock { +public: + 
ScopedMutableBlock() = delete; + explicit ScopedMutableBlock(Block* block) : _block(block) { + DCHECK(_block != nullptr); + _mutable_block = MutableBlock(std::move(*_block)); + } + ~ScopedMutableBlock() { restore(); } + + ScopedMutableBlock(const ScopedMutableBlock&) = delete; + ScopedMutableBlock& operator=(const ScopedMutableBlock&) = delete; + + ScopedMutableBlock(ScopedMutableBlock&& other) noexcept + : _block(std::exchange(other._block, nullptr)), + _mutable_block(std::move(other._mutable_block)) {} + + ScopedMutableBlock& operator=(ScopedMutableBlock&& other) noexcept { + if (this != &other) { + restore(); + _block = std::exchange(other._block, nullptr); + _mutable_block = std::move(other._mutable_block); + } + return *this; + } + + MutableBlock& mutable_block() { return _mutable_block; } + const MutableBlock& mutable_block() const { return _mutable_block; } + MutableColumns& mutable_columns() { return _mutable_block.mutable_columns(); } + const MutableColumns& mutable_columns() const { return _mutable_block.mutable_columns(); } + + void restore() { + if (_block != nullptr) { + _block->set_columns(std::move(_mutable_block.mutable_columns())); + _block = nullptr; + } + } + +private: + Block* _block = nullptr; + MutableBlock _mutable_block; +}; + struct IteratorRowRef { std::shared_ptr block; int row_pos; diff --git a/be/src/core/column/column_fixed_length_object.h b/be/src/core/column/column_fixed_length_object.h index 0a00aa0bcf9e8e..3789eeb868d150 100644 --- a/be/src/core/column/column_fixed_length_object.h +++ b/be/src/core/column/column_fixed_length_object.h @@ -119,6 +119,10 @@ class ColumnFixedLengthObject final : public COWHelper(&_data[n * _item_size]), _item_size}; } + StringRef get_raw_data() const override { + return {reinterpret_cast(_data.data()), _data.size()}; + } + void insert(const Field& x) override { DCHECK_EQ(x.get().length(), _item_size); insert_data(x.get().data(), _item_size); diff --git a/be/src/core/data_type/data_type_array.cpp 
b/be/src/core/data_type/data_type_array.cpp index b0be7d2a51c6fe..38bb9711347e61 100644 --- a/be/src/core/data_type/data_type_array.cpp +++ b/be/src/core/data_type/data_type_array.cpp @@ -120,15 +120,17 @@ const char* DataTypeArray::deserialize(const char* buf, MutableColumnPtr* column buf = deserialize_const_flag_and_row_num(buf, column, &real_have_saved_num); auto* data_column = assert_cast(origin_column); - auto& offsets = data_column->get_offsets(); // offsets + auto offsets_column = std::move(*data_column->get_offsets_ptr()).mutate(); + auto& offsets = assert_cast(*offsets_column).get_data(); offsets.resize(real_have_saved_num); memcpy(offsets.data(), buf, sizeof(ColumnArray::Offset64) * real_have_saved_num); buf += sizeof(ColumnArray::Offset64) * real_have_saved_num; // children auto nested_column = std::move(*data_column->get_data_ptr()).mutate(); buf = get_nested_type()->deserialize(buf, &nested_column, be_exec_version); + data_column->get_offsets_ptr() = std::move(offsets_column); data_column->get_data_ptr() = std::move(nested_column); return buf; } diff --git a/be/src/core/data_type/data_type_map.cpp b/be/src/core/data_type/data_type_map.cpp index 043fd7a70248f3..c0292526701531 100644 --- a/be/src/core/data_type/data_type_map.cpp +++ b/be/src/core/data_type/data_type_map.cpp @@ -129,8 +129,9 @@ const char* DataTypeMap::deserialize(const char* buf, MutableColumnPtr* column, buf = deserialize_const_flag_and_row_num(buf, column, &real_have_saved_num); auto* map_column = assert_cast(origin_column); - auto& map_offsets = map_column->get_offsets(); // offsets + auto offsets_column = std::move(*map_column->get_offsets_ptr()).mutate(); + auto& map_offsets = assert_cast(*offsets_column).get_data(); map_offsets.resize(real_have_saved_num); memcpy(map_offsets.data(), buf, sizeof(ColumnArray::Offset64) * real_have_saved_num); buf += sizeof(ColumnArray::Offset64) * real_have_saved_num; @@ -139,6 +140,7 @@ const char* DataTypeMap::deserialize(const char* buf, 
MutableColumnPtr* column, auto nested_values_column = std::move(*map_column->get_values_ptr()).mutate(); buf = get_key_type()->deserialize(buf, &nested_keys_column, be_exec_version); buf = get_value_type()->deserialize(buf, &nested_values_column, be_exec_version); + map_column->get_offsets_ptr() = std::move(offsets_column); map_column->get_keys_ptr() = std::move(nested_keys_column); map_column->get_values_ptr() = std::move(nested_values_column); return buf; diff --git a/be/src/exec/common/partition_sort_utils.cpp b/be/src/exec/common/partition_sort_utils.cpp index 09f834532d5940..ed042b1686dcbe 100644 --- a/be/src/exec/common/partition_sort_utils.cpp +++ b/be/src/exec/common/partition_sort_utils.cpp @@ -28,13 +28,15 @@ Status PartitionBlocks::append_block_by_selector(const Block* input_block, bool _blocks.push_back(Block::create_unique( VectorizedUtils::create_empty_block(_partition_sort_info->_row_desc))); } - auto columns = input_block->get_columns(); - auto mutable_columns = _blocks.back()->mutate_columns(); - DCHECK(columns.size() == mutable_columns.size()); - for (int i = 0; i < mutable_columns.size(); ++i) { - columns[i]->append_data_by_selector(mutable_columns[i], _selector); + { + auto columns = input_block->get_columns(); + auto mutable_columns_guard = _blocks.back()->mutate_columns_scoped(); + auto& mutable_columns = mutable_columns_guard.mutable_columns(); + DCHECK(columns.size() == mutable_columns.size()); + for (int i = 0; i < mutable_columns.size(); ++i) { + columns[i]->append_data_by_selector(mutable_columns[i], _selector); + } } - _blocks.back()->set_columns(std::move(mutable_columns)); _init_rows = _init_rows - selector_rows; _current_input_rows = _current_input_rows + selector_rows; _selector.clear(); diff --git a/be/src/exec/common/util.hpp b/be/src/exec/common/util.hpp index a729142ce92239..477005f709d003 100644 --- a/be/src/exec/common/util.hpp +++ b/be/src/exec/common/util.hpp @@ -36,22 +36,24 @@ class VectorizedUtils { // Block block; return 
create_columns_with_type_and_name(row_desc); } - static MutableBlock build_mutable_mem_reuse_block(Block* block, const RowDescriptor& row_desc) { + static ScopedMutableBlock build_scoped_mutable_mem_reuse_block(Block* block, + const RowDescriptor& row_desc) { if (!block->mem_reuse()) { MutableBlock tmp(VectorizedUtils::create_columns_with_type_and_name(row_desc)); block->swap(tmp.to_block()); } - return MutableBlock::build_mutable_block(block); + return ScopedMutableBlock(block); } - static MutableBlock build_mutable_mem_reuse_block(Block* block, const Block& other) { + static ScopedMutableBlock build_scoped_mutable_mem_reuse_block(Block* block, + const Block& other) { if (!block->mem_reuse()) { MutableBlock tmp(other.clone_empty()); block->swap(tmp.to_block()); } - return MutableBlock::build_mutable_block(block); + return ScopedMutableBlock(block); } - static MutableBlock build_mutable_mem_reuse_block(Block* block, - const std::vector& slots) { + static ScopedMutableBlock build_scoped_mutable_mem_reuse_block( + Block* block, const std::vector& slots) { if (!block->mem_reuse()) { size_t column_size = slots.size(); MutableColumns columns(column_size); @@ -65,7 +67,7 @@ class VectorizedUtils { slot_desc->col_name())); } } - return MutableBlock(block); + return ScopedMutableBlock(block); } static ColumnsWithTypeAndName create_columns_with_type_and_name(const RowDescriptor& row_desc) { diff --git a/be/src/exec/exchange/local_exchanger.cpp b/be/src/exec/exchange/local_exchanger.cpp index a248940dc63c81..c83a2c9cecb63a 100644 --- a/be/src/exec/exchange/local_exchanger.cpp +++ b/be/src/exec/exchange/local_exchanger.cpp @@ -146,9 +146,12 @@ void ShuffleExchanger::close(SourceInfo&& source_info) { Status ShuffleExchanger::get_block(RuntimeState* state, Block* block, bool* eos, Profile&& profile, SourceInfo&& source_info) { PartitionedBlock partitioned_block; - MutableBlock mutable_block; - - auto get_data = [&]() -> Status { + if (_dequeue_data(source_info.local_state, 
partitioned_block, eos, block, + source_info.channel_id)) { + SCOPED_TIMER(profile.copy_data_timer); + auto scoped_mutable_block = VectorizedUtils::build_scoped_mutable_mem_reuse_block( + block, partitioned_block.first->_data_block); + auto& mutable_block = scoped_mutable_block.mutable_block(); do { const auto* offset_start = partitioned_block.second.row_idxs->data() + partitioned_block.second.offset_start; @@ -158,16 +161,6 @@ Status ShuffleExchanger::get_block(RuntimeState* state, Block* block, bool* eos, } while (mutable_block.rows() < state->batch_size() && !*eos && _dequeue_data(source_info.local_state, partitioned_block, eos, block, source_info.channel_id)); - return Status::OK(); - }; - - if (_dequeue_data(source_info.local_state, partitioned_block, eos, block, - source_info.channel_id)) { - SCOPED_TIMER(profile.copy_data_timer); - mutable_block = VectorizedUtils::build_mutable_mem_reuse_block( - block, partitioned_block.first->_data_block); - RETURN_IF_ERROR(get_data()); - block->set_columns(std::move(mutable_block.mutable_columns())); } return Status::OK(); } @@ -420,13 +413,13 @@ Status BroadcastExchanger::get_block(RuntimeState* state, Block* block, bool* eo if (_dequeue_data(source_info.local_state, partitioned_block, eos, block, source_info.channel_id)) { SCOPED_TIMER(profile.copy_data_timer); - MutableBlock mutable_block = VectorizedUtils::build_mutable_mem_reuse_block( + auto scoped_mutable_block = VectorizedUtils::build_scoped_mutable_mem_reuse_block( block, partitioned_block.first->_data_block); + auto& mutable_block = scoped_mutable_block.mutable_block(); auto block_wrapper = partitioned_block.first; RETURN_IF_ERROR(mutable_block.add_rows(&block_wrapper->_data_block, partitioned_block.second.offset_start, partitioned_block.second.length)); - block->set_columns(std::move(mutable_block.mutable_columns())); } return Status::OK(); @@ -542,9 +535,12 @@ Status AdaptivePassthroughExchanger::get_block(RuntimeState* state, Block* block return Status::OK(); 
} PartitionedBlock partitioned_block; - MutableBlock mutable_block; - - auto get_data = [&]() -> Status { + if (_dequeue_data(source_info.local_state, partitioned_block, eos, block, + source_info.channel_id)) { + SCOPED_TIMER(profile.copy_data_timer); + auto scoped_mutable_block = VectorizedUtils::build_scoped_mutable_mem_reuse_block( + block, partitioned_block.first->_data_block); + auto& mutable_block = scoped_mutable_block.mutable_block(); do { if (partitioned_block.second.row_idxs == nullptr) { // The passthrough path which means the block is not partitioned, we can directly move the block without copying. @@ -554,6 +550,7 @@ Status AdaptivePassthroughExchanger::get_block(RuntimeState* state, Block* block _tmp_eos[source_info.channel_id] = *eos; *eos = false; } else { + scoped_mutable_block.restore(); *block = std::move(partitioned_block.first->_data_block); } break; @@ -566,18 +563,6 @@ Status AdaptivePassthroughExchanger::get_block(RuntimeState* state, Block* block } while (mutable_block.rows() < state->batch_size() && !*eos && _dequeue_data(source_info.local_state, partitioned_block, eos, block, source_info.channel_id)); - return Status::OK(); - }; - - if (_dequeue_data(source_info.local_state, partitioned_block, eos, block, - source_info.channel_id)) { - SCOPED_TIMER(profile.copy_data_timer); - mutable_block = VectorizedUtils::build_mutable_mem_reuse_block( - block, partitioned_block.first->_data_block); - RETURN_IF_ERROR(get_data()); - if (mutable_block.rows() > 0) { - block->set_columns(std::move(mutable_block.mutable_columns())); - } } return Status::OK(); } diff --git a/be/src/exec/exchange/vdata_stream_sender.cpp b/be/src/exec/exchange/vdata_stream_sender.cpp index 04e68aeb136d13..72767deac23456 100644 --- a/be/src/exec/exchange/vdata_stream_sender.cpp +++ b/be/src/exec/exchange/vdata_stream_sender.cpp @@ -329,7 +329,7 @@ Status BlockSerializer::_serialize_block(PBlock* dest, size_t num_receivers) { reset_block(); } else { block.clear_column_data(); - 
_mutable_block->set_mutable_columns(block.mutate_columns()); + _mutable_block->set_mutable_columns(std::move(block).mutate_columns()); } } diff --git a/be/src/exec/operator/cache_source_operator.cpp b/be/src/exec/operator/cache_source_operator.cpp index 06731ff8ed54c0..6f2dc9e084e6c9 100644 --- a/be/src/exec/operator/cache_source_operator.cpp +++ b/be/src/exec/operator/cache_source_operator.cpp @@ -156,9 +156,10 @@ Status CacheSourceOperatorX::get_block(RuntimeState* state, Block* block, bool* if (need_clone_empty) { *block = output_block->clone_empty(); } - auto mutable_block = MutableBlock::build_mutable_block(block); + ScopedMutableBlock scoped_mutable_block(block); + auto& mutable_block = scoped_mutable_block.mutable_block(); RETURN_IF_ERROR(mutable_block.merge(*output_block)); - block->set_columns(std::move(mutable_block.mutable_columns())); + scoped_mutable_block.restore(); local_state._current_query_cache_rows += output_block->rows(); auto mem_consume = output_block->allocated_bytes(); local_state._current_query_cache_bytes += mem_consume; @@ -181,9 +182,10 @@ Status CacheSourceOperatorX::get_block(RuntimeState* state, Block* block, bool* if (need_clone_empty) { *block = hit_cache_block->clone_empty(); } - auto mutable_block = MutableBlock::build_mutable_block(block); + ScopedMutableBlock scoped_mutable_block(block); + auto& mutable_block = scoped_mutable_block.mutable_block(); RETURN_IF_ERROR(mutable_block.merge(*hit_cache_block)); - block->set_columns(std::move(mutable_block.mutable_columns())); + scoped_mutable_block.restore(); if (!local_state._hit_cache_column_orders.empty()) { auto datas = block->get_columns_with_type_and_name(); block->clear(); diff --git a/be/src/exec/operator/exchange_sink_operator.cpp b/be/src/exec/operator/exchange_sink_operator.cpp index 35698f5217d709..e65dd979ad21ea 100644 --- a/be/src/exec/operator/exchange_sink_operator.cpp +++ b/be/src/exec/operator/exchange_sink_operator.cpp @@ -509,7 +509,7 @@ Status 
ExchangeSinkOperatorX::sink(RuntimeState* state, Block* block, bool eos) } else { cur_block.clear_column_data(); local_state._serializer.get_block()->set_mutable_columns( - cur_block.mutate_columns()); + std::move(cur_block).mutate_columns()); } } } diff --git a/be/src/exec/operator/group_commit_block_sink_operator.cpp b/be/src/exec/operator/group_commit_block_sink_operator.cpp index f29029ead7cc26..a72755720d5b77 100644 --- a/be/src/exec/operator/group_commit_block_sink_operator.cpp +++ b/be/src/exec/operator/group_commit_block_sink_operator.cpp @@ -372,7 +372,7 @@ Status GroupCommitBlockSinkOperatorX::sink(RuntimeState* state, Block* input_blo if (local_state._block_convertor->num_filtered_rows() > 0 || local_state._has_filtered_rows) { auto cloneBlock = block->clone_without_columns(); - auto res_block = MutableBlock::build_mutable_block(&cloneBlock); + auto res_block = MutableBlock::build_mutable_block(std::move(cloneBlock)); for (int i = 0; i < rows; ++i) { if (local_state._block_convertor->filter_map()[i]) { continue; diff --git a/be/src/exec/operator/hashjoin_build_sink.cpp b/be/src/exec/operator/hashjoin_build_sink.cpp index 4a35b07b8e7222..3071e5e53225e5 100644 --- a/be/src/exec/operator/hashjoin_build_sink.cpp +++ b/be/src/exec/operator/hashjoin_build_sink.cpp @@ -832,7 +832,7 @@ Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, Block* in_block, bo *local_state._build_expr_call_timer, local_state._build_col_ids)); local_state._build_side_mutable_block = - MutableBlock::build_mutable_block(&tmp_build_block); + MutableBlock::build_mutable_block(std::move(tmp_build_block)); } if (!in_block->empty()) { diff --git a/be/src/exec/operator/hashjoin_probe_operator.cpp b/be/src/exec/operator/hashjoin_probe_operator.cpp index 9b913cc9b23451..ea4b812323a9e6 100644 --- a/be/src/exec/operator/hashjoin_probe_operator.cpp +++ b/be/src/exec/operator/hashjoin_probe_operator.cpp @@ -232,7 +232,8 @@ Status HashJoinProbeOperatorX::pull(doris::RuntimeState* state, 
Block* output_bl local_state._join_block.clear_column_data(); - MutableBlock mutable_join_block(&local_state._join_block); + ScopedMutableBlock scoped_mutable_join_block(&local_state._join_block); + auto& mutable_join_block = scoped_mutable_join_block.mutable_block(); Block temp_block; Status st; @@ -313,8 +314,8 @@ Status HashJoinProbeOperatorX::pull(doris::RuntimeState* state, Block* output_bl state, output_block, eos, &temp_block, !local_state._shared_state->left_semi_direct_return)); // Here make _join_block release the columns' ptr + scoped_mutable_join_block.restore(); local_state._join_block.set_columns(local_state._join_block.clone_empty_columns()); - mutable_join_block.clear(); return Status::OK(); } diff --git a/be/src/exec/operator/nested_loop_join_probe_operator.cpp b/be/src/exec/operator/nested_loop_join_probe_operator.cpp index 7fdb3f61bd63c7..90d580e2234733 100644 --- a/be/src/exec/operator/nested_loop_join_probe_operator.cpp +++ b/be/src/exec/operator/nested_loop_join_probe_operator.cpp @@ -162,7 +162,8 @@ void NestedLoopJoinProbeLocalState::_reset_with_next_probe_row() { void process_probe_block(int64_t probe_block_pos, Block& block, const Block& probe_block, size_t probe_side_columns, const Block& build_block, size_t build_side_columns) { - auto dst_columns = block.mutate_columns(); + auto dst_columns_guard = block.mutate_columns_scoped(); + auto& dst_columns = dst_columns_guard.mutable_columns(); const size_t max_added_rows = build_block.rows(); for (size_t i = 0; i < probe_side_columns; ++i) { const ColumnWithTypeAndName& src_column = probe_block.get_by_position(i); @@ -197,13 +198,13 @@ void process_probe_block(int64_t probe_block_pos, Block& block, const Block& pro max_added_rows); } } - block.set_columns(std::move(dst_columns)); } void process_build_block(int64_t build_block_pos, Block& block, const Block& build_block, size_t build_side_columns, const Block& probe_block, size_t probe_side_columns) { - auto dst_columns = 
block.mutate_columns(); + auto dst_columns_guard = block.mutate_columns_scoped(); + auto& dst_columns = dst_columns_guard.mutable_columns(); const size_t max_added_rows = probe_block.rows(); for (size_t i = 0; i < probe_side_columns; ++i) { const ColumnWithTypeAndName& src_column = probe_block.get_by_position(i); @@ -237,7 +238,6 @@ void process_build_block(int64_t build_block_pos, Block& block, const Block& bui build_block_pos, max_added_rows); } } - block.set_columns(std::move(dst_columns)); } void NestedLoopJoinProbeLocalState::_replace_lazy_placeholder_columns(size_t rows) { @@ -983,7 +983,8 @@ template // NOLINTNEXTLINE(readability-function-size,readability-function-cognitive-complexity): existing finalization handles multiple join variants. void NestedLoopJoinProbeLocalState::_finalize_current_phase(Block& block, size_t batch_size) { auto& p = _parent->cast(); - auto dst_columns = block.mutate_columns(); + auto dst_columns_guard = block.mutate_columns_scoped(); + auto& dst_columns = dst_columns_guard.mutable_columns(); DCHECK_GT(dst_columns.size(), 0); auto column_size = dst_columns[0]->size(); if constexpr (BuildSide) { @@ -1092,12 +1093,12 @@ void NestedLoopJoinProbeLocalState::_finalize_current_phase(Block& block, size_t } } } - block.set_columns(std::move(dst_columns)); } void NestedLoopJoinProbeLocalState::_append_probe_data_with_null(Block& block) const { auto& p = _parent->cast(); - auto dst_columns = block.mutate_columns(); + auto dst_columns_guard = block.mutate_columns_scoped(); + auto& dst_columns = dst_columns_guard.mutable_columns(); DCHECK(p._is_mark_join); for (size_t i = 0; i < p._num_probe_side_columns; ++i) { const ColumnWithTypeAndName& src_column = _child_block->get_by_position(i); @@ -1123,7 +1124,6 @@ void NestedLoopJoinProbeLocalState::_append_probe_data_with_null(Block& block) c } auto& mark_column = *dst_columns[dst_columns.size() - 1]; ColumnFilterHelper(mark_column).resize_fill(mark_column.size() + _probe_side_process_count, 0); - 
block.set_columns(std::move(dst_columns)); } NestedLoopJoinProbeOperatorX::NestedLoopJoinProbeOperatorX(ObjectPool* pool, const TPlanNode& tnode, diff --git a/be/src/exec/operator/operator.cpp b/be/src/exec/operator/operator.cpp index f1a4734bcedb4d..6f7bd22539aaeb 100644 --- a/be/src/exec/operator/operator.cpp +++ b/be/src/exec/operator/operator.cpp @@ -360,8 +360,9 @@ Status OperatorXBase::do_projections(RuntimeState* state, Block* origin_block, } }; - MutableBlock mutable_block = - VectorizedUtils::build_mutable_mem_reuse_block(output_block, *_output_row_descriptor); + auto scoped_mutable_block = VectorizedUtils::build_scoped_mutable_mem_reuse_block( + output_block, *_output_row_descriptor); + auto& mutable_block = scoped_mutable_block.mutable_block(); auto& mutable_columns = mutable_block.mutable_columns(); if (rows != 0) { DCHECK_EQ(mutable_columns.size(), local_state->_projections.size()) << debug_string(); @@ -380,8 +381,6 @@ Status OperatorXBase::do_projections(RuntimeState* state, Block* origin_block, } DCHECK(mutable_block.rows() == rows); } - output_block->set_columns(std::move(mutable_columns)); - local_state->_estimate_memory_usage += bytes_usage; return Status::OK(); diff --git a/be/src/exec/operator/partitioned_aggregation_sink_operator.cpp b/be/src/exec/operator/partitioned_aggregation_sink_operator.cpp index 07c96959d98344..42739b9b2acf77 100644 --- a/be/src/exec/operator/partitioned_aggregation_sink_operator.cpp +++ b/be/src/exec/operator/partitioned_aggregation_sink_operator.cpp @@ -512,8 +512,10 @@ void PartitionedAggSinkLocalState::_reset_tmp_data() { _value_columns.clear(); _key_block.clear_column_data(); _value_block.clear_column_data(); - _key_columns = _key_block.mutate_columns(); - _value_columns = _value_block.mutate_columns(); + // _key_columns/_value_columns own the mutable storage until the next reset. The schema blocks + // are used only as empty reusable owners here, so consuming their columns is intentional. 
+ _key_columns = std::move(_key_block).mutate_columns(); + _value_columns = std::move(_value_block).mutate_columns(); } void PartitionedAggSinkLocalState::_clear_tmp_data() { diff --git a/be/src/exec/operator/repeat_operator.cpp b/be/src/exec/operator/repeat_operator.cpp index b0aa6989a35f34..b659052187b17a 100644 --- a/be/src/exec/operator/repeat_operator.cpp +++ b/be/src/exec/operator/repeat_operator.cpp @@ -112,7 +112,9 @@ Status RepeatLocalState::get_repeated_block(Block* input_block, int repeat_id_id size_t input_column_size = input_block->columns(); size_t output_column_size = p._output_slots.size(); DCHECK_LT(input_column_size, output_column_size); - auto m_block = VectorizedUtils::build_mutable_mem_reuse_block(output_block, p._output_slots); + auto scoped_mutable_block = + VectorizedUtils::build_scoped_mutable_mem_reuse_block(output_block, p._output_slots); + auto& m_block = scoped_mutable_block.mutable_block(); auto& output_columns = m_block.mutable_columns(); /* Fill all slots according to child, for example:select tc1,tc2,sum(tc3) from t1 group by grouping sets((tc1),(tc2)); * insert into t1 values(1,2,1),(1,3,1),(2,1,1),(3,1,1); @@ -154,8 +156,6 @@ Status RepeatLocalState::get_repeated_block(Block* input_block, int repeat_id_id RETURN_IF_ERROR(add_grouping_id_column(rows, cur_col, output_columns, repeat_id_idx)); DCHECK_EQ(cur_col, output_column_size); - output_block->set_columns(std::move(m_block.mutable_columns())); - return Status::OK(); } @@ -230,15 +230,15 @@ Status RepeatOperatorX::pull(doris::RuntimeState* state, Block* output_block, bo _repeat_id_idx = 0; } } else if (local_state._expr_ctxs.empty()) { - auto m_block = - VectorizedUtils::build_mutable_mem_reuse_block(output_block, _output_slots); + auto scoped_mutable_block = VectorizedUtils::build_scoped_mutable_mem_reuse_block( + output_block, _output_slots); + auto& m_block = scoped_mutable_block.mutable_block(); auto rows = _child_block.rows(); auto& columns = m_block.mutable_columns(); 
std::size_t cur_col = 0; RETURN_IF_ERROR( local_state.add_grouping_id_column(rows, cur_col, columns, _repeat_id_idx)); - output_block->set_columns(std::move(m_block.mutable_columns())); _repeat_id_idx++; if (_repeat_id_idx >= _repeat_id_list_size) { diff --git a/be/src/exec/operator/set_sink_operator.cpp b/be/src/exec/operator/set_sink_operator.cpp index 608bd8c1b69e58..ec2c717c2cf9f8 100644 --- a/be/src/exec/operator/set_sink_operator.cpp +++ b/be/src/exec/operator/set_sink_operator.cpp @@ -80,7 +80,8 @@ Status SetSinkOperatorX::sink(RuntimeState* state, Block* in_block if (in_block->rows() != 0) { if (local_state._mutable_block.empty()) { auto tmp_build_block = *(in_block->create_same_struct_block(0, false)); - local_state._mutable_block = MutableBlock::build_mutable_block(&tmp_build_block); + local_state._mutable_block = + MutableBlock::build_mutable_block(std::move(tmp_build_block)); } { diff --git a/be/src/exec/operator/table_function_operator.cpp b/be/src/exec/operator/table_function_operator.cpp index fd97e8d69c68a8..397a9754620632 100644 --- a/be/src/exec/operator/table_function_operator.cpp +++ b/be/src/exec/operator/table_function_operator.cpp @@ -485,8 +485,9 @@ Status TableFunctionLocalState::get_expanded_block(RuntimeState* state, Block* o } auto& p = _parent->cast(); - MutableBlock m_block = - VectorizedUtils::build_mutable_mem_reuse_block(output_block, p._output_slots); + auto scoped_mutable_block = + VectorizedUtils::build_scoped_mutable_mem_reuse_block(output_block, p._output_slots); + auto& m_block = scoped_mutable_block.mutable_block(); MutableColumns& columns = m_block.mutable_columns(); for (int i = 0; i < p._fn_num; i++) { @@ -560,7 +561,7 @@ Status TableFunctionLocalState::get_expanded_block(RuntimeState* state, Block* o for (auto index : p._useless_slot_indexs) { columns[index]->insert_many_defaults(row_size - columns[index]->size()); } - output_block->set_columns(std::move(columns)); + scoped_mutable_block.restore(); { 
SCOPED_TIMER(_filter_timer); // 3. eval conjuncts @@ -578,8 +579,9 @@ Status TableFunctionLocalState::_get_expanded_block_for_outer_conjuncts(RuntimeS Block* output_block, bool* eos) { auto& p = _parent->cast(); - MutableBlock m_block = - VectorizedUtils::build_mutable_mem_reuse_block(output_block, p._output_slots); + auto scoped_mutable_block = + VectorizedUtils::build_scoped_mutable_mem_reuse_block(output_block, p._output_slots); + auto& m_block = scoped_mutable_block.mutable_block(); MutableColumns& columns = m_block.mutable_columns(); auto child_slot_count = p._child_slots.size(); for (int i = 0; i < p._fn_num; i++) { @@ -648,7 +650,7 @@ Status TableFunctionLocalState::_get_expanded_block_for_outer_conjuncts(RuntimeS for (auto index : p._useless_slot_indexs) { columns[index]->insert_many_defaults(output_row_count - columns[index]->size()); } - output_block->set_columns(std::move(columns)); + scoped_mutable_block.restore(); /** Handle the outer conjuncts after unnest. Currently, only left outer is supported. 
@@ -746,8 +748,9 @@ Status TableFunctionLocalState::_get_expanded_block_for_outer_conjuncts(RuntimeS } } if (!null_row_indices.empty()) { - MutableBlock m_block2 = VectorizedUtils::build_mutable_mem_reuse_block( + auto scoped_mutable_block2 = VectorizedUtils::build_scoped_mutable_mem_reuse_block( output_block, p._output_slots); + auto& m_block2 = scoped_mutable_block2.mutable_block(); MutableColumns& columns2 = m_block2.mutable_columns(); for (auto index : p._output_slot_indexs) { auto src_column = _child_block->get_by_position(index).column; @@ -759,7 +762,6 @@ Status TableFunctionLocalState::_get_expanded_block_for_outer_conjuncts(RuntimeS columns2[index]->insert_many_defaults(null_row_indices.size()); } columns2[child_slot_count]->insert_many_defaults(null_row_indices.size()); - output_block->set_columns(std::move(columns2)); } _child_rows_has_output.clear(); _child_block->clear_column_data(_parent->cast() diff --git a/be/src/exec/operator/union_sink_operator.h b/be/src/exec/operator/union_sink_operator.h index bdfb4a7303126e..14978ae4526178 100644 --- a/be/src/exec/operator/union_sink_operator.h +++ b/be/src/exec/operator/union_sink_operator.h @@ -157,8 +157,9 @@ class UnionSinkOperatorX MOCK_REMOVE(final) : public DataSinkOperatorXrows() > 0) { - MutableBlock mblock = - VectorizedUtils::build_mutable_mem_reuse_block(output_block, row_descriptor()); + auto scoped_mutable_block = VectorizedUtils::build_scoped_mutable_mem_reuse_block( + output_block, row_descriptor()); + auto& mblock = scoped_mutable_block.mutable_block(); Block res; auto& local_state = get_local_state(state); { @@ -168,7 +169,6 @@ class UnionSinkOperatorX MOCK_REMOVE(final) : public DataSinkOperatorXset_columns(std::move(mblock.mutable_columns())); } return Status::OK(); } diff --git a/be/src/exec/operator/union_source_operator.cpp b/be/src/exec/operator/union_source_operator.cpp index 9547c9a8184bcf..0efe4ed4efdb42 100644 --- a/be/src/exec/operator/union_source_operator.cpp +++ 
b/be/src/exec/operator/union_source_operator.cpp @@ -149,7 +149,9 @@ Status UnionSourceOperatorX::get_next_const(RuntimeState* state, Block* block) { SCOPED_PEAK_MEM(&local_state._estimate_memory_usage); auto& _const_expr_list_idx = local_state._const_expr_list_idx; - MutableBlock mblock = VectorizedUtils::build_mutable_mem_reuse_block(block, row_descriptor()); + auto scoped_mutable_block = + VectorizedUtils::build_scoped_mutable_mem_reuse_block(block, row_descriptor()); + auto& mblock = scoped_mutable_block.mutable_block(); ColumnsWithTypeAndName tmp_block_columns; for (; _const_expr_list_idx < _const_expr_lists.size() && mblock.rows() < state->batch_size(); @@ -177,9 +179,7 @@ Status UnionSourceOperatorX::get_next_const(RuntimeState* state, Block* block) { tmp_block.clear(); } } - if (mblock.rows() > 0) { - block->set_columns(std::move(mblock.mutable_columns())); - } + scoped_mutable_block.restore(); // some insert query like "insert into string_test select 1, repeat('a', 1024 * 1024);" // the const expr will be in output expr cause the union node return a empty block. 
so here we diff --git a/be/src/exec/rowid_fetcher.cpp b/be/src/exec/rowid_fetcher.cpp index 7241520013c3a7..34e124c421967e 100644 --- a/be/src/exec/rowid_fetcher.cpp +++ b/be/src/exec/rowid_fetcher.cpp @@ -164,8 +164,8 @@ Status RowIDFetcher::_merge_rpc_results(const PMultiGetRequest& request, default_values[i] = _fetch_option.desc->slots()[i]->col_default_value(); } } - MutableColumns output_columns = output_block->mutate_columns(); - Defer restore_columns([&]() { output_block->set_columns(std::move(output_columns)); }); + auto output_columns_guard = output_block->mutate_columns_scoped(); + MutableColumns& output_columns = output_columns_guard.mutable_columns(); for (int i = 0; i < resp.binary_row_data_size(); ++i) { RETURN_IF_ERROR(JsonbSerializeUtil::jsonb_to_columns( serdes, resp.binary_row_data(i).data(), resp.binary_row_data(i).size(), @@ -1124,8 +1124,8 @@ Status RowIdStorageReader::read_doris_format_row( return Status::InternalError("Tablet {} does not have row store for all columns", tablet->tablet_id()); } - MutableColumns result_columns = result_block.mutate_columns(); - Defer restore_columns([&]() { result_block.set_columns(std::move(result_columns)); }); + auto result_columns_guard = result_block.mutate_columns_scoped(); + MutableColumns& result_columns = result_columns_guard.mutable_columns(); for (auto row_id : row_ids) { RowLocation loc(rowset_id, segment->id(), cast_set(row_id)); row_store_read_struct.row_store_buffer.clear(); diff --git a/be/src/exec/scan/file_scanner.cpp b/be/src/exec/scan/file_scanner.cpp index f621050e337d8f..c0a79f9f38d9ec 100644 --- a/be/src/exec/scan/file_scanner.cpp +++ b/be/src/exec/scan/file_scanner.cpp @@ -775,8 +775,9 @@ Status FileScanner::_convert_to_output_block(Block* block) { // After convert, the column_ptr should be copied into output block. 
// Can not use block->insert() because it may cause use_count() non-zero bug - MutableBlock mutable_output_block = - VectorizedUtils::build_mutable_mem_reuse_block(block, *_dest_row_desc); + auto scoped_mutable_output_block = + VectorizedUtils::build_scoped_mutable_mem_reuse_block(block, *_dest_row_desc); + auto& mutable_output_block = scoped_mutable_output_block.mutable_block(); auto& mutable_output_columns = mutable_output_block.mutable_columns(); std::vector* skip_bitmaps {nullptr}; @@ -868,7 +869,7 @@ Status FileScanner::_convert_to_output_block(Block* block) { mutable_output_columns[j]->insert_range_from(*column_ptr, 0, rows); ctx_idx++; } - block->set_columns(std::move(mutable_output_columns)); + scoped_mutable_output_block.restore(); // after do the dest block insert operation, clear _src_block to remove the reference of origin column _src_block_ptr->clear(); diff --git a/be/src/exec/scan/scanner.cpp b/be/src/exec/scan/scanner.cpp index 5c9edc4f23dcab..ab76b884ef04fa 100644 --- a/be/src/exec/scan/scanner.cpp +++ b/be/src/exec/scan/scanner.cpp @@ -225,8 +225,9 @@ Status Scanner::_do_projections(Block* origin_block, Block* output_block) { } DCHECK_EQ(rows, input_block.rows()); - MutableBlock mutable_block = - VectorizedUtils::build_mutable_mem_reuse_block(output_block, *_output_row_descriptor); + auto scoped_mutable_block = VectorizedUtils::build_scoped_mutable_mem_reuse_block( + output_block, *_output_row_descriptor); + auto& mutable_block = scoped_mutable_block.mutable_block(); auto& mutable_columns = mutable_block.mutable_columns(); @@ -242,7 +243,7 @@ Status Scanner::_do_projections(Block* origin_block, Block* output_block) { mutable_columns[i] = IColumn::mutate(std::move(column_ptr)); } - output_block->set_columns(std::move(mutable_columns)); + scoped_mutable_block.restore(); // origin columns was moved into output_block, so we need to set origin_block to empty columns auto empty_columns = origin_block->clone_empty_columns(); diff --git 
a/be/src/exec/scan/scanner.h b/be/src/exec/scan/scanner.h index b7ff196a265097..4f5d511e94b573 100644 --- a/be/src/exec/scan/scanner.h +++ b/be/src/exec/scan/scanner.h @@ -115,9 +115,9 @@ class Scanner { if (_padding_block.empty()) { _padding_block.swap(_origin_block); } else if (_origin_block.rows()) { - auto mutable_block = MutableBlock::build_mutable_block(&_padding_block); + ScopedMutableBlock scoped_mutable_block(&_padding_block); + auto& mutable_block = scoped_mutable_block.mutable_block(); RETURN_IF_ERROR(mutable_block.merge(_origin_block)); - _padding_block.set_columns(std::move(mutable_block.mutable_columns())); } return Status::OK(); } diff --git a/be/src/exec/sink/writer/vtablet_writer.cpp b/be/src/exec/sink/writer/vtablet_writer.cpp index c77a94c501585b..0eb98ae48dd711 100644 --- a/be/src/exec/sink/writer/vtablet_writer.cpp +++ b/be/src/exec/sink/writer/vtablet_writer.cpp @@ -1760,10 +1760,12 @@ Status VTabletWriter::_send_new_partition_batch() { // 2. deal batched block // 3. now reuse the column of lval block. cuz write doesn't real adjust it. it generate a new block from that. _row_distribution.clear_batching_stats(); + Defer recover_batching_block([&]() { + _row_distribution._batching_block->set_mutable_columns( + std::move(tmp_block).mutate_columns()); + _row_distribution._batching_block->clear_column_data(); + }); RETURN_IF_ERROR(this->write(_state, tmp_block)); - _row_distribution._batching_block->set_mutable_columns( - tmp_block.mutate_columns()); // Recovery back - _row_distribution._batching_block->clear_column_data(); _row_distribution._deal_batched = false; } return Status::OK(); diff --git a/be/src/exec/sink/writer/vtablet_writer_v2.cpp b/be/src/exec/sink/writer/vtablet_writer_v2.cpp index 17f41063c6a33d..31a2d78819207d 100644 --- a/be/src/exec/sink/writer/vtablet_writer_v2.cpp +++ b/be/src/exec/sink/writer/vtablet_writer_v2.cpp @@ -620,10 +620,12 @@ Status VTabletWriterV2::_send_new_partition_batch() { // 2. deal batched block // 3. 
now reuse the column of lval block. cuz write doesn't real adjust it. it generate a new block from that. _row_distribution.clear_batching_stats(); + Defer recover_batching_block([&]() { + _row_distribution._batching_block->set_mutable_columns( + std::move(tmp_block).mutate_columns()); + _row_distribution._batching_block->clear_column_data(); + }); RETURN_IF_ERROR(this->write(_state, tmp_block)); - _row_distribution._batching_block->set_mutable_columns( - tmp_block.mutate_columns()); // Recovery back - _row_distribution._batching_block->clear_column_data(); _row_distribution._deal_batched = false; } return Status::OK(); diff --git a/be/src/exec/sort/partition_sorter.cpp b/be/src/exec/sort/partition_sorter.cpp index 89be3b90dc6fb1..87b915990d1dfa 100644 --- a/be/src/exec/sort/partition_sorter.cpp +++ b/be/src/exec/sort/partition_sorter.cpp @@ -100,8 +100,9 @@ Status PartitionSorter::_read_row_num(Block* output_block, bool* eos, int batch_ auto& queue = _state->get_queue(); size_t num_columns = _state->unsorted_block()->columns(); - MutableBlock m_block = - VectorizedUtils::build_mutable_mem_reuse_block(output_block, *_state->unsorted_block()); + auto scoped_mutable_block = VectorizedUtils::build_scoped_mutable_mem_reuse_block( + output_block, *_state->unsorted_block()); + auto& m_block = scoped_mutable_block.mutable_block(); MutableColumns& merged_columns = m_block.mutable_columns(); size_t merged_rows = 0; @@ -121,10 +122,11 @@ Status PartitionSorter::_read_row_num(Block* output_block, bool* eos, int batch_ if (current->impl->is_last(step) && current->impl->pos == 0) { if (merged_rows != 0) { // return directly for next time's read swap whole block - output_block->set_columns(std::move(merged_columns)); + scoped_mutable_block.restore(); return Status::OK(); } // swap and return block directly when we should get all data from cursor + scoped_mutable_block.restore(); output_block->swap(*current->impl->block); merged_rows += step; _output_total_rows += step; @@ -148,7 
+150,6 @@ Status PartitionSorter::_read_row_num(Block* output_block, bool* eos, int batch_ } } - output_block->set_columns(std::move(merged_columns)); return Status::OK(); } @@ -156,8 +157,9 @@ Status PartitionSorter::_read_row_rank(Block* output_block, bool* eos, int batch auto& queue = _state->get_queue(); size_t num_columns = _state->unsorted_block()->columns(); - MutableBlock m_block = - VectorizedUtils::build_mutable_mem_reuse_block(output_block, *_state->unsorted_block()); + auto scoped_mutable_block = VectorizedUtils::build_scoped_mutable_mem_reuse_block( + output_block, *_state->unsorted_block()); + auto& m_block = scoped_mutable_block.mutable_block(); MutableColumns& merged_columns = m_block.mutable_columns(); size_t merged_rows = 0; @@ -180,7 +182,7 @@ Status PartitionSorter::_read_row_rank(Block* output_block, bool* eos, int batch // rank() maybe need check when have get a distinct row // so when the cmp_res is get a distinct row, need check have output all rows num if (_get_enough_data()) { - output_block->set_columns(std::move(merged_columns)); + scoped_mutable_block.restore(); return Status::OK(); } *_previous_row = *current; @@ -199,7 +201,6 @@ Status PartitionSorter::_read_row_rank(Block* output_block, bool* eos, int batch } } - output_block->set_columns(std::move(merged_columns)); return Status::OK(); } diff --git a/be/src/exec/sort/sorter.cpp b/be/src/exec/sort/sorter.cpp index 616cc2145a2d16..686cca97e84e1c 100644 --- a/be/src/exec/sort/sorter.cpp +++ b/be/src/exec/sort/sorter.cpp @@ -114,7 +114,9 @@ void MergeSorterState::_merge_sort_read_impl(int batch_size, doris::Block* block size_t num_columns = unsorted_block()->columns(); - MutableBlock m_block = VectorizedUtils::build_mutable_mem_reuse_block(block, *unsorted_block()); + auto scoped_mutable_block = + VectorizedUtils::build_scoped_mutable_mem_reuse_block(block, *unsorted_block()); + auto& m_block = scoped_mutable_block.mutable_block(); MutableColumns& merged_columns = 
m_block.mutable_columns(); /// Take rows from queue in right order and push to 'merged'. @@ -143,7 +145,6 @@ void MergeSorterState::_merge_sort_read_impl(int batch_size, doris::Block* block } } - block->set_columns(std::move(merged_columns)); *eos = merged_rows == 0; } diff --git a/be/src/exec/sort/vsorted_run_merger.cpp b/be/src/exec/sort/vsorted_run_merger.cpp index b4c142cd4f1287..8323490031df06 100644 --- a/be/src/exec/sort/vsorted_run_merger.cpp +++ b/be/src/exec/sort/vsorted_run_merger.cpp @@ -150,8 +150,9 @@ Status VSortedRunMerger::get_next(Block* output_block, bool* eos) { return Status::OK(); } else { size_t num_columns = _priority_queue.top().impl->block->columns(); - MutableBlock m_block = VectorizedUtils::build_mutable_mem_reuse_block( + auto scoped_mutable_block = VectorizedUtils::build_scoped_mutable_mem_reuse_block( output_block, *_priority_queue.top().impl->block); + auto& m_block = scoped_mutable_block.mutable_block(); MutableColumns& merged_columns = m_block.mutable_columns(); if (num_columns != merged_columns.size()) { @@ -194,12 +195,12 @@ Status VSortedRunMerger::get_next(Block* output_block, bool* eos) { current->next(); if (_need_more_data(current)) { do_insert(); - output_block->set_columns(std::move(merged_columns)); + scoped_mutable_block.restore(); return Status::OK(); } } do_insert(); - output_block->set_columns(std::move(merged_columns)); + scoped_mutable_block.restore(); if (merged_rows == 0) { *eos = true; @@ -222,4 +223,4 @@ bool VSortedRunMerger::_need_more_data(MergeSortCursor& current) { } } -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/exprs/function/cast/cast_base.h b/be/src/exprs/function/cast/cast_base.h index d759ead1260a72..cc58e29d4acd11 100644 --- a/be/src/exprs/function/cast/cast_base.h +++ b/be/src/exprs/function/cast/cast_base.h @@ -22,6 +22,8 @@ #include "core/assert_cast.h" #include "core/block/block.h" #include "core/call_on_type_index.h" +#include 
"core/column/column_nullable.h" +#include "core/column/column_vector.h" #include "core/data_type/data_type.h" #include "core/data_type/data_type_array.h" #include "core/data_type/data_type_bitmap.h" @@ -98,6 +100,12 @@ constexpr static bool IsBaseCastFromType = IsBaseCastToType || IsStringTypecreate_column(), + ColumnUInt8::create()); +} + namespace CastWrapper { using WrapperType = std::function requires(IsDataTypeNumber) class CastToImpl : public CastToBase { Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, - uint32_t result, size_t input_rows_count, + uint32_t result, size_t /*input_rows_count*/, const NullMap::value_type* null_map = nullptr) const override { const auto* col_from = assert_cast( block.get_by_position(arguments[0]).column.get()); auto to_type = block.get_by_position(result).type; - auto serde = remove_nullable(to_type)->get_serde(); - - // by default framework, to_type is already unwrapped nullable - MutableColumnPtr column_to = to_type->create_column(); - ColumnNullable::MutablePtr nullable_col_to = ColumnNullable::create( - std::move(column_to), ColumnUInt8::create(input_rows_count, 0)); + auto nested_to_type = remove_nullable(to_type); + auto serde = nested_to_type->get_serde(); DataTypeSerDe::FormatOptions format_options; format_options.converted_from_string = true; if constexpr (Mode == CastModeType::NonStrictMode) { + auto nullable_col_to = create_empty_nullable_column(nested_to_type); // may write nulls to nullable_col_to RETURN_IF_ERROR(serde->from_string_batch(*col_from, *nullable_col_to, format_options)); + block.get_by_position(result).column = std::move(nullable_col_to); } else if constexpr (Mode == CastModeType::StrictMode) { - // WON'T write nulls to nullable_col_to, just raise errors. 
null_map is only used to skip invalid rows - RETURN_IF_ERROR(serde->from_string_strict_mode_batch( - *col_from, nullable_col_to->get_nested_column(), format_options, null_map)); + MutableColumnPtr column_to = nested_to_type->create_column(); + // WON'T write nulls to the result column, just raise errors. null_map is only used to skip invalid rows + RETURN_IF_ERROR(serde->from_string_strict_mode_batch(*col_from, *column_to, + format_options, null_map)); + block.get_by_position(result).column = std::move(column_to); } else { return Status::InternalError("Unsupported cast mode"); } - block.get_by_position(result).column = std::move(nullable_col_to); return Status::OK(); } }; diff --git a/be/src/exprs/function/cast/cast_to_boolean.h b/be/src/exprs/function/cast/cast_to_boolean.h index a1a63522eb1a3e..7fda47712e582f 100644 --- a/be/src/exprs/function/cast/cast_to_boolean.h +++ b/be/src/exprs/function/cast/cast_to_boolean.h @@ -122,30 +122,29 @@ template class CastToImpl : public CastToBase { public: Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, - uint32_t result, size_t input_rows_count, + uint32_t result, size_t /*input_rows_count*/, const NullMap::value_type* null_map = nullptr) const override { const auto* col_from = assert_cast( block.get_by_position(arguments[0]).column.get()); auto to_type = block.get_by_position(result).type; - auto serde = remove_nullable(to_type)->get_serde(); - - // by default framework, to_type is already unwrapped nullable - MutableColumnPtr column_to = to_type->create_column(); - ColumnNullable::MutablePtr nullable_col_to = ColumnNullable::create( - std::move(column_to), ColumnUInt8::create(input_rows_count, 0)); + auto nested_to_type = remove_nullable(to_type); + auto serde = nested_to_type->get_serde(); if constexpr (Mode == CastModeType::NonStrictMode) { + auto nullable_col_to = create_empty_nullable_column(nested_to_type); // may write nulls to nullable_col_to 
RETURN_IF_ERROR(serde->from_string_batch(*col_from, *nullable_col_to, {})); + block.get_by_position(result).column = std::move(nullable_col_to); } else if constexpr (Mode == CastModeType::StrictMode) { - // WON'T write nulls to nullable_col_to, just raise errors. null_map is only used to skip invalid rows - RETURN_IF_ERROR(serde->from_string_strict_mode_batch( - *col_from, nullable_col_to->get_nested_column(), {}, null_map)); + MutableColumnPtr column_to = nested_to_type->create_column(); + // WON'T write nulls to the result column, just raise errors. null_map is only used to skip invalid rows + RETURN_IF_ERROR( + serde->from_string_strict_mode_batch(*col_from, *column_to, {}, null_map)); + block.get_by_position(result).column = std::move(column_to); } else { return Status::InternalError("Unsupported cast mode"); } - block.get_by_position(result).column = std::move(nullable_col_to); return Status::OK(); } }; diff --git a/be/src/exprs/function/cast/cast_to_date.h b/be/src/exprs/function/cast/cast_to_date.h index 535de7e3d482aa..009739b0b0bbbf 100644 --- a/be/src/exprs/function/cast/cast_to_date.h +++ b/be/src/exprs/function/cast/cast_to_date.h @@ -47,32 +47,31 @@ template class CastToImpl : public CastToBase { public: Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, - uint32_t result, size_t input_rows_count, + uint32_t result, size_t /*input_rows_count*/, const NullMap::value_type* null_map = nullptr) const override { const auto* col_from = assert_cast( block.get_by_position(arguments[0]).column.get()); auto to_type = block.get_by_position(result).type; - auto serde = remove_nullable(to_type)->get_serde(); + auto nested_to_type = remove_nullable(to_type); + auto serde = nested_to_type->get_serde(); DataTypeSerDe::FormatOptions options; options.timezone = &context->state()->timezone_obj(); - // by default framework, to_type is already unwrapped nullable - MutableColumnPtr column_to = to_type->create_column(); - 
ColumnNullable::MutablePtr nullable_col_to = ColumnNullable::create( - std::move(column_to), ColumnUInt8::create(input_rows_count, 0)); - if constexpr (CastMode == CastModeType::StrictMode) { - // WON'T write nulls to nullable_col_to, just raise errors. null_map is only used to skip invalid rows - RETURN_IF_ERROR(serde->from_string_strict_mode_batch( - *col_from, nullable_col_to->get_nested_column(), options, null_map)); + MutableColumnPtr column_to = nested_to_type->create_column(); + // WON'T write nulls to the result column, just raise errors. null_map is only used to skip invalid rows + RETURN_IF_ERROR( + serde->from_string_strict_mode_batch(*col_from, *column_to, options, null_map)); + block.get_by_position(result).column = std::move(column_to); } else { + auto nullable_col_to = create_empty_nullable_column(nested_to_type); // may write nulls to nullable_col_to RETURN_IF_ERROR(serde->from_string_batch(*col_from, *nullable_col_to, options)); + block.get_by_position(result).column = std::move(nullable_col_to); } - block.get_by_position(result).column = std::move(nullable_col_to); return Status::OK(); } }; @@ -82,36 +81,35 @@ template class CastToImpl : public CastToBase { public: Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, - uint32_t result, size_t input_rows_count, + uint32_t result, size_t /*input_rows_count*/, const NullMap::value_type* null_map = nullptr) const override { const auto* col_from = assert_cast( block.get_by_position(arguments[0]).column.get()); auto to_type = block.get_by_position(result).type; + auto nested_to_type = remove_nullable(to_type); auto concrete_serde = std::dynamic_pointer_cast( - remove_nullable(to_type)->get_serde()); - - // by default framework, to_type is already unwrapped nullable - MutableColumnPtr column_to = to_type->create_column(); - ColumnNullable::MutablePtr nullable_col_to = ColumnNullable::create( - std::move(column_to), ColumnUInt8::create(input_rows_count, 0)); + 
nested_to_type->get_serde()); // datelike types serde must have template functions for those types. but because of they need to be // template functions, so we cannot make them virtual. that's why we assert_cast `serde` before. if constexpr (CastMode == CastModeType::StrictMode) { - // WON'T write nulls to nullable_col_to, just raise errors. null_map is only used to skip invalid rows + MutableColumnPtr column_to = nested_to_type->create_column(); + // WON'T write nulls to the result column, just raise errors. null_map is only used to skip invalid rows if constexpr (IsDataTypeInt) { RETURN_IF_ERROR(concrete_serde->template from_int_strict_mode_batch( - *col_from, nullable_col_to->get_nested_column())); + *col_from, *column_to)); } else if constexpr (IsDataTypeFloat) { RETURN_IF_ERROR(concrete_serde->template from_float_strict_mode_batch( - *col_from, nullable_col_to->get_nested_column())); + *col_from, *column_to)); } else { static_assert(IsDataTypeDecimal); RETURN_IF_ERROR( concrete_serde->template from_decimal_strict_mode_batch( - *col_from, nullable_col_to->get_nested_column())); + *col_from, *column_to)); } + block.get_by_position(result).column = std::move(column_to); } else { + auto nullable_col_to = create_empty_nullable_column(nested_to_type); // may write nulls to nullable_col_to if constexpr (IsDataTypeInt) { RETURN_IF_ERROR(concrete_serde->template from_int_batch( @@ -124,9 +122,9 @@ class CastToImpl : public CastToBase { RETURN_IF_ERROR(concrete_serde->template from_decimal_batch( *col_from, *nullable_col_to)); } + block.get_by_position(result).column = std::move(nullable_col_to); } - block.get_by_position(result).column = std::move(nullable_col_to); return Status::OK(); } }; diff --git a/be/src/exprs/function/cast/cast_to_decimal.h b/be/src/exprs/function/cast/cast_to_decimal.h index daec3a53a548d5..0617aca16ea014 100644 --- a/be/src/exprs/function/cast/cast_to_decimal.h +++ b/be/src/exprs/function/cast/cast_to_decimal.h @@ -684,31 +684,30 @@ template 
requires(IsDataTypeDecimal) class CastToImpl : public CastToBase { Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, - uint32_t result, size_t input_rows_count, + uint32_t result, size_t /*input_rows_count*/, const NullMap::value_type* null_map = nullptr) const override { const auto* col_from = assert_cast( block.get_by_position(arguments[0]).column.get()); auto to_type = block.get_by_position(result).type; - auto serde = remove_nullable(to_type)->get_serde(); - - // by default framework, to_type is already unwrapped nullable - MutableColumnPtr column_to = to_type->create_column(); - ColumnNullable::MutablePtr nullable_col_to = ColumnNullable::create( - std::move(column_to), ColumnUInt8::create(input_rows_count, 0)); + auto nested_to_type = remove_nullable(to_type); + auto serde = nested_to_type->get_serde(); if constexpr (Mode == CastModeType::NonStrictMode) { + auto nullable_col_to = create_empty_nullable_column(nested_to_type); // may write nulls to nullable_col_to RETURN_IF_ERROR(serde->from_string_batch(*col_from, *nullable_col_to, {})); + block.get_by_position(result).column = std::move(nullable_col_to); } else if constexpr (Mode == CastModeType::StrictMode) { - // WON'T write nulls to nullable_col_to, just raise errors. null_map is only used to skip invalid rows - RETURN_IF_ERROR(serde->from_string_strict_mode_batch( - *col_from, nullable_col_to->get_nested_column(), {}, null_map)); + MutableColumnPtr column_to = nested_to_type->create_column(); + // WON'T write nulls to the result column, just raise errors. 
null_map is only used to skip invalid rows + RETURN_IF_ERROR( + serde->from_string_strict_mode_batch(*col_from, *column_to, {}, null_map)); + block.get_by_position(result).column = std::move(column_to); } else { return Status::InternalError("Unsupported cast mode"); } - block.get_by_position(result).column = std::move(nullable_col_to); return Status::OK(); } }; diff --git a/be/src/exprs/function/cast/cast_to_ip.h b/be/src/exprs/function/cast/cast_to_ip.h index 81eb1e798edc93..a585261e18b169 100644 --- a/be/src/exprs/function/cast/cast_to_ip.h +++ b/be/src/exprs/function/cast/cast_to_ip.h @@ -51,31 +51,30 @@ template class CastToImpl : public CastToBase { public: Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, - uint32_t result, size_t input_rows_count, + uint32_t result, size_t /*input_rows_count*/, const NullMap::value_type* null_map = nullptr) const override { const auto* col_from = assert_cast( block.get_by_position(arguments[0]).column.get()); auto to_type = block.get_by_position(result).type; - auto serde = remove_nullable(to_type)->get_serde(); - - // by default framework, to_type is already unwrapped nullable - MutableColumnPtr column_to = to_type->create_column(); - ColumnNullable::MutablePtr nullable_col_to = ColumnNullable::create( - std::move(column_to), ColumnUInt8::create(input_rows_count, 0)); + auto nested_to_type = remove_nullable(to_type); + auto serde = nested_to_type->get_serde(); if constexpr (Mode == CastModeType::NonStrictMode) { + auto nullable_col_to = create_empty_nullable_column(nested_to_type); // may write nulls to nullable_col_to RETURN_IF_ERROR(serde->from_string_batch(*col_from, *nullable_col_to, {})); + block.get_by_position(result).column = std::move(nullable_col_to); } else if constexpr (Mode == CastModeType::StrictMode) { - // WON'T write nulls to nullable_col_to, just raise errors. 
null_map is only used to skip invalid rows - RETURN_IF_ERROR(serde->from_string_strict_mode_batch( - *col_from, nullable_col_to->get_nested_column(), {}, null_map)); + MutableColumnPtr column_to = nested_to_type->create_column(); + // WON'T write nulls to the result column, just raise errors. null_map is only used to skip invalid rows + RETURN_IF_ERROR( + serde->from_string_strict_mode_batch(*col_from, *column_to, {}, null_map)); + block.get_by_position(result).column = std::move(column_to); } else { return Status::InternalError("Unsupported cast mode"); } - block.get_by_position(result).column = std::move(nullable_col_to); return Status::OK(); } }; @@ -103,4 +102,4 @@ class CastToImpl : public CastToBase { return Status::OK(); } }; -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/exprs/function/cast/cast_to_string.h b/be/src/exprs/function/cast/cast_to_string.h index 15879104a21076..c38b8ec90f920a 100644 --- a/be/src/exprs/function/cast/cast_to_string.h +++ b/be/src/exprs/function/cast/cast_to_string.h @@ -567,7 +567,14 @@ class CastToStringFunction { auto time_zone = cctz::utc_time_zone(); options.timezone = (context && context->state()) ? 
&context->state()->timezone_obj() : &time_zone; - type.get_serde()->to_string_batch(col_from, *col_to, options); + ColumnPtr limited_col; + const IColumn* col_to_serialize = &col_from; + if (col_from.size() != input_rows_count) { + DORIS_CHECK(col_from.size() >= input_rows_count); + limited_col = col_from.cut(0, input_rows_count); + col_to_serialize = limited_col.get(); + } + type.get_serde()->to_string_batch(*col_to_serialize, *col_to, options); block.replace_by_position(result, std::move(col_to)); return Status::OK(); diff --git a/be/src/format/arrow/arrow_stream_reader.cpp b/be/src/format/arrow/arrow_stream_reader.cpp index d5b53dff3306e5..7d496d803a6248 100644 --- a/be/src/format/arrow/arrow_stream_reader.cpp +++ b/be/src/format/arrow/arrow_stream_reader.cpp @@ -94,7 +94,8 @@ Status ArrowStreamReader::_do_get_next_block(Block* block, size_t* read_rows, bo std::move(res_reader).ValueUnsafe(); // convert arrow batch to block - auto columns = block->mutate_columns(); + auto columns_guard = block->mutate_columns_scoped(); + auto& columns = columns_guard.mutable_columns(); size_t batch_size = out_batches.size(); for (size_t i = 0; i < batch_size; i++) { arrow::RecordBatch& batch = *out_batches[i]; @@ -105,15 +106,17 @@ Status ArrowStreamReader::_do_get_next_block(Block* block, size_t* read_rows, bo std::string column_name = batch.schema()->field(c)->name(); try { - const ColumnWithTypeAndName& column_with_name = block->safe_get_by_position(c); + const auto& column_name_in_block = columns_guard.get_name_by_position(c); - if (column_with_name.name != column_name) { + if (column_name_in_block != column_name) { return Status::InternalError("Column name mismatch: expected {}, got {}", - column_with_name.name, column_name); + column_name_in_block, column_name); } - RETURN_IF_ERROR(column_with_name.type->get_serde()->read_column_from_arrow( - *columns[c], column, 0, num_rows, _ctzz)); + RETURN_IF_ERROR( + columns_guard.get_datatype_by_position(c) + ->get_serde() + 
->read_column_from_arrow(*columns[c], column, 0, num_rows, _ctzz)); } catch (Exception& e) { return Status::InternalError("Failed to convert from arrow to block: {}", e.what()); } @@ -121,7 +124,6 @@ Status ArrowStreamReader::_do_get_next_block(Block* block, size_t* read_rows, bo *read_rows += batch.num_rows(); } - block->set_columns(std::move(columns)); *eof = (*read_rows == 0); return Status::OK(); } diff --git a/be/src/format/column_type_convert.cpp b/be/src/format/column_type_convert.cpp index b7a8388b5be771..08fe5c8a4ce794 100644 --- a/be/src/format/column_type_convert.cpp +++ b/be/src/format/column_type_convert.cpp @@ -18,6 +18,7 @@ #include "format/column_type_convert.h" #include "common/cast_set.h" +#include "core/column/column_nullable.h" #include "core/data_type/define_primitive_type.h" namespace doris::converter { @@ -111,19 +112,12 @@ ColumnPtr ColumnTypeConverter::get_column(const DataTypePtr& src_type, ColumnPtr _cached_src_type = dst_type->is_nullable() ? get_data_type_with_default_argument(make_nullable(src_type)) : get_data_type_with_default_argument(remove_nullable(src_type)); - _cached_src_column = remove_nullable(_cached_src_type)->create_column(); + _cached_src_column = _cached_src_type->create_column(); } // remove the old cached data - _cached_src_column->assume_mutable()->clear(); - - if (dst_type->is_nullable()) { - // Seed the source nullable wrapper with the destination's current null map. Under the - // assert-mutability COW contract ColumnNullable::create() mutates/clones the subcolumns, so - // readers that append file nulls must copy back only the newly appended null-map slice. 
- const auto* doris_nullable_column = static_cast(dst_column.get()); - return ColumnNullable::create(_cached_src_column, - doris_nullable_column->get_null_map_column_ptr()); - } + auto cached_src_column = IColumn::mutate(std::move(_cached_src_column)); + cached_src_column->clear(); + _cached_src_column = std::move(cached_src_column); return _cached_src_column; } diff --git a/be/src/format/column_type_convert.h b/be/src/format/column_type_convert.h index 554e5a0c3662a2..700fdd2ac1cff4 100644 --- a/be/src/format/column_type_convert.h +++ b/be/src/format/column_type_convert.h @@ -123,8 +123,9 @@ class ColumnTypeConverter { /** * Get the column to read data from file with the type from file meta data. * If the converter is not consistent, the returned column is `_cached_src_column`. - * For performance reasons, the null map of `_cached_src_column` is a reference from - * the null map of `dst_column`, so there is no need to convert null map in `convert()`. + * For nullable destination columns, `_cached_src_column` is also nullable and owns its + * temporary null map. The reader fills this source null map first, then copies only the + * newly appended null slice back to the destination column before value conversion. * * According to the hive standard, if certain values fail to be converted(eg. string `row1` to int value), * these values are replaced by nulls. 
diff --git a/be/src/format/count_reader.h b/be/src/format/count_reader.h index 4b6e55337b23ac..0c4cd791e8542b 100644 --- a/be/src/format/count_reader.h +++ b/be/src/format/count_reader.h @@ -58,11 +58,11 @@ class CountReader : public GenericReader { auto rows = std::min(_remaining_rows, static_cast(_batch_size)); _remaining_rows -= rows; - auto mutate_columns = block->mutate_columns(); + auto mutable_columns_guard = block->mutate_columns_scoped(); + auto& mutate_columns = mutable_columns_guard.mutable_columns(); for (auto& col : mutate_columns) { col->resize(rows); } - block->set_columns(std::move(mutate_columns)); *read_rows = rows; *eof = (_remaining_rows == 0); diff --git a/be/src/format/csv/csv_reader.cpp b/be/src/format/csv/csv_reader.cpp index 90340afafe1739..266f569acbe9ae 100644 --- a/be/src/format/csv/csv_reader.cpp +++ b/be/src/format/csv/csv_reader.cpp @@ -443,13 +443,14 @@ Status CsvReader::_do_get_next_block(Block* block, size_t* read_rows, bool* eof) RETURN_IF_ERROR(_validate_line(Slice(ptr, size), &success)); ++rows; } - auto mutate_columns = block->mutate_columns(); + auto mutable_columns_guard = block->mutate_columns_scoped(); + auto& mutate_columns = mutable_columns_guard.mutable_columns(); for (auto& col : mutate_columns) { col->resize(rows); } - block->set_columns(std::move(mutate_columns)); } else { - auto columns = block->mutate_columns(); + auto columns_guard = block->mutate_columns_scoped(); + auto& columns = columns_guard.mutable_columns(); while (rows < batch_size && !_line_reader_eof && (columns_byte_size(columns) < max_block_bytes)) { const uint8_t* ptr = nullptr; @@ -483,7 +484,6 @@ Status CsvReader::_do_get_next_block(Block* block, size_t* read_rows, bool* eof) } RETURN_IF_ERROR(_fill_dest_columns(Slice(ptr, size), columns, &rows)); } - block->set_columns(std::move(columns)); } *eof = (rows == 0); diff --git a/be/src/format/lance/lance_rust_reader.cpp b/be/src/format/lance/lance_rust_reader.cpp index 2eed2356734ca3..092ce7211e90e5 
100644 --- a/be/src/format/lance/lance_rust_reader.cpp +++ b/be/src/format/lance/lance_rust_reader.cpp @@ -230,7 +230,8 @@ Status LanceRustReader::_do_get_next_block(Block* block, size_t* read_rows, bool const auto num_columns = record_batch->num_columns(); // Convert Arrow columns to Doris Block columns (same pattern as PaimonCppReader) - auto columns = block->mutate_columns(); + auto columns_guard = block->mutate_columns_scoped(); + auto& columns = columns_guard.mutable_columns(); for (int c = 0; c < num_columns; ++c) { const auto& field = record_batch->schema()->field(c); @@ -240,16 +241,17 @@ Status LanceRustReader::_do_get_next_block(Block* block, size_t* read_rows, bool } const auto block_pos = it->second; - const ColumnWithTypeAndName& column_with_name = block->get_by_position(block_pos); try { - RETURN_IF_ERROR(column_with_name.type->get_serde()->read_column_from_arrow( - *columns[block_pos], record_batch->column(c).get(), 0, num_rows, _ctzz)); + RETURN_IF_ERROR(columns_guard.get_datatype_by_position(block_pos) + ->get_serde() + ->read_column_from_arrow(*columns[block_pos], + record_batch->column(c).get(), 0, + num_rows, _ctzz)); } catch (Exception& e) { return Status::InternalError("Failed to convert Lance arrow to block: {}", e.what()); } } - block->set_columns(std::move(columns)); *read_rows = num_rows; *eof = false; return Status::OK(); diff --git a/be/src/format/orc/vorc_reader.cpp b/be/src/format/orc/vorc_reader.cpp index afc8b09933f9dc..70c4d4affaf44e 100644 --- a/be/src/format/orc/vorc_reader.cpp +++ b/be/src/format/orc/vorc_reader.cpp @@ -2857,9 +2857,8 @@ Status OrcReader::filter(orc::ColumnVectorBatch& data, uint16_t* sel, uint16_t s if (_lazy_read_ctx.resize_first_column) { // VExprContext.execute has an optimization, the filtering is executed when block->rows() > 0 // The following process may be tricky and time-consuming, but we have no other way. 
- auto column = IColumn::mutate(std::move(block->get_by_position(0).column)); - column->resize(size); - block->replace_by_position(0, std::move(column)); + auto column_guard = block->mutate_column_scoped(0); + column_guard.mutable_column()->resize(size); } // transactional hive orc delete row diff --git a/be/src/format/orc/vorc_reader.h b/be/src/format/orc/vorc_reader.h index 91c8ffa4a0684c..6d9f74ae4a0ace 100644 --- a/be/src/format/orc/vorc_reader.h +++ b/be/src/format/orc/vorc_reader.h @@ -226,7 +226,8 @@ class OrcReader : public TableFormatReader, public RowPositionProvider { if (col_pos < 0) { return Status::InternalError("Column {} not found in block", col_name); } - auto col = block->get_by_position(col_pos).column->assume_mutable(); + auto column_guard = block->mutate_column_scoped(col_pos); + auto& col = column_guard.mutable_column(); const auto& row_ids = this->current_batch_row_positions(); RETURN_IF_ERROR( _row_id_column_iterator->read_by_rowids(row_ids.data(), row_ids.size(), col)); diff --git a/be/src/format/parquet/fix_length_plain_decoder.h b/be/src/format/parquet/fix_length_plain_decoder.h index 1628b8c6d05c98..0c0c47197fac21 100644 --- a/be/src/format/parquet/fix_length_plain_decoder.h +++ b/be/src/format/parquet/fix_length_plain_decoder.h @@ -20,6 +20,7 @@ #include #include "common/status.h" +#include "core/column/column_fixed_length_object.h" #include "core/data_type/data_type.h" #include "format/parquet/decoder.h" #include "format/parquet/parquet_common.h" @@ -46,7 +47,13 @@ class FixLengthPlainDecoder final : public Decoder { return Status::IOError("Out-of-bounds access in parquet data decoder"); } - size_t primitive_length = remove_nullable(data_type)->get_size_of_value_in_memory(); + size_t primitive_length = _type_length; + if (const auto* fixed_length_column = + check_and_get_column(*doris_column)) { + DCHECK_EQ(fixed_length_column->item_size(), _type_length); + } else { + primitive_length = 
remove_nullable(data_type)->get_size_of_value_in_memory(); + } size_t data_index = doris_column->size() * primitive_length; size_t scale_size = (select_vector.num_values() - select_vector.num_filtered()) * (_type_length / primitive_length); diff --git a/be/src/format/parquet/parquet_column_convert.cpp b/be/src/format/parquet/parquet_column_convert.cpp index 940e95bd973306..8f8a6cc34fcb96 100644 --- a/be/src/format/parquet/parquet_column_convert.cpp +++ b/be/src/format/parquet/parquet_column_convert.cpp @@ -21,7 +21,9 @@ #include #include "common/cast_set.h" +#include "core/column/column_fixed_length_object.h" #include "core/column/column_nullable.h" +#include "core/data_type/data_type_fixed_length_object.h" #include "core/data_type/data_type_nullable.h" #include "core/data_type/define_primitive_type.h" #include "core/data_type/primitive_type.h" @@ -106,28 +108,38 @@ ColumnPtr PhysicalToLogicalConverter::get_physical_column(tparquet::Type::type s _cached_src_physical_type = std::make_shared(); break; case tparquet::Type::type::FIXED_LEN_BYTE_ARRAY: - _cached_src_physical_type = std::make_shared(); + _cached_src_physical_type = std::make_shared(); break; case tparquet::Type::type::INT96: _cached_src_physical_type = std::make_shared(); break; } - _cached_src_physical_column = _cached_src_physical_type->create_column(); + const bool is_fixed_length_byte_array = + src_physical_type == tparquet::Type::type::FIXED_LEN_BYTE_ARRAY; if (dst_logical_type->is_nullable()) { + MutableColumnPtr nested_physical_column; + if (is_fixed_length_byte_array) { + nested_physical_column = ColumnFixedLengthObject::create( + _convert_params->field_schema->parquet_schema.type_length); + } else { + nested_physical_column = _cached_src_physical_type->create_column(); + } + _cached_src_physical_column = ColumnNullable::create(std::move(nested_physical_column), + ColumnUInt8::create()); _cached_src_physical_type = make_nullable(_cached_src_physical_type); + } else { + if 
(is_fixed_length_byte_array) { + _cached_src_physical_column = ColumnFixedLengthObject::create( + _convert_params->field_schema->parquet_schema.type_length); + } else { + _cached_src_physical_column = _cached_src_physical_type->create_column(); + } } } // remove the old cached data - _cached_src_physical_column->assume_mutable()->clear(); - - if (dst_logical_type->is_nullable()) { - // In order to share null map between parquet converted src column and dst column to avoid copying. It is very tricky that will - // call mutable function `doris_nullable_column->get_null_map_column_ptr()` which will set `_need_update_has_null = true`. - // Because some operations such as agg will call `has_null()` to set `_need_update_has_null = false`. - auto* doris_nullable_column = assert_cast(dst_logical_column.get()); - return ColumnNullable::create(_cached_src_physical_column, - doris_nullable_column->get_null_map_column_ptr()); - } + auto cached_src_physical_column = IColumn::mutate(std::move(_cached_src_physical_column)); + cached_src_physical_column->clear(); + _cached_src_physical_column = std::move(cached_src_physical_column); return _cached_src_physical_column; } diff --git a/be/src/format/parquet/parquet_column_convert.h b/be/src/format/parquet/parquet_column_convert.h index fab6e5e98bf60e..9206ea285ac101 100644 --- a/be/src/format/parquet/parquet_column_convert.h +++ b/be/src/format/parquet/parquet_column_convert.h @@ -25,7 +25,9 @@ #include #include "common/cast_set.h" +#include "core/column/column_fixed_length_object.h" #include "core/column/column_varbinary.h" +#include "core/column/column_vector.h" #include "core/data_type/data_type_factory.hpp" #include "core/data_type/primitive_type.h" #include "core/extended_types.h" @@ -253,6 +255,25 @@ inline void align_null_map(ColumnPtr& src_column, ColumnPtr& dst_column, size_t } } +struct FixedLengthPhysicalData { + const uint8_t* data = nullptr; + size_t byte_size = 0; + size_t rows = 0; +}; + +inline FixedLengthPhysicalData 
get_fixed_length_physical_data(const IColumn& column, + size_t type_length) { + if (const auto* fixed_length_column = check_and_get_column(column)) { + DCHECK_EQ(fixed_length_column->item_size(), type_length); + return {fixed_length_column->get_data().data(), fixed_length_column->byte_size(), + fixed_length_column->size()}; + } + + const auto& uint8_column = assert_cast(column); + DCHECK_EQ(uint8_column.size() % type_length, 0); + return {uint8_column.get_data().data(), uint8_column.size(), uint8_column.size() / type_length}; +} + /** * Convert parquet physical column to logical column * In parquet document(https://github.com/apache/parquet-format/blob/master/LogicalTypes.md), @@ -272,11 +293,12 @@ inline void align_null_map(ColumnPtr& src_column, ColumnPtr& dst_column, size_t * Ultimate performance optimization: * 1. If process of (First => Second) is consistent, eg. from BYTE_ARRAY to string, no additional copies and conversions will be introduced; * 2. If process of (Second => Third) is consistent, no additional copies and conversions will be introduced; - * 3. Null map is share among all processes, no additional copies and conversions will be introduced in null map; + * 3. Null maps are owned by each temporary nullable column, and only appended null slices are + * copied between conversion stages; * 4. Only create one physical column in physical conversion, and reused in each loop; * 5. Only create one logical column in logical conversion, and reused in each loop; - * 6. FIXED_LENGTH_BYTE_ARRAY is read as ColumnUInt8 instead of ColumnString, so the underlying decoder has no process to decode string - * and use memory copy to read the data as a whole, and the conversion has no need to resolve the Offsets in ColumnString. + * 6. FIXED_LENGTH_BYTE_ARRAY is read as ColumnFixedLengthObject instead of ColumnString, so + * the decoder can copy fixed-size values as a whole while keeping nullable row counts valid. 
*/ class PhysicalToLogicalConverter { protected: @@ -491,16 +513,16 @@ class FixedSizeBinaryConverter : public PhysicalToLogicalConverter { ColumnPtr from_col = remove_nullable(src_physical_col); IColumn* to_col = get_mutable_inner_column(src_logical_column); - auto* src_data = assert_cast(from_col.get()); - size_t length = src_data->size(); - size_t num_values = length / _type_length; + const auto src_data = get_fixed_length_physical_data(*from_col, _type_length); + size_t length = src_data.byte_size; + size_t num_values = src_data.rows; auto& string_col = static_cast(*to_col); auto& offsets = string_col.get_offsets(); auto& chars = string_col.get_chars(); size_t origin_size = chars.size(); chars.resize(origin_size + length); - memcpy(chars.data() + origin_size, src_data->get_data().data(), length); + memcpy(chars.data() + origin_size, src_data.data, length); origin_size = offsets.size(); offsets.resize(origin_size + num_values); @@ -527,14 +549,13 @@ class Float16PhysicalConverter : public PhysicalToLogicalConverter { ColumnPtr from_col = remove_nullable(src_physical_col); IColumn* to_col = get_mutable_inner_column(src_logical_column); - const auto* src_data = assert_cast(from_col.get()); - size_t length = src_data->size(); - size_t num_values = length / _type_length; + const auto src_data = get_fixed_length_physical_data(*from_col, _type_length); + size_t num_values = src_data.rows; auto* to_float_column = assert_cast(to_col); size_t start_idx = to_float_column->size(); to_float_column->resize(start_idx + num_values); auto& to_float_column_data = to_float_column->get_data(); - const auto* ptr = src_data->get_data().data(); + const auto* ptr = src_data.data; for (int i = 0; i < num_values; ++i) { size_t offset = i * _type_length; const auto* data_ptr = ptr + offset; @@ -604,19 +625,13 @@ class UUIDVarBinaryConverter : public PhysicalToLogicalConverter { Status physical_convert(ColumnPtr& src_physical_col, ColumnPtr& src_logical_column) override { 
DCHECK(!is_column_const(*src_physical_col)) << src_physical_col->dump_structure(); DCHECK(!is_column_const(*src_logical_column)) << src_logical_column->dump_structure(); - const ColumnUInt8* uint8_col = nullptr; - if (is_column_nullable(*src_physical_col)) { - const auto& nullable = assert_cast(src_physical_col.get()); - uint8_col = &assert_cast(nullable->get_nested_column()); - } else { - uint8_col = &assert_cast(*src_physical_col); - } + const ColumnPtr from_col = remove_nullable(src_physical_col); + const auto src_data = get_fixed_length_physical_data(*from_col, _type_length); IColumn* to_col = get_mutable_inner_column(src_logical_column); auto* to_varbinary_column = assert_cast(to_col); - size_t length = uint8_col->size(); - size_t num_values = length / _type_length; - const auto* ptr = uint8_col->get_data().data(); + size_t num_values = src_data.rows; + const auto* ptr = src_data.data; for (int i = 0; i < num_values; ++i) { auto offset = i * _type_length; @@ -690,8 +705,9 @@ class FixedSizeToDecimal : public PhysicalToLogicalConverter { template Status _convert_internal(ColumnPtr& src_col, IColumn* dst_col) { - size_t rows = src_col->size() / fixed_type_length; - auto* buf = static_cast(src_col.get())->get_data().data(); + const auto src_data = get_fixed_length_physical_data(*src_col, fixed_type_length); + size_t rows = src_data.rows; + const auto* buf = src_data.data; size_t start_idx = dst_col->size(); dst_col->resize(start_idx + rows); diff --git a/be/src/format/parquet/vparquet_group_reader.cpp b/be/src/format/parquet/vparquet_group_reader.cpp index f2db75afd6d1b0..a346fa91585d6e 100644 --- a/be/src/format/parquet/vparquet_group_reader.cpp +++ b/be/src/format/parquet/vparquet_group_reader.cpp @@ -667,9 +667,8 @@ Status RowGroupReader::_do_lazy_read(Block* block, size_t batch_size, size_t* re if (_lazy_read_ctx.resize_first_column) { // VExprContext.execute has an optimization, the filtering is executed when block->rows() > 0 // The following process may be 
tricky and time-consuming, but we have no other way. - auto column = IColumn::mutate(std::move(block->get_by_position(0).column)); - column->resize(pre_read_rows); - block->replace_by_position(0, std::move(column)); + auto column_guard = block->mutate_column_scoped(0); + column_guard.mutable_column()->resize(pre_read_rows); } result_filter.assign(pre_read_rows, static_cast(1)); std::vector filters; @@ -899,7 +898,8 @@ Status RowGroupReader::_fill_missing_columns( RETURN_IF_ERROR(_get_block_column_pos(*block, kv.first, &block_pos)); if (kv.second == nullptr) { // no default column, fill with null - auto mutable_column = block->get_by_position(block_pos).column->assume_mutable(); + auto column_guard = block->mutate_column_scoped(block_pos); + auto& mutable_column = column_guard.mutable_column(); auto* nullable_column = assert_cast(mutable_column.get()); nullable_column->insert_many_defaults(rows); } else { diff --git a/be/src/format/parquet/vparquet_reader.cpp b/be/src/format/parquet/vparquet_reader.cpp index 2565b254338321..8485d9e9d2a173 100644 --- a/be/src/format/parquet/vparquet_reader.cpp +++ b/be/src/format/parquet/vparquet_reader.cpp @@ -297,6 +297,9 @@ Status ParquetReader::close() { void ParquetReader::_close_internal() { if (!_closed) { + _current_group_reader.reset(); + _tracing_file_reader.reset(); + _file_reader.reset(); _closed = true; } } diff --git a/be/src/format/parquet/vparquet_reader.h b/be/src/format/parquet/vparquet_reader.h index 68979bf9e4f027..cb6a1d21c7335c 100644 --- a/be/src/format/parquet/vparquet_reader.h +++ b/be/src/format/parquet/vparquet_reader.h @@ -187,7 +187,8 @@ class ParquetReader : public TableFormatReader { if (col_pos < 0) { return Status::InternalError("Column {} not found in block", col_name); } - auto col = block->get_by_position(col_pos).column->assume_mutable(); + auto column_guard = block->mutate_column_scoped(col_pos); + auto& col = column_guard.mutable_column(); const auto& row_ids = 
this->current_batch_row_positions(); RETURN_IF_ERROR( _row_id_column_iterator->read_by_rowids(row_ids.data(), row_ids.size(), col)); diff --git a/be/src/format/table/es/es_http_reader.cpp b/be/src/format/table/es/es_http_reader.cpp index 24cc4410b63ed1..0efb3c19e75add 100644 --- a/be/src/format/table/es/es_http_reader.cpp +++ b/be/src/format/table/es/es_http_reader.cpp @@ -147,11 +147,8 @@ Status EsHttpReader::_do_get_next_block(Block* block, size_t* read_rows, bool* e return Status::OK(); } - auto column_size = _tuple_desc->slots().size(); - std::vector columns(column_size); - for (size_t i = 0; i < column_size; i++) { - columns[i] = block->get_by_position(i).column->assume_mutable(); - } + auto columns_guard = block->mutate_columns_scoped(); + auto& columns = columns_guard.mutable_columns(); size_t rows_before = columns[0]->size(); const int batch_size = _state->batch_size(); diff --git a/be/src/format/table/iceberg_reader_mixin.h b/be/src/format/table/iceberg_reader_mixin.h index c02cecfb0430b0..50f29095e257af 100644 --- a/be/src/format/table/iceberg_reader_mixin.h +++ b/be/src/format/table/iceberg_reader_mixin.h @@ -159,8 +159,9 @@ class IcebergReaderMixin : public BaseReader, public TableSchemaChangeHelper { DORIS_CHECK(col_pos >= 0); if (_row_lineage_columns.first_row_id >= 0) { - auto col = block->get_by_position(col_pos).column->assume_mutable(); - auto* nullable_column = assert_cast(col.get()); + auto column_guard = block->mutate_column_scoped(col_pos); + auto* nullable_column = + assert_cast(column_guard.mutable_column().get()); auto& null_map = nullable_column->get_null_map_data(); auto& data = assert_cast(*nullable_column->get_nested_column_ptr()).get_data(); @@ -180,8 +181,9 @@ class IcebergReaderMixin : public BaseReader, public TableSchemaChangeHelper { DORIS_CHECK(col_pos >= 0); if (_row_lineage_columns.last_updated_sequence_number >= 0) { - auto col = block->get_by_position(col_pos).column->assume_mutable(); - auto* nullable_column = 
assert_cast(col.get()); + auto column_guard = block->mutate_column_scoped(col_pos); + auto* nullable_column = + assert_cast(column_guard.mutable_column().get()); auto& null_map = nullable_column->get_null_map_data(); auto& data = assert_cast(*nullable_column->get_nested_column_ptr()).get_data(); @@ -552,9 +554,9 @@ Status IcebergReaderMixin::_equality_delete_base( return st; } if (read_rows > 0) { - MutableBlock mutable_block(&eq_file_block); + ScopedMutableBlock scoped_mutable_block(&eq_file_block); + auto& mutable_block = scoped_mutable_block.mutable_block(); RETURN_IF_ERROR(mutable_block.merge(tmp_block)); - eq_file_block = mutable_block.to_block(); } } } diff --git a/be/src/format/table/paimon_cpp_reader.cpp b/be/src/format/table/paimon_cpp_reader.cpp index e628c30af737ba..c7454f043c9062 100644 --- a/be/src/format/table/paimon_cpp_reader.cpp +++ b/be/src/format/table/paimon_cpp_reader.cpp @@ -74,11 +74,11 @@ Status PaimonCppReader::_do_get_next_block(Block* block, size_t* read_rows, bool auto rows = std::min(_remaining_table_level_row_count, (int64_t)_state->query_options().batch_size); _remaining_table_level_row_count -= rows; - auto mutate_columns = block->mutate_columns(); + auto mutable_columns_guard = block->mutate_columns_scoped(); + auto& mutate_columns = mutable_columns_guard.mutable_columns(); for (auto& col : mutate_columns) { col->resize(rows); } - block->set_columns(std::move(mutate_columns)); *read_rows = rows; *eof = false; if (_remaining_table_level_row_count == 0) { @@ -117,7 +117,8 @@ Status PaimonCppReader::_do_get_next_block(Block* block, size_t* read_rows, bool auto record_batch = std::move(import_result).ValueUnsafe(); const auto num_rows = static_cast(record_batch->num_rows()); const auto num_columns = record_batch->num_columns(); - auto columns = block->mutate_columns(); + auto columns_guard = block->mutate_columns_scoped(); + auto& columns = columns_guard.mutable_columns(); for (int c = 0; c < num_columns; ++c) { const auto& field = 
record_batch->schema()->field(c); if (field->name() == VALUE_KIND_FIELD) { @@ -130,16 +131,17 @@ Status PaimonCppReader::_do_get_next_block(Block* block, size_t* read_rows, bool continue; } const auto block_pos = it->second; - const ColumnWithTypeAndName& column_with_name = block->get_by_position(block_pos); try { - RETURN_IF_ERROR(column_with_name.type->get_serde()->read_column_from_arrow( - *columns[block_pos], record_batch->column(c).get(), 0, num_rows, _ctzz)); + RETURN_IF_ERROR(columns_guard.get_datatype_by_position(block_pos) + ->get_serde() + ->read_column_from_arrow(*columns[block_pos], + record_batch->column(c).get(), 0, + num_rows, _ctzz)); } catch (Exception& e) { return Status::InternalError("Failed to convert from arrow to block: {}", e.what()); } } - block->set_columns(std::move(columns)); *read_rows = num_rows; *eof = false; return Status::OK(); diff --git a/be/src/format/table/paimon_jni_reader.cpp b/be/src/format/table/paimon_jni_reader.cpp index f50e59deb7ee3d..c2d43406504b73 100644 --- a/be/src/format/table/paimon_jni_reader.cpp +++ b/be/src/format/table/paimon_jni_reader.cpp @@ -105,11 +105,11 @@ Status PaimonJniReader::_do_get_next_block(Block* block, size_t* read_rows, bool auto rows = std::min(_remaining_table_level_row_count, (int64_t)_state->query_options().batch_size); _remaining_table_level_row_count -= rows; - auto mutate_columns = block->mutate_columns(); + auto mutable_columns_guard = block->mutate_columns_scoped(); + auto& mutate_columns = mutable_columns_guard.mutable_columns(); for (auto& col : mutate_columns) { col->resize(rows); } - block->set_columns(std::move(mutate_columns)); *read_rows = rows; if (_remaining_table_level_row_count == 0) { *eof = true; diff --git a/be/src/format/table/parquet_metadata_reader.cpp b/be/src/format/table/parquet_metadata_reader.cpp index 054bd1929a2e16..c1e9b3372e261d 100644 --- a/be/src/format/table/parquet_metadata_reader.cpp +++ b/be/src/format/table/parquet_metadata_reader.cpp @@ -808,32 +808,31 
@@ Status ParquetMetadataReader::_do_get_next_block(Block* block, size_t* read_rows // Scanner may call multiple times; we surface data once and mark eof on the next call. // When reusing a Block, wipe row data but keep column structure intact. - bool mem_reuse = block->mem_reuse(); - std::vector columns(_slots.size()); - if (mem_reuse) { - for (size_t i = 0; i < _slots.size(); ++i) { - columns[i] = IColumn::mutate(std::move(block->get_by_position(i).column)); - columns[i]->clear(); - } - } else { + const bool mem_reuse = block->mem_reuse(); + size_t produced = 0; + if (!mem_reuse) { + std::vector columns(_slots.size()); for (size_t i = 0; i < _slots.size(); ++i) { columns[i] = _slots[i]->get_empty_mutable_column(); } - } - size_t rows_before = block->rows(); - RETURN_IF_ERROR(_build_rows(columns)); - - if (!mem_reuse) { + RETURN_IF_ERROR(_build_rows(columns)); for (size_t i = 0; i < _slots.size(); ++i) { block->insert(ColumnWithTypeAndName( std::move(columns[i]), _slots[i]->get_data_type_ptr(), _slots[i]->col_name())); } + produced = block->rows(); } else { - block->set_columns(std::move(columns)); + auto columns_guard = block->mutate_columns_scoped(); + auto& columns = columns_guard.mutable_columns(); + for (size_t i = 0; i < _slots.size(); ++i) { + columns[i]->clear(); + } + + RETURN_IF_ERROR(_build_rows(columns)); + produced = columns[0]->size(); } - size_t produced = block->rows() - rows_before; *read_rows = produced; _eof = true; *eof = (produced == 0); diff --git a/be/src/format/table/remote_doris_reader.cpp b/be/src/format/table/remote_doris_reader.cpp index 487aad2869b90d..0e2184d65b62f5 100644 --- a/be/src/format/table/remote_doris_reader.cpp +++ b/be/src/format/table/remote_doris_reader.cpp @@ -72,21 +72,24 @@ Status RemoteDorisReader::_do_get_next_block(Block* block, size_t* read_rows, bo auto batch = chunk.data; auto num_rows = batch->num_rows(); auto num_columns = batch->num_columns(); - auto columns = block->mutate_columns(); + const auto 
block_structure = block->dump_structure(); + auto columns_guard = block->mutate_columns_scoped(); + auto& columns = columns_guard.mutable_columns(); for (int c = 0; c < num_columns; ++c) { arrow::Array* column = batch->column(c).get(); std::string column_name = batch->schema()->field(c)->name(); if (!_col_name_to_block_idx->contains(column_name)) { return Status::InternalError("column {} not found in block {}", column_name, - block->dump_structure()); + block_structure); } try { auto block_pos = (*_col_name_to_block_idx)[column_name]; - const ColumnWithTypeAndName& column_with_name = block->get_by_position(block_pos); - RETURN_IF_ERROR(column_with_name.type->get_serde()->read_column_from_arrow( - *columns[block_pos], column, 0, num_rows, _ctzz)); + RETURN_IF_ERROR(columns_guard.get_datatype_by_position(block_pos) + ->get_serde() + ->read_column_from_arrow(*columns[block_pos], column, 0, + num_rows, _ctzz)); } catch (Exception& e) { return Status::InternalError( "Failed to convert from arrow to block, column_name: {}, e: {}", column_name, @@ -94,7 +97,6 @@ Status RemoteDorisReader::_do_get_next_block(Block* block, size_t* read_rows, bo } } - block->set_columns(std::move(columns)); *read_rows += num_rows; return Status::OK(); diff --git a/be/src/format/table/table_format_reader.h b/be/src/format/table/table_format_reader.h index 9beff637b98533..ed5b414aa44dc1 100644 --- a/be/src/format/table/table_format_reader.h +++ b/be/src/format/table/table_format_reader.h @@ -67,8 +67,9 @@ class TableFormatReader : public GenericReader { if (it == _fill_partition_values.end()) { continue; } - auto col_ptr = block->get_by_position((*_fill_col_name_to_block_idx)[col_name]) - .column->assume_mutable(); + auto column_guard = + block->mutate_column_scoped((*_fill_col_name_to_block_idx)[col_name]); + auto& col_ptr = column_guard.mutable_column(); const auto& [value, slot_desc] = it->second; auto text_serde = slot_desc->get_data_type_ptr()->get_serde(); Slice slice(value.data(), 
value.size()); @@ -101,9 +102,9 @@ class TableFormatReader : public GenericReader { VExprContextSPtr ctx = (it != _fill_missing_defaults.end()) ? it->second : nullptr; if (ctx == nullptr) { - auto mutable_column = - block->get_by_position((*_fill_col_name_to_block_idx)[col_name]) - .column->assume_mutable(); + auto column_guard = + block->mutate_column_scoped((*_fill_col_name_to_block_idx)[col_name]); + auto& mutable_column = column_guard.mutable_column(); auto* nullable_column = static_cast(mutable_column.get()); nullable_column->insert_many_defaults(rows); } else { diff --git a/be/src/format/transformer/merge_partitioner.cpp b/be/src/format/transformer/merge_partitioner.cpp index f486c85476f0fc..89cf830d6bba53 100644 --- a/be/src/format/transformer/merge_partitioner.cpp +++ b/be/src/format/transformer/merge_partitioner.cpp @@ -210,7 +210,8 @@ Status MergePartitioner::do_partitioning(RuntimeState* state, Block* block) cons block->replace_by_position_if_const(col_idx); } - MutableColumns mutable_columns = block->mutate_columns(); + auto mutable_columns_guard = block->mutate_columns_scoped(); + MutableColumns& mutable_columns = mutable_columns_guard.mutable_columns(); MutableColumnPtr& op_mut = mutable_columns[op_idx]; ColumnInt8* op_values_col = nullptr; if (auto* nullable_col = check_and_get_column(op_mut.get())) { @@ -220,7 +221,6 @@ Status MergePartitioner::do_partitioning(RuntimeState* state, Block* block) cons op_values_col = check_and_get_column(op_mut.get()); } if (op_values_col == nullptr) { - block->set_columns(std::move(mutable_columns)); return Status::InternalError("Merge operation column must be tinyint"); } auto& op_values = op_values_col->get_data(); @@ -252,7 +252,6 @@ Status MergePartitioner::do_partitioning(RuntimeState* state, Block* block) cons _insert_random ? 
_next_rr_channel() : insert_hashes[row]; _channel_ids.push_back(insert_channel); } - block->set_columns(std::move(mutable_columns)); } return Status::OK(); diff --git a/be/src/information_schema/schema_active_queries_scanner.cpp b/be/src/information_schema/schema_active_queries_scanner.cpp index de0844af8abc93..bceac0347b517f 100644 --- a/be/src/information_schema/schema_active_queries_scanner.cpp +++ b/be/src/information_schema/schema_active_queries_scanner.cpp @@ -131,9 +131,9 @@ Status SchemaActiveQueriesScanner::get_next_block_internal(Block* block, bool* e } int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); - MutableBlock mblock = MutableBlock::build_mutable_block(block); + ScopedMutableBlock scoped_mblock(block); + auto& mblock = scoped_mblock.mutable_block(); RETURN_IF_ERROR(mblock.add_rows(_active_query_block.get(), _row_idx, current_batch_rows)); - block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; *eos = _row_idx == _total_rows; diff --git a/be/src/information_schema/schema_authentication_integrations_scanner.cpp b/be/src/information_schema/schema_authentication_integrations_scanner.cpp index 4cbf55b198d31b..37b9e6811baebb 100644 --- a/be/src/information_schema/schema_authentication_integrations_scanner.cpp +++ b/be/src/information_schema/schema_authentication_integrations_scanner.cpp @@ -134,10 +134,10 @@ Status SchemaAuthenticationIntegrationsScanner::get_next_block_internal(Block* b } int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); - MutableBlock mblock = MutableBlock::build_mutable_block(block); + ScopedMutableBlock scoped_mblock(block); + auto& mblock = scoped_mblock.mutable_block(); RETURN_IF_ERROR(mblock.add_rows(_authentication_integrations_block.get(), _row_idx, current_batch_rows)); - block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; *eos = _row_idx == _total_rows; diff --git 
a/be/src/information_schema/schema_backend_active_tasks.cpp b/be/src/information_schema/schema_backend_active_tasks.cpp index ddb15b84aa409d..5849a5ab438e9c 100644 --- a/be/src/information_schema/schema_backend_active_tasks.cpp +++ b/be/src/information_schema/schema_backend_active_tasks.cpp @@ -87,9 +87,9 @@ Status SchemaBackendActiveTasksScanner::get_next_block_internal(Block* block, bo } int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); - MutableBlock mblock = MutableBlock::build_mutable_block(block); + ScopedMutableBlock scoped_mblock(block); + auto& mblock = scoped_mblock.mutable_block(); RETURN_IF_ERROR(mblock.add_rows(_task_stats_block.get(), _row_idx, current_batch_rows)); - block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; *eos = _row_idx == _total_rows; diff --git a/be/src/information_schema/schema_backend_kerberos_ticket_cache.cpp b/be/src/information_schema/schema_backend_kerberos_ticket_cache.cpp index 5b25a84304d1bb..201262584f1304 100644 --- a/be/src/information_schema/schema_backend_kerberos_ticket_cache.cpp +++ b/be/src/information_schema/schema_backend_kerberos_ticket_cache.cpp @@ -83,9 +83,9 @@ Status SchemaBackendKerberosTicketCacheScanner::get_next_block_internal(Block* b } int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); - MutableBlock mblock = MutableBlock::build_mutable_block(block); + ScopedMutableBlock scoped_mblock(block); + auto& mblock = scoped_mblock.mutable_block(); RETURN_IF_ERROR(mblock.add_rows(_info_block.get(), _row_idx, current_batch_rows)); - block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; *eos = _row_idx == _total_rows; diff --git a/be/src/information_schema/schema_catalog_meta_cache_stats_scanner.cpp b/be/src/information_schema/schema_catalog_meta_cache_stats_scanner.cpp index 18e490f09b3fed..d00c93a4c4eec9 100644 --- a/be/src/information_schema/schema_catalog_meta_cache_stats_scanner.cpp +++ 
b/be/src/information_schema/schema_catalog_meta_cache_stats_scanner.cpp @@ -143,9 +143,9 @@ Status SchemaCatalogMetaCacheStatsScanner::get_next_block_internal(Block* block, } int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); - MutableBlock mblock = MutableBlock::build_mutable_block(block); + ScopedMutableBlock scoped_mblock(block); + auto& mblock = scoped_mblock.mutable_block(); RETURN_IF_ERROR(mblock.add_rows(_block.get(), _row_idx, current_batch_rows)); - block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; *eos = _row_idx == _total_rows; diff --git a/be/src/information_schema/schema_database_properties_scanner.cpp b/be/src/information_schema/schema_database_properties_scanner.cpp index d1427fe43e915f..dcb4810f171bdb 100644 --- a/be/src/information_schema/schema_database_properties_scanner.cpp +++ b/be/src/information_schema/schema_database_properties_scanner.cpp @@ -147,9 +147,9 @@ Status SchemaDatabasePropertiesScanner::get_next_block_internal(Block* block, bo return Status::OK(); } int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); - MutableBlock mblock = MutableBlock::build_mutable_block(block); + ScopedMutableBlock scoped_mblock(block); + auto& mblock = scoped_mblock.mutable_block(); RETURN_IF_ERROR(mblock.add_rows(_dbproperties_block.get(), _row_idx, current_batch_rows)); - block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; if (!check_and_mark_eos(eos)) { *eos = false; diff --git a/be/src/information_schema/schema_file_cache_statistics.cpp b/be/src/information_schema/schema_file_cache_statistics.cpp index 5be2df30d53b11..c8f1243c300ee2 100644 --- a/be/src/information_schema/schema_file_cache_statistics.cpp +++ b/be/src/information_schema/schema_file_cache_statistics.cpp @@ -75,9 +75,9 @@ Status SchemaFileCacheStatisticsScanner::get_next_block_internal(Block* block, b } int current_batch_rows = std::min(_block_rows_limit, 
_total_rows - _row_idx); - MutableBlock mblock = MutableBlock::build_mutable_block(block); + ScopedMutableBlock scoped_mblock(block); + auto& mblock = scoped_mblock.mutable_block(); RETURN_IF_ERROR(mblock.add_rows(_stats_block.get(), _row_idx, current_batch_rows)); - block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; *eos = _row_idx == _total_rows; diff --git a/be/src/information_schema/schema_partitions_scanner.cpp b/be/src/information_schema/schema_partitions_scanner.cpp index 87c0ce078b787d..4939fb2970247f 100644 --- a/be/src/information_schema/schema_partitions_scanner.cpp +++ b/be/src/information_schema/schema_partitions_scanner.cpp @@ -208,9 +208,9 @@ Status SchemaPartitionsScanner::get_next_block_internal(Block* block, bool* eos) } int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); - MutableBlock mblock = MutableBlock::build_mutable_block(block); + ScopedMutableBlock scoped_mblock(block); + auto& mblock = scoped_mblock.mutable_block(); RETURN_IF_ERROR(mblock.add_rows(_partitions_block.get(), _row_idx, current_batch_rows)); - block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; if (!check_and_mark_eos(eos)) { diff --git a/be/src/information_schema/schema_role_mappings_scanner.cpp b/be/src/information_schema/schema_role_mappings_scanner.cpp index 84d0e26eb44393..99e5211fbd88a0 100644 --- a/be/src/information_schema/schema_role_mappings_scanner.cpp +++ b/be/src/information_schema/schema_role_mappings_scanner.cpp @@ -132,9 +132,9 @@ Status SchemaRoleMappingsScanner::get_next_block_internal(Block* block, bool* eo } int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); - MutableBlock mblock = MutableBlock::build_mutable_block(block); + ScopedMutableBlock scoped_mblock(block); + auto& mblock = scoped_mblock.mutable_block(); RETURN_IF_ERROR(mblock.add_rows(_role_mappings_block.get(), _row_idx, current_batch_rows)); - 
block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; *eos = _row_idx == _total_rows; diff --git a/be/src/information_schema/schema_sql_block_rule_status_scanner.cpp b/be/src/information_schema/schema_sql_block_rule_status_scanner.cpp index 1fcc0cb838ad93..f52d5399bc4df7 100644 --- a/be/src/information_schema/schema_sql_block_rule_status_scanner.cpp +++ b/be/src/information_schema/schema_sql_block_rule_status_scanner.cpp @@ -167,10 +167,10 @@ Status SchemaSqlBlockRuleStatusScanner::get_next_block_internal(Block* block, bo } int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); - MutableBlock mblock = MutableBlock::build_mutable_block(block); + ScopedMutableBlock scoped_mblock(block); + auto& mblock = scoped_mblock.mutable_block(); RETURN_IF_ERROR( mblock.add_rows(_sql_block_rule_status_block.get(), _row_idx, current_batch_rows)); - block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; *eos = _row_idx == _total_rows; diff --git a/be/src/information_schema/schema_table_options_scanner.cpp b/be/src/information_schema/schema_table_options_scanner.cpp index 717cb91cccfa29..e102853429b868 100644 --- a/be/src/information_schema/schema_table_options_scanner.cpp +++ b/be/src/information_schema/schema_table_options_scanner.cpp @@ -165,9 +165,9 @@ Status SchemaTableOptionsScanner::get_next_block_internal(Block* block, bool* eo } int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); - MutableBlock mblock = MutableBlock::build_mutable_block(block); + ScopedMutableBlock scoped_mblock(block); + auto& mblock = scoped_mblock.mutable_block(); RETURN_IF_ERROR(mblock.add_rows(_tableoptions_block.get(), _row_idx, current_batch_rows)); - block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; if (!check_and_mark_eos(eos)) { diff --git a/be/src/information_schema/schema_table_properties_scanner.cpp 
b/be/src/information_schema/schema_table_properties_scanner.cpp index e89153542a190c..cd6680be7601a4 100644 --- a/be/src/information_schema/schema_table_properties_scanner.cpp +++ b/be/src/information_schema/schema_table_properties_scanner.cpp @@ -159,9 +159,9 @@ Status SchemaTablePropertiesScanner::get_next_block_internal(Block* block, bool* } int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); - MutableBlock mblock = MutableBlock::build_mutable_block(block); + ScopedMutableBlock scoped_mblock(block); + auto& mblock = scoped_mblock.mutable_block(); RETURN_IF_ERROR(mblock.add_rows(_tableproperties_block.get(), _row_idx, current_batch_rows)); - block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; if (!check_and_mark_eos(eos)) { diff --git a/be/src/information_schema/schema_table_stream_consumption_scanner.cpp b/be/src/information_schema/schema_table_stream_consumption_scanner.cpp index 6b3141e404bf27..5259b7b8db145b 100644 --- a/be/src/information_schema/schema_table_stream_consumption_scanner.cpp +++ b/be/src/information_schema/schema_table_stream_consumption_scanner.cpp @@ -129,10 +129,10 @@ Status SchemaTableStreamConsumptionScanner::get_next_block_internal(Block* block } int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); - MutableBlock mblock = MutableBlock::build_mutable_block(block); + ScopedMutableBlock scoped_mblock(block); + auto& mblock = scoped_mblock.mutable_block(); RETURN_IF_ERROR( mblock.add_rows(_table_stream_consumption_block.get(), _row_idx, current_batch_rows)); - block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; *eos = _row_idx == _total_rows; diff --git a/be/src/information_schema/schema_table_streams_scanner.cpp b/be/src/information_schema/schema_table_streams_scanner.cpp index 48299c7a1783c6..0c9697341ae929 100644 --- a/be/src/information_schema/schema_table_streams_scanner.cpp +++ 
b/be/src/information_schema/schema_table_streams_scanner.cpp @@ -130,9 +130,9 @@ Status SchemaTableStreamsScanner::get_next_block_internal(Block* block, bool* eo } int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); - MutableBlock mblock = MutableBlock::build_mutable_block(block); + ScopedMutableBlock scoped_mblock(block); + auto& mblock = scoped_mblock.mutable_block(); RETURN_IF_ERROR(mblock.add_rows(_table_streams_block.get(), _row_idx, current_batch_rows)); - block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; *eos = _row_idx == _total_rows; diff --git a/be/src/information_schema/schema_view_dependency_scanner.cpp b/be/src/information_schema/schema_view_dependency_scanner.cpp index 3723f4f9e5e2a3..2a7f684b0e1f2b 100644 --- a/be/src/information_schema/schema_view_dependency_scanner.cpp +++ b/be/src/information_schema/schema_view_dependency_scanner.cpp @@ -131,9 +131,9 @@ Status SchemaViewDependencyScanner::get_next_block_internal(Block* block, bool* } int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); - MutableBlock mblock = MutableBlock::build_mutable_block(block); + ScopedMutableBlock scoped_mblock(block); + auto& mblock = scoped_mblock.mutable_block(); RETURN_IF_ERROR(mblock.add_rows(_view_dependency_block.get(), _row_idx, current_batch_rows)); - block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; *eos = _row_idx == _total_rows; diff --git a/be/src/information_schema/schema_workload_group_privileges.cpp b/be/src/information_schema/schema_workload_group_privileges.cpp index 854e151fd2521d..627344036abc82 100644 --- a/be/src/information_schema/schema_workload_group_privileges.cpp +++ b/be/src/information_schema/schema_workload_group_privileges.cpp @@ -125,10 +125,10 @@ Status SchemaWorkloadGroupPrivilegesScanner::get_next_block_internal(Block* bloc } int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); - 
MutableBlock mblock = MutableBlock::build_mutable_block(block); + ScopedMutableBlock scoped_mblock(block); + auto& mblock = scoped_mblock.mutable_block(); RETURN_IF_ERROR( mblock.add_rows(_workload_groups_privs_block.get(), _row_idx, current_batch_rows)); - block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; *eos = _row_idx == _total_rows; diff --git a/be/src/information_schema/schema_workload_group_resource_usage_scanner.cpp b/be/src/information_schema/schema_workload_group_resource_usage_scanner.cpp index f790bf913bb75c..ada0151d14b190 100644 --- a/be/src/information_schema/schema_workload_group_resource_usage_scanner.cpp +++ b/be/src/information_schema/schema_workload_group_resource_usage_scanner.cpp @@ -78,9 +78,9 @@ Status SchemaBackendWorkloadGroupResourceUsage::get_next_block_internal(Block* b } int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); - MutableBlock mblock = MutableBlock::build_mutable_block(block); + ScopedMutableBlock scoped_mblock(block); + auto& mblock = scoped_mblock.mutable_block(); RETURN_IF_ERROR(mblock.add_rows(_block.get(), _row_idx, current_batch_rows)); - block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; *eos = _row_idx == _total_rows; diff --git a/be/src/information_schema/schema_workload_groups_scanner.cpp b/be/src/information_schema/schema_workload_groups_scanner.cpp index b2dd403f48652b..7375809b45e538 100644 --- a/be/src/information_schema/schema_workload_groups_scanner.cpp +++ b/be/src/information_schema/schema_workload_groups_scanner.cpp @@ -137,9 +137,9 @@ Status SchemaWorkloadGroupsScanner::get_next_block_internal(Block* block, bool* } int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); - MutableBlock mblock = MutableBlock::build_mutable_block(block); + ScopedMutableBlock scoped_mblock(block); + auto& mblock = scoped_mblock.mutable_block(); 
RETURN_IF_ERROR(mblock.add_rows(_workload_groups_block.get(), _row_idx, current_batch_rows)); - block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; *eos = _row_idx == _total_rows; diff --git a/be/src/information_schema/schema_workload_sched_policy_scanner.cpp b/be/src/information_schema/schema_workload_sched_policy_scanner.cpp index bc5d5f9c229e4c..eb82b26b8769f6 100644 --- a/be/src/information_schema/schema_workload_sched_policy_scanner.cpp +++ b/be/src/information_schema/schema_workload_sched_policy_scanner.cpp @@ -127,9 +127,9 @@ Status SchemaWorkloadSchedulePolicyScanner::get_next_block_internal(Block* block } int current_batch_rows = std::min(_block_rows_limit, _total_rows - _row_idx); - MutableBlock mblock = MutableBlock::build_mutable_block(block); + ScopedMutableBlock scoped_mblock(block); + auto& mblock = scoped_mblock.mutable_block(); RETURN_IF_ERROR(mblock.add_rows(_block.get(), _row_idx, current_batch_rows)); - block->set_columns(std::move(mblock.mutable_columns())); _row_idx += current_batch_rows; *eos = _row_idx == _total_rows; diff --git a/be/src/load/memtable/memtable.cpp b/be/src/load/memtable/memtable.cpp index 3bdcaa1ef961d7..9f5970a70b1568 100644 --- a/be/src/load/memtable/memtable.cpp +++ b/be/src/load/memtable/memtable.cpp @@ -201,9 +201,10 @@ Status MemTable::insert(const Block* input_block, const DorisVector& r if (_is_first_insertion) { _is_first_insertion = false; auto clone_block = input_block->clone_without_columns(&_column_offset); - _input_mutable_block = MutableBlock::build_mutable_block(&clone_block); + _input_mutable_block = MutableBlock::build_mutable_block(std::move(clone_block)); _vec_row_comparator->set_block(&_input_mutable_block); - _output_mutable_block = MutableBlock::build_mutable_block(&clone_block); + clone_block = input_block->clone_without_columns(&_column_offset); + _output_mutable_block = MutableBlock::build_mutable_block(std::move(clone_block)); if 
(_tablet_schema->has_sequence_col()) { if (_partial_update_mode == UniqueKeyUpdateModePB::UPDATE_FIXED_COLUMNS) { // for unique key fixed partial update, sequence column index in block @@ -390,9 +391,9 @@ Status MemTable::_sort_by_cluster_keys() { _stat.sort_times++; // sort all rows Block in_block = _output_mutable_block.to_block(); - MutableBlock mutable_block = MutableBlock::build_mutable_block(&in_block); auto clone_block = in_block.clone_without_columns(); - _output_mutable_block = MutableBlock::build_mutable_block(&clone_block); + MutableBlock mutable_block = MutableBlock::build_mutable_block(std::move(in_block)); + _output_mutable_block = MutableBlock::build_mutable_block(std::move(clone_block)); DorisVector> row_in_blocks; row_in_blocks.reserve(mutable_block.rows()); @@ -524,7 +525,8 @@ void MemTable::_aggregate() { SCOPED_RAW_TIMER(&_stat.agg_ns); _stat.agg_times++; Block in_block = _input_mutable_block.to_block(); - MutableBlock mutable_block = MutableBlock::build_mutable_block(&in_block); + std::unique_ptr empty_input_block = in_block.create_same_struct_block(0); + MutableBlock mutable_block = MutableBlock::build_mutable_block(std::move(in_block)); _vec_row_comparator->set_block(&mutable_block); DorisVector> temp_row_in_blocks; temp_row_in_blocks.reserve(_last_sorted_pos); @@ -581,8 +583,7 @@ void MemTable::_aggregate() { // if is not final, we collect the agg results to input_block and then continue to insert _input_mutable_block.swap(_output_mutable_block); //TODO(weixang):opt here. 
- std::unique_ptr empty_input_block = in_block.create_same_struct_block(0); - _output_mutable_block = MutableBlock::build_mutable_block(empty_input_block.get()); + _output_mutable_block = MutableBlock::build_mutable_block(std::move(*empty_input_block)); _output_mutable_block.clear_column_data(); *_row_in_blocks = temp_row_in_blocks; _last_sorted_pos = _row_in_blocks->size(); diff --git a/be/src/runtime/query_cache/query_cache.cpp b/be/src/runtime/query_cache/query_cache.cpp index 06817adf1544ce..28610d44808686 100644 --- a/be/src/runtime/query_cache/query_cache.cpp +++ b/be/src/runtime/query_cache/query_cache.cpp @@ -45,10 +45,10 @@ void QueryCache::insert(const CacheKey& key, int64_t version, CacheResult& res, CacheResult cache_result; for (auto& block_data : res) { cache_result.emplace_back(Block::create_unique())->swap(block_data->clone_empty()); - MutableBlock mutable_block(cache_result.back().get()); + ScopedMutableBlock scoped_mutable_block(cache_result.back().get()); + auto& mutable_block = scoped_mutable_block.mutable_block(); auto st = mutable_block.merge(*block_data); DORIS_CHECK(st.ok()); - cache_result.back()->set_columns(std::move(mutable_block.mutable_columns())); } auto cache_value_ptr = std::make_unique(version, std::move(cache_result), slot_orders); diff --git a/be/src/runtime/result_block_buffer.cpp b/be/src/runtime/result_block_buffer.cpp index aebea97ea1ee90..d644eb03b6184d 100644 --- a/be/src/runtime/result_block_buffer.cpp +++ b/be/src/runtime/result_block_buffer.cpp @@ -214,12 +214,12 @@ Status ResultBlockBuffer::add_batch(RuntimeState* state, (batch_size + _last_batch_bytes) <= config::thrift_max_message_size) { if constexpr (std::is_same_v) { auto last_block = _result_batch_queue.back(); - auto mutable_columns = last_block->mutate_columns(); + auto mutable_columns_guard = last_block->mutate_columns_scoped(); + auto& mutable_columns = mutable_columns_guard.mutable_columns(); for (size_t i = 0; i < last_block->columns(); i++) { 
mutable_columns[i]->insert_range_from(*result->get_by_position(i).column, 0, num_rows); } - last_block->set_columns(std::move(mutable_columns)); } else { std::vector& back_rows = _result_batch_queue.back()->result_batch.rows; diff --git a/be/src/service/point_query_executor.cpp b/be/src/service/point_query_executor.cpp index 0d904a9107abd2..5cc80919107632 100644 --- a/be/src/service/point_query_executor.cpp +++ b/be/src/service/point_query_executor.cpp @@ -500,8 +500,8 @@ Status PointQueryExecutor::_lookup_row_data() { // 3. get values SCOPED_TIMER(&_profile_metrics.lookup_data_ns); { - MutableColumns result_columns = _result_block->mutate_columns(); - Defer restore_columns([&]() { _result_block->set_columns(std::move(result_columns)); }); + auto result_columns_guard = _result_block->mutate_columns_scoped(); + MutableColumns& result_columns = result_columns_guard.mutable_columns(); for (size_t i = 0; i < _row_read_ctxs.size(); ++i) { if (_row_read_ctxs[i]._cached_row_data.valid()) { RETURN_IF_ERROR(JsonbSerializeUtil::jsonb_to_columns( diff --git a/be/src/storage/iterator/block_reader.cpp b/be/src/storage/iterator/block_reader.cpp index 82358ca7c85899..c9af7fed63bf2d 100644 --- a/be/src/storage/iterator/block_reader.cpp +++ b/be/src/storage/iterator/block_reader.cpp @@ -167,8 +167,8 @@ Status BlockReader::_init_agg_state(const ReaderParams& read_params) { return Status::OK(); } - _stored_data_columns = - _next_row.block->create_same_struct_block(batch_max_rows())->mutate_columns(); + auto stored_block = _next_row.block->create_same_struct_block(batch_max_rows()); + _stored_data_columns = std::move(*stored_block).mutate_columns(); _stored_has_null_tag.resize(_stored_data_columns.size()); _stored_has_variable_length_tag.resize(_stored_data_columns.size()); @@ -344,7 +344,8 @@ Status BlockReader::_replace_key_next_block(Block* block, bool* eof) { } auto target_block_row = 0; - auto target_columns = block->mutate_columns(); + auto target_columns_guard = 
block->mutate_columns_scoped(); + auto& target_columns = target_columns_guard.mutable_columns(); // currently seq mapping only support mor table // so this will not be executed for the time being if (UNLIKELY(_reader_context.record_rowids)) { @@ -400,7 +401,6 @@ Status BlockReader::_replace_key_next_block(Block* block, bool* eof) { } } _merged_rows += merged_row; - block->set_columns(std::move(target_columns)); return Status::OK(); } @@ -477,7 +477,8 @@ Status BlockReader::_agg_key_next_block(Block* block, bool* eof) { auto target_block_row = 0; auto merged_row = 0; - auto target_columns = block->mutate_columns(); + auto target_columns_guard = block->mutate_columns_scoped(); + auto& target_columns = target_columns_guard.mutable_columns(); RETURN_IF_ERROR(_insert_data_normal(target_columns)); target_block_row++; _append_agg_data(target_columns); @@ -521,7 +522,6 @@ Status BlockReader::_agg_key_next_block(Block* block, bool* eof) { _agg_data_counters.push_back(_last_agg_data_counter); _last_agg_data_counter = 0; _update_agg_data(target_columns); - block->set_columns(std::move(target_columns)); _merged_rows += merged_row; return Status::OK(); @@ -534,7 +534,8 @@ Status BlockReader::_unique_key_next_block(Block* block, bool* eof) { } auto target_block_row = 0; - auto target_columns = block->mutate_columns(); + auto target_columns_guard = block->mutate_columns_scoped(); + auto& target_columns = target_columns_guard.mutable_columns(); if (UNLIKELY(_reader_context.record_rowids)) { _block_row_locations.resize(batch_max_rows()); } @@ -581,7 +582,7 @@ Status BlockReader::_unique_key_next_block(Block* block, bool* eof) { LOG(WARNING) << "tablet_id: " << tablet()->tablet_id() << " delete sign idx " << delete_sign_idx << " not invalid, skip filter delete in base compaction"; - block->set_columns(std::move(target_columns)); + target_columns_guard.restore(); return Status::OK(); } auto delete_filter_column = IColumn::mutate(std::move(_delete_filter_column)); @@ -609,15 +610,13 
@@ Status BlockReader::_unique_key_next_block(Block* block, bool* eof) { ColumnWithTypeAndName column_with_type_and_name {_delete_filter_column, std::make_shared(), "__DORIS_COMPACTION_FILTER__"}; - block->set_columns(std::move(target_columns)); + target_columns_guard.restore(); block->insert(column_with_type_and_name); RETURN_IF_ERROR(Block::filter_block(block, target_columns_size, target_columns_size)); _stats.rows_del_filtered += target_block_row - block->rows(); if (UNLIKELY(_reader_context.record_rowids)) { DCHECK_EQ(_block_row_locations.size(), block->rows() + delete_count); } - } else { - block->set_columns(std::move(target_columns)); } return Status::OK(); } diff --git a/be/src/storage/iterator/vcollect_iterator.cpp b/be/src/storage/iterator/vcollect_iterator.cpp index b26e5c5047a5fd..1c8954168310a0 100644 --- a/be/src/storage/iterator/vcollect_iterator.cpp +++ b/be/src/storage/iterator/vcollect_iterator.cpp @@ -292,7 +292,7 @@ Status VCollectIterator::_topn_next(Block* block) { } } } - MutableBlock mutable_block = MutableBlock::build_mutable_block(&clone_block); + MutableBlock mutable_block = MutableBlock::build_mutable_block(std::move(clone_block)); const std::vector* sort_columns = _reader->_reader_context.read_orderby_key_columns; for (auto column_idx : *sort_columns) { @@ -413,7 +413,7 @@ Status VCollectIterator::_topn_next(Block* block) { << mutable_block.rows() << " rows"; Block tmp_block = mutable_block.to_block(); clone_block = tmp_block.clone_empty(); - mutable_block = MutableBlock::build_mutable_block(&clone_block); + mutable_block = MutableBlock::build_mutable_block(std::move(clone_block)); for (auto it = sorted_row_pos.begin(); it != sorted_row_pos.end(); it++) { mutable_block.add_row(&tmp_block, cast_set(*it)); } @@ -843,8 +843,6 @@ bool VCollectIterator::Level1Iterator::collected_enough_rows(const MutableColumn Status VCollectIterator::Level1Iterator::_merge_next(Block* block) { 
SCOPED_RAW_TIMER(&_reader->_stats.collect_iterator_merge_next_timer); int target_block_row = 0; - auto target_columns = block->mutate_columns(); - size_t column_count = target_columns.size(); IteratorRowRef cur_row = _ref; IteratorRowRef pre_row_ref = _ref; @@ -852,6 +850,9 @@ Status VCollectIterator::Level1Iterator::_merge_next(Block* block) { for (size_t i = block->columns(); i < cur_row.block->columns(); ++i) { block->insert(cur_row.block->get_by_position(i).clone_empty()); } + auto target_columns_guard = block->mutate_columns_scoped(); + auto& target_columns = target_columns_guard.mutable_columns(); + size_t column_count = target_columns.size(); auto batch_size = _reader->batch_max_rows(); if (UNLIKELY(_reader->_reader_context.record_rowids)) { @@ -883,7 +884,6 @@ Status VCollectIterator::Level1Iterator::_merge_next(Block* block) { if (UNLIKELY(_reader->_reader_context.record_rowids)) { _block_row_locations.resize(target_block_row); } - block->set_columns(std::move(target_columns)); return res; } @@ -900,7 +900,6 @@ Status VCollectIterator::Level1Iterator::_merge_next(Block* block) { continuous_row_in_block); } } - block->set_columns(std::move(target_columns)); return Status::OK(); } if (continuous_row_in_block == 0) { @@ -932,7 +931,6 @@ Status VCollectIterator::Level1Iterator::_merge_next(Block* block) { if (UNLIKELY(_reader->_reader_context.record_rowids)) { _block_row_locations.resize(target_block_row); } - block->set_columns(std::move(target_columns)); return Status::OK(); } } while (true); diff --git a/be/src/storage/iterator/vertical_block_reader.cpp b/be/src/storage/iterator/vertical_block_reader.cpp index 335584997f0f92..13a8c0fdecb0f8 100644 --- a/be/src/storage/iterator/vertical_block_reader.cpp +++ b/be/src/storage/iterator/vertical_block_reader.cpp @@ -186,8 +186,8 @@ void VerticalBlockReader::_init_agg_state(const ReaderParams& read_params) { return; } DCHECK(_return_columns.size() == _next_row.block->columns()); - _stored_data_columns = - 
_next_row.block->create_same_struct_block(_reader_context.batch_size)->mutate_columns(); + auto stored_block = _next_row.block->create_same_struct_block(_reader_context.batch_size); + _stored_data_columns = std::move(*stored_block).mutate_columns(); _stored_has_null_tag.resize(_stored_data_columns.size()); _stored_has_variable_length_tag.resize(_stored_data_columns.size()); @@ -398,7 +398,8 @@ Status VerticalBlockReader::_agg_key_next_block(Block* block, bool* eof) { return Status::OK(); } int target_block_row = 0; - auto target_columns = block->mutate_columns(); + auto target_columns_guard = block->mutate_columns_scoped(); + auto& target_columns = target_columns_guard.mutable_columns(); // copy first row get from collect_iter in init _append_agg_data(target_columns); @@ -413,7 +414,6 @@ Status VerticalBlockReader::_agg_key_next_block(Block* block, bool* eof) { break; } LOG(WARNING) << "next failed: " << res; - block->set_columns(std::move(target_columns)); return res; } DCHECK(_next_row.block->columns() == block->columns()); @@ -431,8 +431,6 @@ Status VerticalBlockReader::_agg_key_next_block(Block* block, bool* eof) { _agg_data_counters.push_back(_last_agg_data_counter); _last_agg_data_counter = 0; _update_agg_data(target_columns); - block->set_columns(std::move(target_columns)); - return Status::OK(); } @@ -484,7 +482,8 @@ Status VerticalBlockReader::_unique_key_next_block(Block* block, bool* eof) { // delete sign column must store in last column of the block int delete_sign_idx = block->columns() - 1; DCHECK(delete_sign_idx > 0); - auto target_columns = block->mutate_columns(); + auto target_columns_guard = block->mutate_columns_scoped(); + auto& target_columns = target_columns_guard.mutable_columns(); auto delete_filter_column = IColumn::mutate(std::move(_delete_filter_column)); auto* delete_filter_data_column = reinterpret_cast(delete_filter_column.get()); @@ -520,7 +519,7 @@ Status VerticalBlockReader::_unique_key_next_block(Block* block, bool* eof) { } const 
auto column_to_keep = target_columns.size(); - block->set_columns(std::move(target_columns)); + target_columns_guard.restore(); _delete_filter_column = std::move(delete_filter_column); ColumnWithTypeAndName column_with_type_and_name {_delete_filter_column, std::make_shared(), @@ -552,7 +551,8 @@ Status VerticalBlockReader::_unique_key_next_block(Block* block, bool* eof) { } // Value column processing - use batch optimization - auto target_columns = block->mutate_columns(); + auto target_columns_guard = block->mutate_columns_scoped(); + auto& target_columns = target_columns_guard.mutable_columns(); const size_t column_count = block->columns(); // Try to use batch optimization for value column compaction @@ -566,7 +566,6 @@ Status VerticalBlockReader::_unique_key_next_block(Block* block, bool* eof) { RETURN_IF_ERROR(mask_iter->unique_key_next_batch(&batches, _reader_context.batch_size, &actual_rows)); if (actual_rows == 0) { - block->set_columns(std::move(target_columns)); *eof = true; _eof = true; return Status::OK(); @@ -594,7 +593,6 @@ Status VerticalBlockReader::_unique_key_next_block(Block* block, bool* eof) { dst_offset += batch.count; } - block->set_columns(std::move(target_columns)); return Status::OK(); } } @@ -610,7 +608,6 @@ Status VerticalBlockReader::_unique_key_next_block(Block* block, bool* eof) { break; } LOG(WARNING) << "next failed: " << res; - block->set_columns(std::move(target_columns)); return res; } const auto& src_block = _next_row.block; @@ -623,7 +620,6 @@ Status VerticalBlockReader::_unique_key_next_block(Block* block, bool* eof) { }); ++target_block_row; } while (target_block_row < _reader_context.batch_size); - block->set_columns(std::move(target_columns)); return Status::OK(); } diff --git a/be/src/storage/iterator/vgeneric_iterators.cpp b/be/src/storage/iterator/vgeneric_iterators.cpp index 8d3b1bb50ee2f3..17729857a324bd 100644 --- a/be/src/storage/iterator/vgeneric_iterators.cpp +++ b/be/src/storage/iterator/vgeneric_iterators.cpp @@ 
-63,7 +63,8 @@ Status VStatisticsIterator::next_batch(Block* block) { DCHECK(block->columns() == _column_iterators.size()); if (_output_rows < _target_rows) { block->clear_column_data(); - auto columns = block->mutate_columns(); + auto columns_guard = block->mutate_columns_scoped(); + auto& columns = columns_guard.mutable_columns(); size_t size = _push_down_agg_type_opt == TPushAggOp::MINMAX ? 2 @@ -86,7 +87,6 @@ Status VStatisticsIterator::next_batch(Block* block) { } } } - block->set_columns(std::move(columns)); _output_rows += size; return Status::OK(); } diff --git a/be/src/storage/partial_update_info.cpp b/be/src/storage/partial_update_info.cpp index 3342899de71b6f..a9861aa6ce9171 100644 --- a/be/src/storage/partial_update_info.cpp +++ b/be/src/storage/partial_update_info.cpp @@ -20,6 +20,7 @@ #include #include +#include #include "common/consts.h" #include "common/logging.h" @@ -338,9 +339,11 @@ Status FixedReadPlan::read_columns_by_plan( } } bool has_row_column = tablet_schema.has_row_store_for_all_columns(); - MutableColumns mutable_columns; + std::optional mutable_columns_guard; + MutableColumns* mutable_columns = nullptr; if (!has_row_column) { - mutable_columns = block.mutate_columns(); + mutable_columns_guard.emplace(block); + mutable_columns = &mutable_columns_guard->mutable_columns(); } uint32_t read_idx = 0; for (const auto& [rowset_id, segment_row_mappings] : plan) { @@ -364,10 +367,11 @@ Status FixedReadPlan::read_columns_by_plan( } continue; } - for (size_t cid = 0; cid < mutable_columns.size(); ++cid) { + for (size_t cid = 0; cid < mutable_columns->size(); ++cid) { TabletColumn tablet_column = tablet_schema.column(cids_to_read[cid]); - auto st = doris::BaseTablet::fetch_value_by_rowids( - rowset_iter->second, segment_id, rids, tablet_column, mutable_columns[cid]); + auto st = doris::BaseTablet::fetch_value_by_rowids(rowset_iter->second, segment_id, + rids, tablet_column, + (*mutable_columns)[cid]); // set read value to output block if (!st.ok()) { 
LOG(WARNING) << "failed to fetch value"; @@ -376,9 +380,6 @@ Status FixedReadPlan::read_columns_by_plan( } } } - if (!has_row_column) { - block.set_columns(std::move(mutable_columns)); - } return Status::OK(); } @@ -388,7 +389,8 @@ Status FixedReadPlan::fill_missing_columns( const TabletSchema& tablet_schema, Block& full_block, const std::vector& use_default_or_null_flag, bool has_default_or_nullable, uint32_t segment_start_pos, const Block* block) const { - auto mutable_full_columns = full_block.mutate_columns(); + auto mutable_full_columns_guard = full_block.mutate_columns_scoped(); + auto& mutable_full_columns = mutable_full_columns_guard.mutable_columns(); // create old value columns DCHECK(historical_context.partial_update_info != nullptr); DCHECK(historical_context.tablet_schema != nullptr); @@ -420,7 +422,8 @@ Status FixedReadPlan::fill_missing_columns( RETURN_IF_ERROR(BaseTablet::generate_default_value_block(tablet_schema, missing_cids, partial_update_info.default_values, old_value_block, default_value_block)); - auto mutable_default_value_columns = default_value_block.mutate_columns(); + auto mutable_default_value_columns_guard = default_value_block.mutate_columns_scoped(); + auto& mutable_default_value_columns = mutable_default_value_columns_guard.mutable_columns(); // fill all missing value from mutable_old_columns, need to consider default value and null value for (auto idx = 0; idx < use_default_or_null_flag.size(); idx++) { @@ -478,7 +481,6 @@ Status FixedReadPlan::fill_missing_columns( } } } - full_block.set_columns(std::move(mutable_full_columns)); return Status::OK(); } @@ -499,7 +501,8 @@ Status FlexibleReadPlan::read_columns_by_plan( const TabletSchema& tablet_schema, const std::map& rsid_to_rowset, Block& old_value_block, std::map>* read_index) const { - auto mutable_columns = old_value_block.mutate_columns(); + auto mutable_columns_guard = old_value_block.mutate_columns_scoped(); + auto& mutable_columns = 
mutable_columns_guard.mutable_columns(); // cid -> next rid to fill in block std::map next_read_idx; @@ -530,7 +533,6 @@ Status FlexibleReadPlan::read_columns_by_plan( } } // !!!ATTENTION!!!: columns in block may have different size because every row has different columns to update - old_value_block.set_columns(std::move(mutable_columns)); return Status::OK(); } @@ -568,7 +570,8 @@ Status FlexibleReadPlan::fill_non_primary_key_columns( const std::vector& use_default_or_null_flag, bool has_default_or_nullable, uint32_t segment_start_pos, uint32_t block_start_pos, const Block* block, std::vector* skip_bitmaps) const { - auto mutable_full_columns = full_block.mutate_columns(); + auto mutable_full_columns_guard = full_block.mutate_columns_scoped(); + auto& mutable_full_columns = mutable_full_columns_guard.mutable_columns(); DCHECK(historical_context.partial_update_info != nullptr); // missing_cids are all non sort key columns' cids @@ -587,7 +590,6 @@ Status FlexibleReadPlan::fill_non_primary_key_columns( old_value_block, mutable_full_columns, use_default_or_null_flag, has_default_or_nullable, segment_start_pos, block_start_pos, block, skip_bitmaps)); } - full_block.set_columns(std::move(mutable_full_columns)); return Status::OK(); } @@ -974,7 +976,7 @@ Status BlockAggregator::aggregate_for_sequence_column( const auto* delete_signs = BaseTablet::get_delete_sign_column_data(*block, num_rows); auto filtered_block = _tablet_schema.create_block(); - MutableBlock output_block = MutableBlock::build_mutable_block(&filtered_block); + MutableBlock output_block = MutableBlock::build_mutable_block(std::move(filtered_block)); int same_key_rows {0}; std::string previous_key {}; diff --git a/be/src/storage/tablet/base_tablet.cpp b/be/src/storage/tablet/base_tablet.cpp index 98166a20ee071b..b9beda9c052a38 100644 --- a/be/src/storage/tablet/base_tablet.cpp +++ b/be/src/storage/tablet/base_tablet.cpp @@ -876,8 +876,10 @@ Status BaseTablet::calc_segment_delete_bitmap(RowsetSharedPtr 
rowset, } Status BaseTablet::sort_block(Block& in_block, Block& output_block) { - MutableBlock mutable_input_block = MutableBlock::build_mutable_block(&in_block); - MutableBlock mutable_output_block = MutableBlock::build_mutable_block(&output_block); + ScopedMutableBlock scoped_input_block(&in_block); + auto& mutable_input_block = scoped_input_block.mutable_block(); + ScopedMutableBlock scoped_output_block(&output_block); + auto& mutable_output_block = scoped_output_block.mutable_block(); std::shared_ptr vec_row_comparator = std::make_shared(_tablet_meta->tablet_schema()); @@ -903,10 +905,9 @@ Status BaseTablet::sort_block(Block& in_block, Block& output_block) { for (auto& block : row_in_blocks) { row_pos_vec.emplace_back(block->_row_pos); } - in_block.set_columns(std::move(mutable_input_block.mutable_columns())); + scoped_input_block.restore(); RETURN_IF_ERROR(mutable_output_block.add_rows(&in_block, row_pos_vec.data(), row_pos_vec.data() + input_rows)); - output_block.set_columns(std::move(mutable_output_block.mutable_columns())); return Status::OK(); } @@ -992,7 +993,8 @@ Status BaseTablet::generate_default_value_block(const TabletSchema& schema, const std::vector& default_values, const Block& ref_block, Block& default_value_block) { - auto mutable_default_value_columns = default_value_block.mutate_columns(); + auto mutable_default_value_columns_guard = default_value_block.mutate_columns_scoped(); + auto& mutable_default_value_columns = mutable_default_value_columns_guard.mutable_columns(); for (auto i = 0; i < cids.size(); ++i) { const auto& column = schema.column(cids[i]); if (column.has_default_value()) { @@ -1002,7 +1004,6 @@ Status BaseTablet::generate_default_value_block(const TabletSchema& schema, str, *mutable_default_value_columns[i])); } } - default_value_block.set_columns(std::move(mutable_default_value_columns)); return Status::OK(); } @@ -1016,7 +1017,8 @@ Status BaseTablet::generate_new_block_for_partial_update( // 3. 
write a new segment and modify rowset meta // 4. mark current keys deleted CHECK(output_block); - auto full_mutable_columns = output_block->mutate_columns(); + auto full_mutable_columns_guard = output_block->mutate_columns_scoped(); + auto& full_mutable_columns = full_mutable_columns_guard.mutable_columns(); const auto& missing_cids = partial_update_info->missing_cids; const auto& update_cids = partial_update_info->update_cids; auto old_block = rowset_schema->create_block_by_cids(missing_cids); @@ -1119,7 +1121,7 @@ Status BaseTablet::generate_new_block_for_partial_update( } } } - output_block->set_columns(std::move(full_mutable_columns)); + full_mutable_columns_guard.restore(); VLOG_DEBUG << "full block when publish: " << output_block->dump_data(); return Status::OK(); } @@ -1224,7 +1226,8 @@ Status BaseTablet::generate_new_block_for_flexible_partial_update( old_block, default_value_block)); // 4. build the final block - auto full_mutable_columns = output_block->mutate_columns(); + auto full_mutable_columns_guard = output_block->mutate_columns_scoped(); + auto& full_mutable_columns = full_mutable_columns_guard.mutable_columns(); DCHECK(rowset_schema->has_skip_bitmap_col()); auto skip_bitmap_col_idx = rowset_schema->skip_bitmap_col_idx(); const std::vector* skip_bitmaps = @@ -1277,7 +1280,7 @@ Status BaseTablet::generate_new_block_for_flexible_partial_update( DCHECK_EQ(full_mutable_columns[cid]->size(), update_rows); } - output_block->set_columns(std::move(full_mutable_columns)); + full_mutable_columns_guard.restore(); VLOG_DEBUG << "full block when publish: " << output_block->dump_data(); return Status::OK(); } diff --git a/be/src/util/jsonb/serialize.cpp b/be/src/util/jsonb/serialize.cpp index 6de5c155077c4d..6ff4a076f89025 100644 --- a/be/src/util/jsonb/serialize.cpp +++ b/be/src/util/jsonb/serialize.cpp @@ -80,8 +80,8 @@ Status JsonbSerializeUtil::jsonb_to_block( const std::unordered_map& col_id_to_idx, Block& dst, const std::vector& default_values, const 
std::unordered_set& include_cids) { - MutableColumns dst_columns = dst.mutate_columns(); - Defer restore_columns([&]() { dst.set_columns(std::move(dst_columns)); }); + auto dst_columns_guard = dst.mutate_columns_scoped(); + MutableColumns& dst_columns = dst_columns_guard.mutable_columns(); for (int i = 0; i < jsonb_column.size(); ++i) { StringRef jsonb_data = jsonb_column.get_data_at(i); RETURN_IF_ERROR(jsonb_to_columns(serdes, jsonb_data.data, jsonb_data.size, col_id_to_idx, @@ -156,8 +156,8 @@ Status JsonbSerializeUtil::jsonb_to_block( const std::unordered_map& col_id_to_idx, Block& dst, const std::vector& default_values, const std::unordered_set& include_cids) { - MutableColumns dst_columns = dst.mutate_columns(); - Defer restore_columns([&]() { dst.set_columns(std::move(dst_columns)); }); + auto dst_columns_guard = dst.mutate_columns_scoped(); + MutableColumns& dst_columns = dst_columns_guard.mutable_columns(); return jsonb_to_columns(serdes, data, size, col_id_to_idx, dst_columns, default_values, include_cids); } diff --git a/be/test/core/block/block_test.cpp b/be/test/core/block/block_test.cpp index 26f8dc91ec3d9f..6d9389f26c9947 100644 --- a/be/test/core/block/block_test.cpp +++ b/be/test/core/block/block_test.cpp @@ -895,11 +895,11 @@ TEST(BlockTest, merge_with_shared_columns) { Block temp_block({test_k1_temp, test_v1_temp, test_v2_temp}); - MutableBlock mutable_block(&src_block); + ScopedMutableBlock scoped_mutable_block(&src_block); + auto& mutable_block = scoped_mutable_block.mutable_block(); auto status = mutable_block.merge(temp_block); ASSERT_TRUE(status.ok()); - - src_block.set_columns(std::move(mutable_block.mutable_columns())); + scoped_mutable_block.restore(); for (auto& column : src_block.get_columns()) { EXPECT_EQ(1034, column->size()); @@ -1023,7 +1023,7 @@ TEST(BlockTest, merge_impl_ignore_overflow) { block.insert(ColumnHelper::create_column_with_name({})); auto block2 = ColumnHelper::create_block({}); - auto mutable_block = 
MutableBlock::build_mutable_block(&block); + auto mutable_block = MutableBlock::build_mutable_block(std::move(block)); auto st = mutable_block.merge_ignore_overflow(std::move(block2)); ASSERT_FALSE(st.ok()); @@ -1274,7 +1274,8 @@ TEST(BlockTest, others) { ASSERT_EQ(block.get_by_position(0).type->get_primitive_type(), TYPE_INT); ASSERT_EQ(block.columns(), 1); - MutableBlock mutable_block(&block); + ScopedMutableBlock scoped_mutable_block(&block); + auto& mutable_block = scoped_mutable_block.mutable_block(); auto dumped = mutable_block.dump_data(); ASSERT_GT(dumped.size(), 0) << "Dumped data size: " << dumped.size(); auto dumped_json = mutable_block.dump_data_json(); @@ -1316,4 +1317,97 @@ TEST(BlockTest, ClearSelectedColumnDataClonesSharedColumn) { EXPECT_EQ(block.get_by_position(1).column.get(), old_col1.get()); } +TEST(BlockTest, ScopedMutableColumnsRestoreOnErrorAndDetachSharedColumn) { + auto type = std::make_shared(); + auto mutable_col = ColumnInt32::create(); + mutable_col->insert_value(1); + mutable_col->insert_value(2); + ColumnPtr old_col = mutable_col->get_ptr(); + + Block block; + block.insert({std::move(mutable_col), type, "c0"}); + + auto status = [&]() -> Status { + auto columns_guard = block.mutate_columns_scoped(); + columns_guard.mutable_columns()[0]->insert(Field::create_field(3)); + return Status::InternalError("force early return"); + }(); + + EXPECT_FALSE(status.ok()); + EXPECT_EQ(block.rows(), 3); + EXPECT_EQ(old_col->size(), 2); + EXPECT_NE(block.get_by_position(0).column.get(), old_col.get()); +} + +TEST(BlockTest, ScopedMutableColumnsReadSchemaFromLiveBlock) { + auto type = std::make_shared(); + auto mutable_col = ColumnInt32::create(); + mutable_col->insert_value(1); + + Block block; + block.insert({std::move(mutable_col), type, "c0"}); + + auto columns_guard = block.mutate_columns_scoped(); + EXPECT_EQ(block.get_by_position(0).column.get(), nullptr); + EXPECT_EQ(&columns_guard.get_datatype_by_position(0), &block.get_by_position(0).type); 
+ EXPECT_EQ(&columns_guard.get_name_by_position(0), &block.get_by_position(0).name); + EXPECT_EQ(columns_guard.get_datatype_by_position(0).get(), type.get()); + EXPECT_EQ(columns_guard.get_name_by_position(0), "c0"); +} + +TEST(BlockTest, ScopedMutableColumnRestoreOnErrorDetachSharedAndCreateMissingColumn) { + auto type = std::make_shared(); + auto mutable_col = ColumnInt32::create(); + mutable_col->insert_value(1); + mutable_col->insert_value(2); + ColumnPtr old_col = mutable_col->get_ptr(); + + Block block; + block.insert({std::move(mutable_col), type, "c0"}); + block.insert({nullptr, type, "empty"}); + + auto status = [&]() -> Status { + auto column_guard = block.mutate_column_scoped(0); + EXPECT_EQ(block.get_by_position(0).column.get(), nullptr); + column_guard.mutable_column()->insert(Field::create_field(3)); + return Status::InternalError("force early return"); + }(); + + EXPECT_FALSE(status.ok()); + EXPECT_EQ(block.get_by_position(0).column->size(), 3); + EXPECT_EQ(old_col->size(), 2); + EXPECT_NE(block.get_by_position(0).column.get(), old_col.get()); + + { + auto column_guard = block.mutate_column_scoped(1); + EXPECT_EQ(block.get_by_position(1).column.get(), nullptr); + column_guard.mutable_column()->insert(Field::create_field(10)); + } + + ASSERT_NE(block.get_by_position(1).column.get(), nullptr); + EXPECT_EQ(block.get_by_position(1).column->size(), 1); +} + +TEST(BlockTest, ScopedMutableBlockRestoreOnErrorAndDetachSharedColumn) { + auto type = std::make_shared(); + auto mutable_col = ColumnInt32::create(); + mutable_col->insert_value(1); + mutable_col->insert_value(2); + ColumnPtr old_col = mutable_col->get_ptr(); + + Block block; + block.insert({std::move(mutable_col), type, "c0"}); + + auto status = [&]() -> Status { + ScopedMutableBlock scoped_block(&block); + scoped_block.mutable_columns()[0]->insert(Field::create_field(3)); + return Status::InternalError("force early return"); + }(); + + EXPECT_FALSE(status.ok()); + EXPECT_EQ(block.rows(), 3); + 
EXPECT_EQ(old_col->size(), 2); + EXPECT_NE(block.get_by_position(0).column.get(), old_col.get()); +} + } // namespace doris diff --git a/be/test/core/column/column_nullable_test.cpp b/be/test/core/column/column_nullable_test.cpp index 64ca053c50362f..799cc6a9826059 100644 --- a/be/test/core/column/column_nullable_test.cpp +++ b/be/test/core/column/column_nullable_test.cpp @@ -49,7 +49,7 @@ TEST(ColumnNullableTest, NullTest) { dst_col->clear(); EXPECT_FALSE(dst_col->has_null()); dst_col->insert_range_from( - *ColumnNullable::create(std::move(source_col), ColumnUInt8::create(10, 0)), 5, 5); + *ColumnNullable::create(std::move(source_col), ColumnUInt8::create(100, 0)), 5, 5); EXPECT_FALSE(dst_col->has_null()); dst_col->clear(); @@ -81,6 +81,15 @@ TEST(ColumnNullableTest, NullTest) { EXPECT_TRUE(dst_col->has_null()); } +TEST(ColumnNullableTest, CreateRejectsMismatchedNestedAndNullMapSizes) { + EXPECT_THROW( + { + auto nullable = ColumnNullable::create(create_nested_column(100), + ColumnUInt8::create(10, 0)); + }, + doris::Exception); +} + TEST(ColumnNullableTest, PredicateTest) { auto nullable_pred = ColumnNullable::create(PredicateColumnType::create(), ColumnUInt8::create()); diff --git a/be/test/core/column/common_column_test.h b/be/test/core/column/common_column_test.h index fe0ecf051d0140..4a283670daf029 100644 --- a/be/test/core/column/common_column_test.h +++ b/be/test/core/column/common_column_test.h @@ -638,7 +638,7 @@ class CommonColumnTest : public ::testing::Test { columnTypeAndName.type = types[i]; block.insert(columnTypeAndName); } - MutableBlock mb = MutableBlock::build_mutable_block(&block); + MutableBlock mb = MutableBlock::build_mutable_block(std::move(block)); // Rebuild block from load_cols after build_mutable_block stole the column pointers for (size_t i = 0; i < load_cols.size(); ++i) { block.get_by_position(i).column = load_cols[i]->get_ptr(); @@ -653,7 +653,7 @@ class CommonColumnTest : public ::testing::Test { 
assert_block.insert(columnTypeAndName); empty_block.insert(columnTypeAndName); } - MutableBlock assert_mb = MutableBlock::build_mutable_block(&empty_block); + MutableBlock assert_mb = MutableBlock::build_mutable_block(std::move(empty_block)); // step3. to insert data from load_cols to assert_cols Status st = mb.merge_impl_ignore_overflow(assert_block); EXPECT_TRUE(st.ok()) << "Failed to merge block: " << st.to_string(); @@ -3950,4 +3950,4 @@ auto assert_byte_size_with_file_callback = [](const MutableColumns& load_cols, test_func(false); }; -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/test/core/data_type/complex_type_test.cpp b/be/test/core/data_type/complex_type_test.cpp index 50e89ce4daf208..ff9f66c36128d3 100644 --- a/be/test/core/data_type/complex_type_test.cpp +++ b/be/test/core/data_type/complex_type_test.cpp @@ -109,12 +109,14 @@ TEST(ComplexTypeTest, DeserializeArrayWritesBackSharedNestedColumn) { auto buf = serialize_column(array_type, src_column->get_ptr()); ColumnPtr shared_nested_column = ColumnInt32::create(); - MutableColumnPtr dst_column = ColumnArray::create(shared_nested_column); + ColumnPtr shared_offsets_column = ColumnArray::ColumnOffsets::create(); + MutableColumnPtr dst_column = ColumnArray::create(shared_nested_column, shared_offsets_column); deserialize_column(array_type, buf, &dst_column); const auto& array_column = assert_cast(*dst_column); EXPECT_EQ(2, array_column.size()); EXPECT_EQ(0, shared_nested_column->size()); + EXPECT_EQ(0, shared_offsets_column->size()); EXPECT_EQ(3, array_column.get_data().size()); EXPECT_EQ(2, array_column.get_offsets()[0]); EXPECT_EQ(3, array_column.get_offsets()[1]); @@ -150,6 +152,7 @@ TEST(ComplexTypeTest, DeserializeMapWritesBackSharedKeyAndValueColumns) { EXPECT_EQ(1, map_column.size()); EXPECT_EQ(0, shared_keys_column->size()); EXPECT_EQ(0, shared_values_column->size()); + EXPECT_EQ(0, offsets_column->size()); EXPECT_EQ(2, map_column.get_keys().size()); 
EXPECT_EQ(2, map_column.get_values().size()); diff --git a/be/test/exec/common/schema_util_rowset_test.cpp b/be/test/exec/common/schema_util_rowset_test.cpp index 18bc77ccb3f883..cdfb84431d395a 100644 --- a/be/test/exec/common/schema_util_rowset_test.cpp +++ b/be/test/exec/common/schema_util_rowset_test.cpp @@ -148,7 +148,7 @@ static void fill_varaint_column(auto& variant_column, int size, int uid) { } static void fill_block_with_test_data(Block* block, int size) { - auto columns = block->mutate_columns(); + auto columns = std::move(*block).mutate_columns(); // insert key for (int i = 0; i < size; i++) { auto field = Field::create_field(i); diff --git a/be/test/exec/exchange/exchange_writer_test.cpp b/be/test/exec/exchange/exchange_writer_test.cpp index 28481d08eb3e73..bbc81a623777ca 100644 --- a/be/test/exec/exchange/exchange_writer_test.cpp +++ b/be/test/exec/exchange/exchange_writer_test.cpp @@ -82,7 +82,7 @@ class RowExpandingPartitioner final : public PartitionerBase { _channel_ids.assign(block->rows(), 0); - auto mutable_columns = block->mutate_columns(); + auto mutable_columns = std::move(*block).mutate_columns(); for (size_t col_idx = 0; col_idx < mutable_columns.size(); ++col_idx) { mutable_columns[col_idx]->insert_from(*mutable_columns[col_idx], 0); } diff --git a/be/test/exec/operator/table_function_operator_test.cpp b/be/test/exec/operator/table_function_operator_test.cpp index 93bef8246ae95d..b6057ede0f5e9b 100644 --- a/be/test/exec/operator/table_function_operator_test.cpp +++ b/be/test/exec/operator/table_function_operator_test.cpp @@ -1478,7 +1478,7 @@ TEST_F(UnnestTest, inner) { unnested_tag_column->insert_data((const char*)(ids.data()), 0); expected_output_block.insert(ColumnWithTypeAndName( make_nullable(std::move(unnested_tag_column)), data_type_int_nullable, "tag")); - auto mutable_columns = expected_output_block.mutate_columns(); + auto mutable_columns = std::move(expected_output_block).mutate_columns(); mutable_columns[0]->insert_from( 
*table_func_local_state->_child_block->get_by_position(0).column, 0); mutable_columns[0]->insert_from( @@ -1587,7 +1587,7 @@ TEST_F(UnnestTest, outer) { unnested_tag_column->insert_data((const char*)(ids.data()), 0); expected_output_block.insert(ColumnWithTypeAndName( make_nullable(std::move(unnested_tag_column)), data_type_int_nullable, "tag")); - auto mutable_columns = expected_output_block.mutate_columns(); + auto mutable_columns = std::move(expected_output_block).mutate_columns(); mutable_columns[0]->insert_from( *table_func_local_state->_child_block->get_by_position(0).column, 0); mutable_columns[0]->insert_from( @@ -1613,7 +1613,7 @@ TEST_F(UnnestTest, outer) { output_block.clear(); expected_output_block.clear_column_data(); - mutable_columns = expected_output_block.mutate_columns(); + mutable_columns = std::move(expected_output_block).mutate_columns(); mutable_columns[0]->insert_from( *table_func_local_state->_child_block->get_by_position(0).column, 1); mutable_columns[1]->insert_default(); diff --git a/be/test/exec/pipeline/local_exchanger_test.cpp b/be/test/exec/pipeline/local_exchanger_test.cpp index 3051625a3ee530..af02a5802411fa 100644 --- a/be/test/exec/pipeline/local_exchanger_test.cpp +++ b/be/test/exec/pipeline/local_exchanger_test.cpp @@ -21,7 +21,9 @@ #include #include "common/status.h" +#include "core/assert_cast.h" #include "core/column/column.h" +#include "core/column/column_const.h" #include "core/column/column_vector.h" #include "core/data_type/data_type.h" #include "exec/exchange/local_exchange_sink_operator.h" @@ -1090,6 +1092,13 @@ TEST_F(LocalExchangerTest, AdaptivePassthroughExchanger) { _local_states[i].get()}), Status::OK()); EXPECT_EQ(block.rows(), j == 1 ? 
0 : num_rows_per_block); + if (j == 0) { + const auto& data = + assert_cast(*block.get_by_position(0).column) + .get_data(); + EXPECT_EQ(data.front(), i); + EXPECT_EQ(data.back(), i); + } EXPECT_FALSE(eos); EXPECT_EQ(_local_states[i]->_dependency->ready(), j != 1); } @@ -1307,4 +1316,82 @@ TEST_F(LocalExchangerTest, TestShuffleExchangerWrongMap) { .is()); } } + +TEST_F(LocalExchangerTest, ShuffleExchangerRestoreOutputBlockOnAddRowsError) { + const int num_sink = 1; + const int num_sources = 1; + const int num_partitions = 1; + const int free_block_limit = 0; + std::map shuffle_idx_to_instance_idx {{0, 0}}; + + auto profile = std::make_shared(""); + auto shared_state = LocalExchangeSharedState::create_shared(num_partitions); + shared_state->exchanger = ShuffleExchanger::create_unique(num_sink, num_sources, num_partitions, + free_block_limit); + auto sink_dep = std::make_shared(0, 0, "LOCAL_EXCHANGE_SINK_DEPENDENCY", true); + sink_dep->set_shared_state(shared_state.get()); + shared_state->sink_deps.push_back(sink_dep); + shared_state->create_source_dependencies(num_sources, 0, 0, "TEST"); + + auto* exchanger = (ShuffleExchanger*)shared_state->exchanger.get(); + auto sink_local_state = std::make_unique(nullptr, nullptr); + sink_local_state->_exchanger = shared_state->exchanger.get(); + sink_local_state->_compute_hash_value_timer = ADD_TIMER(profile, "ComputeHashValueTime"); + sink_local_state->_distribute_timer = ADD_TIMER(profile, "DistributeTimer"); + sink_local_state->_partitioner = + std::make_unique>(num_partitions); + sink_local_state->_channel_id = 0; + sink_local_state->_shared_state = shared_state.get(); + sink_local_state->_dependency = sink_dep.get(); + sink_local_state->_memory_used_counter = + profile->AddHighWaterMarkCounter("SinkMemoryUsage", TUnit::BYTES, "", 1); + + auto source_local_state = + std::make_unique(_runtime_state.get(), nullptr); + source_local_state->_exchanger = shared_state->exchanger.get(); + 
source_local_state->_get_block_failed_counter = ADD_TIMER(profile, "GetBlockFailedCounter"); + source_local_state->_copy_data_timer = ADD_TIMER(profile, "CopyDataTimer"); + source_local_state->_channel_id = 0; + source_local_state->_shared_state = shared_state.get(); + source_local_state->_dependency = shared_state->get_dep_by_channel_id(0).front().get(); + source_local_state->_memory_used_counter = + profile->AddHighWaterMarkCounter("MemoryUsage", TUnit::BYTES, "", 1); + shared_state->mem_counters[0] = source_local_state->_memory_used_counter; + + DataTypePtr int_type = std::make_shared(); + Block in_block; + auto in_col = ColumnInt32::create(); + in_col->insert_many_vals(7, 2); + in_block.insert({std::move(in_col), int_type, "test_int_col0"}); + bool in_eos = false; + SinkInfo sink_info = {.channel_id = &sink_local_state->_channel_id, + .partitioner = sink_local_state->_partitioner.get(), + .local_state = sink_local_state.get(), + .shuffle_idx_to_instance_idx = &shuffle_idx_to_instance_idx, + .ins_idx = 0}; + EXPECT_EQ(exchanger->sink(_runtime_state.get(), &in_block, in_eos, + {sink_local_state->_compute_hash_value_timer, + sink_local_state->_distribute_timer, nullptr}, + sink_info), + Status::OK()); + + Block output_block; + auto const_value = ColumnInt32::create(); + const_value->insert_many_vals(42, 1); + output_block.insert( + {ColumnConst::create(const_value->get_ptr(), 1), int_type, "test_int_col0"}); + + bool eos = false; + const auto status = + exchanger->get_block(_runtime_state.get(), &output_block, &eos, + {nullptr, nullptr, source_local_state->_copy_data_timer}, + {source_local_state->_channel_id, source_local_state.get()}); + EXPECT_FALSE(status.ok()); + ASSERT_EQ(output_block.columns(), 1); + const auto& restored_column = output_block.get_by_position(0).column; + ASSERT_NE(restored_column.get(), nullptr); + EXPECT_TRUE(is_column(*restored_column)); + EXPECT_EQ(output_block.rows(), 1); + EXPECT_NO_THROW(output_block.check_number_of_rows()); +} } // 
namespace doris diff --git a/be/test/exprs/aggregate/vec_count_by_enum_test.cpp b/be/test/exprs/aggregate/vec_count_by_enum_test.cpp index b60b7ab7045da6..cbb9a5888d6e3f 100644 --- a/be/test/exprs/aggregate/vec_count_by_enum_test.cpp +++ b/be/test/exprs/aggregate/vec_count_by_enum_test.cpp @@ -32,6 +32,22 @@ namespace doris { void register_aggregate_function_count_by_enum(AggregateFunctionSimpleFactory& factory); +static ColumnPtr create_nullable_gender_column() { + auto column_f1 = ColumnString::create(); + column_f1->insert(Field::create_field("F")); + column_f1->insert(Field::create_field("F")); + column_f1->insert(Field::create_field("M")); + column_f1->insert_default(); + column_f1->insert_default(); + + auto null_map = ColumnUInt8::create(); + std::vector offs = {0, 0, 0, 1, 1}; + for (int i = 0; i < offs.size(); ++i) { + null_map->insert(Field::create_field(offs[i])); + } + return ColumnNullable::create(std::move(column_f1), std::move(null_map)); +} + class VCountByEnumTest : public testing::Test { public: AggregateFunctionPtr agg_function; @@ -129,18 +145,7 @@ TEST_F(VCountByEnumTest, testNotNullableSample) { TEST_F(VCountByEnumTest, testNullableSample) { Arena arena; const int batch_size = 5; - auto column_f1 = ColumnString::create(); - column_f1->insert(Field::create_field("F")); - column_f1->insert(Field::create_field("F")); - column_f1->insert(Field::create_field("M")); - ColumnPtr column_f1_ptr = std::move(column_f1); - auto null_map = ColumnUInt8::create(); - std::vector offs = {0, 0, 0, 1, 1}; - for (int i = 0; i < offs.size(); ++i) { - null_map->insert(Field::create_field(offs[i])); - } - - auto nullable_column_f1 = ColumnNullable::create(column_f1_ptr, std::move(null_map)); + auto nullable_column_f1 = create_nullable_gender_column(); std::unique_ptr memory(new char[agg_function->size_of_data()]); AggregateDataPtr place = memory.get(); @@ -176,18 +181,7 @@ TEST_F(VCountByEnumTest, testNullableSample) { TEST_F(VCountByEnumTest, testNoMerge) { Arena 
arena; const int batch_size = 5; - auto column_f1 = ColumnString::create(); - column_f1->insert(Field::create_field("F")); - column_f1->insert(Field::create_field("F")); - column_f1->insert(Field::create_field("M")); - ColumnPtr column_f1_ptr = std::move(column_f1); - auto null_map = ColumnUInt8::create(); - std::vector offs = {0, 0, 0, 1, 1}; - for (int i = 0; i < offs.size(); ++i) { - null_map->insert(Field::create_field(offs[i])); - } - - auto nullable_column_f1 = ColumnNullable::create(column_f1_ptr, std::move(null_map)); + auto nullable_column_f1 = create_nullable_gender_column(); std::unique_ptr memory(new char[agg_function->size_of_data()]); AggregateDataPtr place = memory.get(); @@ -216,17 +210,7 @@ TEST_F(VCountByEnumTest, testNoMerge) { TEST_F(VCountByEnumTest, testSerialize) { Arena arena; const int batch_size = 5; - auto column_f1 = ColumnString::create(); - column_f1->insert(Field::create_field("F")); - column_f1->insert(Field::create_field("F")); - column_f1->insert(Field::create_field("M")); - ColumnPtr column_f1_ptr = std::move(column_f1); - auto null_map = ColumnUInt8::create(); - std::vector offs = {0, 0, 0, 1, 1}; - for (int i = 0; i < offs.size(); ++i) { - null_map->insert(Field::create_field(offs[i])); - } - auto nullable_column_f1 = ColumnNullable::create(column_f1_ptr, std::move(null_map)); + auto nullable_column_f1 = create_nullable_gender_column(); std::unique_ptr memory(new char[agg_function->size_of_data()]); AggregateDataPtr place = memory.get(); @@ -262,17 +246,7 @@ TEST_F(VCountByEnumTest, testSerialize) { EXPECT_EQ(item0["null"].GetInt(), 2); EXPECT_EQ(item0["all"].GetInt(), 5); - auto column_f1_2 = ColumnString::create(); - column_f1_2->insert(Field::create_field("F")); - column_f1_2->insert(Field::create_field("F")); - column_f1_2->insert(Field::create_field("M")); - ColumnPtr column_f1_2_ptr = std::move(column_f1_2); - auto null_map_2 = ColumnUInt8::create(); - std::vector offs_2 = {0, 0, 0, 1, 1}; - for (int i = 0; i < 
offs.size(); ++i) { - null_map_2->insert(Field::create_field(offs_2[i])); - } - auto nullable_column_f1_2 = ColumnNullable::create(column_f1_2_ptr, std::move(null_map_2)); + auto nullable_column_f1_2 = create_nullable_gender_column(); std::unique_ptr memory3(new char[agg_function->size_of_data()]); AggregateDataPtr place3 = memory3.get(); diff --git a/be/test/exprs/function/cast/function_variant_cast_test.cpp b/be/test/exprs/function/cast/function_variant_cast_test.cpp index 8f710188b40290..f48b213f86e524 100644 --- a/be/test/exprs/function/cast/function_variant_cast_test.cpp +++ b/be/test/exprs/function/cast/function_variant_cast_test.cpp @@ -426,8 +426,7 @@ TEST(FunctionVariantCast, CastFromVariantWithEmptyRoot) { auto result_col = block.get_by_position(result_column).column; ASSERT_NE(result_col.get(), nullptr); const auto* string_result = assert_cast(result_col.get()); - // just call ConvertImplGenericToString which will insert all source column data to ColumnString - ASSERT_EQ(string_result->size(), variant_col->size()); + ASSERT_EQ(string_result->size(), 1); ASSERT_EQ(string_result->get_data_at(0).to_string(), "{\"v\":{\"a\":20,\"b\":\"20\",\"c\":20,\"e\":\"50\",\"f\":20}}"); } @@ -436,7 +435,9 @@ TEST(FunctionVariantCast, CastFromVariantWithEmptyRoot) { { auto variant_col = construct_basic_varint_column(); variant_col->finalize(); - auto nullable_variant_col = make_nullable(variant_col->get_ptr()); + const auto rows = variant_col->size(); + auto nullable_variant_col = + ColumnNullable::create(std::move(variant_col), ColumnUInt8::create(rows, 0)); auto nullable_string_type = make_nullable(std::make_shared()); auto variant_type = std::make_shared(); @@ -461,7 +462,47 @@ TEST(FunctionVariantCast, CastFromVariantWithEmptyRoot) { ASSERT_NE(result_col.get(), nullptr); const auto* nullable_result = assert_cast(result_col.get()); ASSERT_EQ(nullable_result->size(), 1); - ASSERT_TRUE(nullable_result->is_null_at(1)); + ASSERT_FALSE(nullable_result->is_null_at(0)); + 
const auto* string_result = + assert_cast(&nullable_result->get_nested_column()); + ASSERT_EQ(string_result->get_data_at(0).to_string(), + "{\"v\":{\"a\":20,\"b\":\"20\",\"c\":20,\"e\":\"50\",\"f\":20}}"); + } + + // Test case 5: nullable source null-map is preserved after the nested string cast is limited + // to input_rows_count. + { + auto variant_col = construct_basic_varint_column(); + variant_col->finalize(); + auto null_map = ColumnUInt8::create(variant_col->size(), 0); + null_map->get_data()[0] = 1; + auto nullable_variant_col = + ColumnNullable::create(std::move(variant_col), std::move(null_map)); + + auto nullable_string_type = make_nullable(std::make_shared()); + auto variant_type = std::make_shared(); + auto nullable_variant_type = make_nullable(variant_type); + + ColumnsWithTypeAndName arguments { + {nullable_variant_col->get_ptr(), nullable_variant_type, "variant_col"}, + {nullptr, nullable_string_type, "nullable_string_type"}}; + + auto function = SimpleFunctionFactory::instance().get_function("CAST", arguments, + nullable_string_type); + ASSERT_NE(function, nullptr); + + Block block {arguments}; + size_t result_column = block.columns(); + block.insert({nullptr, nullable_string_type, "result"}); + RuntimeState state; + auto ctx = FunctionContext::create_context(&state, {}, {}); + ASSERT_TRUE(function->execute(ctx.get(), block, {0}, result_column, 1).ok()); + + auto result_col = block.get_by_position(result_column).column; + ASSERT_NE(result_col.get(), nullptr); + const auto* nullable_result = assert_cast(result_col.get()); + ASSERT_EQ(nullable_result->size(), 1); + ASSERT_TRUE(nullable_result->is_null_at(0)); } } @@ -611,4 +652,4 @@ TEST(FunctionVariantCast, CastFromVariantStrictModeRegression) { } } -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/test/exprs/function/function_is_null_test.cpp b/be/test/exprs/function/function_is_null_test.cpp index d637175f05272a..e6420aedce9124 100644 --- 
a/be/test/exprs/function/function_is_null_test.cpp +++ b/be/test/exprs/function/function_is_null_test.cpp @@ -158,7 +158,7 @@ TEST_F(FunctionIsNullTest, gc_binlogs_test) { const auto& rowset_writer = res.value(); Block block = _tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); Field key = Field::create_field(10); Field v1 = Field::create_field("v1"); @@ -323,7 +323,7 @@ TEST_F(FunctionIsNullTest, evaluate_inverted_index_corner_cases) { const auto& rowset_writer = res.value(); Block block = _tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); // Create block with NO null values to test the scenario where // iterator might not have null bitmap or it's nullptr diff --git a/be/test/format/native/native_reader_writer_test.cpp b/be/test/format/native/native_reader_writer_test.cpp index cf568354925b17..0f38721558217b 100644 --- a/be/test/format/native/native_reader_writer_test.cpp +++ b/be/test/format/native/native_reader_writer_test.cpp @@ -527,10 +527,10 @@ TEST_F(NativeReaderWriterTest, round_trip_native_file_large_rows) { total_read_rows = read_rows; first_block = false; } else { - MutableBlock merged_mutable(&merged_block); + ScopedMutableBlock scoped_merged_mutable(&merged_block); + auto& merged_mutable = scoped_merged_mutable.mutable_block(); Status add_st = merged_mutable.add_rows(&dst_block, 0, read_rows); ASSERT_TRUE(add_st.ok()) << add_st; - merged_block.set_columns(std::move(merged_mutable.mutable_columns())); total_read_rows += read_rows; } } diff --git a/be/test/format/parquet/parquet_column_convert_test.cpp b/be/test/format/parquet/parquet_column_convert_test.cpp index 256d1bb3a49bd6..112390442dc593 100644 --- a/be/test/format/parquet/parquet_column_convert_test.cpp +++ b/be/test/format/parquet/parquet_column_convert_test.cpp @@ -23,6 +23,7 @@ #include #include "core/assert_cast.h" +#include 
"core/column/column_fixed_length_object.h" #include "core/column/column_nullable.h" #include "core/column/column_vector.h" #include "util/timezone_utils.h" @@ -170,6 +171,8 @@ TEST(ParquetColumnConvertTest, AlignNullMapUsesNullablePrefixForCachedReadColumn ColumnNullable::create(std::move(dst_nested_column), std::move(dst_null_map_column)); auto src_nested_column = ColumnInt64::create(); + src_nested_column->insert_value(8); + src_nested_column->insert_value(9); src_nested_column->insert_value(10); src_nested_column->insert_value(11); src_nested_column->insert_value(12); @@ -254,10 +257,10 @@ TEST(ParquetColumnConvertTest, &field_schema, field_schema.data_type, dst_type, nullptr); ASSERT_TRUE(converter->support()) << converter->get_error_msg(); - auto src_nested_column = ColumnUInt8::create(); - for (auto ch : std::string("aabbcc")) { - src_nested_column->insert_value(static_cast(ch)); - } + auto src_nested_column = ColumnFixedLengthObject::create(2); + src_nested_column->insert_data("aa", 2); + src_nested_column->insert_data("bb", 2); + src_nested_column->insert_data("cc", 2); auto src_null_map_column = ColumnUInt8::create(); src_null_map_column->insert_value(0); src_null_map_column->insert_value(1); diff --git a/be/test/format/table/table_format_reader_test.cpp b/be/test/format/table/table_format_reader_test.cpp index 09ab7e10b2ec73..1a1b3176df6700 100644 --- a/be/test/format/table/table_format_reader_test.cpp +++ b/be/test/format/table/table_format_reader_test.cpp @@ -19,13 +19,58 @@ #include +#include "core/assert_cast.h" +#include "core/block/block.h" +#include "core/column/column_nullable.h" +#include "core/column/column_vector.h" +#include "core/data_type/data_type_nullable.h" +#include "core/data_type/data_type_number.h" +#include "runtime/descriptors.h" + namespace doris { class MockTableFormatReader : public TableFormatReader { public: Status _do_get_next_block(Block*, size_t*, bool*) override { return Status::OK(); } + + void 
set_fill_col_name_to_block_idx(std::unordered_map* index) { + _fill_col_name_to_block_idx = index; + } + + void set_partition_value(const std::string& col_name, const std::string& value, + const SlotDescriptor* slot_desc) { + _fill_partition_values[col_name] = {value, slot_desc}; + } }; +static TTypeDesc create_scalar_type(TPrimitiveType::type primitive_type) { + TTypeDesc type_desc; + TTypeNode type_node; + TScalarType scalar_type; + scalar_type.__set_type(primitive_type); + type_node.__set_type(TTypeNodeType::SCALAR); + type_node.__set_scalar_type(scalar_type); + type_desc.types.push_back(type_node); + return type_desc; +} + +static TSlotDescriptor create_slot_descriptor(int slot_id, const std::string& col_name, + TPrimitiveType::type primitive_type, + bool nullable = false) { + TSlotDescriptor slot_desc; + slot_desc.__set_id(slot_id); + slot_desc.__set_parent(1); + slot_desc.__set_slotType(create_scalar_type(primitive_type)); + slot_desc.__set_columnPos(slot_id); + slot_desc.__set_colName(col_name); + slot_desc.__set_col_unique_id(slot_id); + slot_desc.__set_slotIdx(slot_id); + slot_desc.__set_isMaterialized(true); + slot_desc.__set_is_key(false); + slot_desc.__set_nullIndicatorBit(nullable ? 
0 : -1); + return slot_desc; +} + TEST(TableFormatReaderTest, FillSynthesizedColumnsInvokesRegisteredHandlers) { MockTableFormatReader reader; size_t handled_rows = 0; @@ -48,4 +93,60 @@ TEST(TableFormatReaderTest, FillSynthesizedColumnsInvokesRegisteredHandlers) { EXPECT_EQ(handled_rows, 128u); } +TEST(TableFormatReaderTest, FillPartitionColumnRestoresSharedColumnOnDeserializeError) { + MockTableFormatReader reader; + std::unordered_map block_index {{"part_col", 0}}; + reader.set_fill_col_name_to_block_idx(&block_index); + + auto slot_desc = create_slot_descriptor(0, "part_col", TPrimitiveType::INT); + SlotDescriptor slot(slot_desc); + reader.set_partition_value("part_col", "not_an_int", &slot); + + auto type = std::make_shared(); + auto mutable_col = ColumnInt32::create(); + ColumnPtr old_col = mutable_col->get_ptr(); + + Block block; + block.insert({std::move(mutable_col), type, "part_col"}); + + auto status = reader.on_fill_partition_columns(&block, 2, {"part_col"}); + + EXPECT_FALSE(status.ok()); + ASSERT_EQ(block.columns(), 1); + ASSERT_NE(block.get_by_position(0).column.get(), nullptr); + EXPECT_EQ(block.get_by_position(0).column->size(), 0); + EXPECT_EQ(old_col->size(), 0); + EXPECT_NE(block.get_by_position(0).column.get(), old_col.get()); + EXPECT_NO_THROW(block.check_number_of_rows()); +} + +TEST(TableFormatReaderTest, FillMissingNullableColumnDetachesSharedBlockSlot) { + MockTableFormatReader reader; + std::unordered_map block_index {{"missing_col", 0}}; + reader.set_fill_col_name_to_block_idx(&block_index); + + auto nullable_type = make_nullable(std::make_shared()); + auto nullable_col = ColumnNullable::create(ColumnInt32::create(), ColumnUInt8::create()); + ColumnPtr old_col = nullable_col->get_ptr(); + + Block block; + block.insert({std::move(nullable_col), nullable_type, "missing_col"}); + + auto status = reader.on_fill_missing_columns(&block, 3, {"missing_col"}); + + EXPECT_TRUE(status.ok()) << status.to_string(); + ASSERT_EQ(block.columns(), 1); + 
ASSERT_NE(block.get_by_position(0).column.get(), nullptr); + EXPECT_EQ(block.rows(), 3); + EXPECT_EQ(old_col->size(), 0); + EXPECT_NE(block.get_by_position(0).column.get(), old_col.get()); + + const auto& nullable = assert_cast(*block.get_by_position(0).column); + const auto& null_map = nullable.get_null_map_data(); + ASSERT_EQ(null_map.size(), 3); + EXPECT_EQ(null_map[0], 1); + EXPECT_EQ(null_map[1], 1); + EXPECT_EQ(null_map[2], 1); +} + } // namespace doris diff --git a/be/test/load/delta_writer/delta_writer_cluster_key_test.cpp b/be/test/load/delta_writer/delta_writer_cluster_key_test.cpp index c8a40194ff0803..5590ecaa93ada1 100644 --- a/be/test/load/delta_writer/delta_writer_cluster_key_test.cpp +++ b/be/test/load/delta_writer/delta_writer_cluster_key_test.cpp @@ -191,7 +191,7 @@ static TDescriptorTable create_descriptor_tablet_with_sequence_col() { } static void generate_data(Block* block, int8_t k1, int16_t k2, int32_t seq) { - auto columns = block->mutate_columns(); + auto columns = std::move(*block).mutate_columns(); int8_t c1 = k1; columns[0]->insert_data((const char*)&c1, sizeof(c1)); diff --git a/be/test/load/delta_writer/delta_writer_test.cpp b/be/test/load/delta_writer/delta_writer_test.cpp index 0ce52ceea706eb..5d3aebc4e2325f 100644 --- a/be/test/load/delta_writer/delta_writer_test.cpp +++ b/be/test/load/delta_writer/delta_writer_test.cpp @@ -446,7 +446,7 @@ static TDescriptorTable create_descriptor_tablet_with_sequence_col() { } static void generate_data(Block* block, int8_t k1, int16_t k2, int32_t seq) { - auto columns = block->mutate_columns(); + auto columns = std::move(*block).mutate_columns(); int8_t c1 = k1; columns[0]->insert_data((const char*)&c1, sizeof(c1)); @@ -569,7 +569,7 @@ TEST_F(TestDeltaWriter, vec_write) { slot_desc->col_name())); } - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); { int8_t k1 = -127; columns[0]->insert_data((const char*)&k1, sizeof(k1)); diff --git 
a/be/test/load/memtable/memtable_memory_limiter_test.cpp b/be/test/load/memtable/memtable_memory_limiter_test.cpp index f3566448a1f29b..551010e0709cb1 100644 --- a/be/test/load/memtable/memtable_memory_limiter_test.cpp +++ b/be/test/load/memtable/memtable_memory_limiter_test.cpp @@ -154,7 +154,7 @@ TEST_F(MemTableMemoryLimiterTest, handle_memtable_flush_test) { block.insert(ColumnWithTypeAndName(slot_desc->get_empty_mutable_column(), slot_desc->type(), slot_desc->col_name())); } - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); { int8_t k1 = -127; columns[0]->insert_data((const char*)&k1, sizeof(k1)); diff --git a/be/test/runtime/snapshot_loader_test.cpp b/be/test/runtime/snapshot_loader_test.cpp index 209ab1139a406b..efae696a62c492 100644 --- a/be/test/runtime/snapshot_loader_test.cpp +++ b/be/test/runtime/snapshot_loader_test.cpp @@ -210,8 +210,8 @@ static void add_rowset(int64_t tablet_id, int32_t schema_hash, int64_t partition slot_desc->col_name())); } - std::cout << "total column " << block.mutate_columns().size() << std::endl; - auto columns = block.mutate_columns(); + std::cout << "total column " << block.columns() << std::endl; + auto columns = std::move(block).mutate_columns(); int16_t c1 = value; columns[0]->insert_data((const char*)&c1, sizeof(c1)); block.set_columns(std::move(columns)); diff --git a/be/test/storage/compaction/ordered_data_compaction_test.cpp b/be/test/storage/compaction/ordered_data_compaction_test.cpp index 2f3d654023a825..333cad0cdfb308 100644 --- a/be/test/storage/compaction/ordered_data_compaction_test.cpp +++ b/be/test/storage/compaction/ordered_data_compaction_test.cpp @@ -304,7 +304,7 @@ class OrderedDataCompactionTest : public ::testing::Test { uint32_t num_rows = 0; for (int i = 0; i < rowset_data.size(); ++i) { Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (int rid = 0; rid < 
rowset_data[i].size(); ++rid) { int32_t c1 = std::get<0>(rowset_data[i][rid]); int32_t c2 = std::get<1>(rowset_data[i][rid]); @@ -574,7 +574,7 @@ TEST_F(OrderedDataCompactionTest, test_index_disk_size) { uint32_t num_rows = 0; for (int j = 0; j < input_data[i].size(); ++j) { Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (int rid = 0; rid < input_data[i][j].size(); ++rid) { int32_t c1 = std::get<0>(input_data[i][j][rid]); int32_t c2 = std::get<1>(input_data[i][j][rid]); diff --git a/be/test/storage/compaction/segcompaction_mow_test.cpp b/be/test/storage/compaction/segcompaction_mow_test.cpp index 13b836cd5a38d3..28a9fed8021eb2 100644 --- a/be/test/storage/compaction/segcompaction_mow_test.cpp +++ b/be/test/storage/compaction/segcompaction_mow_test.cpp @@ -107,7 +107,7 @@ class SegCompactionMoWTest : public ::testing::TestWithParam { MutableColumns* columns) { block->set_columns(std::move(*columns)); auto st = rowset_writer->add_block(block); - *columns = block->mutate_columns(); + *columns = std::move(*block).mutate_columns(); return st; } @@ -343,7 +343,7 @@ TEST_P(SegCompactionMoWTest, SegCompactionThenRead) { // k3 := rid for (int i = 0; i < num_segments; ++i) { Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + i; uint32_t k2 = i; @@ -449,7 +449,7 @@ TEST_F(SegCompactionMoWTest, SegCompactionInterleaveWithBig_ooooOOoOooooooooO) { int segid = 0; for (int i = 0; i < num_segments; ++i) { Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + segid; uint32_t k2 = segid; @@ -477,7 +477,7 @@ TEST_F(SegCompactionMoWTest, 
SegCompactionInterleaveWithBig_ooooOOoOooooooooO) { rows_per_segment = 6400; for (int i = 0; i < num_segments; ++i) { Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + segid; uint32_t k2 = segid; @@ -505,7 +505,7 @@ TEST_F(SegCompactionMoWTest, SegCompactionInterleaveWithBig_ooooOOoOooooooooO) { rows_per_segment = 4096; for (int i = 0; i < num_segments; ++i) { Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + segid; uint32_t k2 = segid; @@ -533,7 +533,7 @@ TEST_F(SegCompactionMoWTest, SegCompactionInterleaveWithBig_ooooOOoOooooooooO) { rows_per_segment = 6400; for (int i = 0; i < num_segments; ++i) { Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + segid; uint32_t k2 = segid; @@ -562,7 +562,7 @@ TEST_F(SegCompactionMoWTest, SegCompactionInterleaveWithBig_ooooOOoOooooooooO) { std::map unique_keys; for (int i = 0; i < num_segments; ++i) { Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (int rid = 0; rid < rows_per_segment; ++rid) { // generate some duplicate rows, segment compaction will merge them int rand_i = rand() % (num_segments - 3); @@ -601,7 +601,7 @@ TEST_F(SegCompactionMoWTest, SegCompactionInterleaveWithBig_ooooOOoOooooooooO) { rows_per_segment = 6400; for (int i = 0; i < num_segments; ++i) { Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (int rid = 0; rid < rows_per_segment; ++rid) { 
uint32_t k1 = rid * 100 + segid; uint32_t k2 = segid; @@ -679,7 +679,7 @@ TEST_F(SegCompactionMoWTest, SegCompactionInterleaveWithBig_OoOoO) { int segid = 0; for (int i = 0; i < num_segments; ++i) { Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + segid; uint32_t k2 = segid; @@ -707,7 +707,7 @@ TEST_F(SegCompactionMoWTest, SegCompactionInterleaveWithBig_OoOoO) { rows_per_segment = 4096; for (int i = 0; i < num_segments; ++i) { Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + segid; uint32_t k2 = segid; @@ -735,7 +735,7 @@ TEST_F(SegCompactionMoWTest, SegCompactionInterleaveWithBig_OoOoO) { rows_per_segment = 6400; for (int i = 0; i < num_segments; ++i) { Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + segid; uint32_t k2 = segid; @@ -763,7 +763,7 @@ TEST_F(SegCompactionMoWTest, SegCompactionInterleaveWithBig_OoOoO) { rows_per_segment = 4096; for (int i = 0; i < num_segments; ++i) { Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + segid; uint32_t k2 = segid; @@ -791,7 +791,7 @@ TEST_F(SegCompactionMoWTest, SegCompactionInterleaveWithBig_OoOoO) { rows_per_segment = 6400; for (int i = 0; i < num_segments; ++i) { Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 
100 + segid; uint32_t k2 = segid; @@ -864,7 +864,7 @@ TEST_F(SegCompactionMoWTest, SegCompactionNotTrigger) { // k3 := rid for (int i = 0; i < num_segments; ++i) { Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + i; uint32_t k2 = i; diff --git a/be/test/storage/compaction/segcompaction_test.cpp b/be/test/storage/compaction/segcompaction_test.cpp index 15dc86c89d74b2..d3b843c050da2e 100644 --- a/be/test/storage/compaction/segcompaction_test.cpp +++ b/be/test/storage/compaction/segcompaction_test.cpp @@ -128,7 +128,7 @@ class SegCompactionTest : public testing::Test { MutableColumns* columns) { block->set_columns(std::move(*columns)); auto st = rowset_writer->add_block(block); - *columns = block->mutate_columns(); + *columns = std::move(*block).mutate_columns(); return st; } @@ -315,7 +315,7 @@ TEST_F(SegCompactionTest, SegCompactionThenRead) { // k3 := rid for (int i = 0; i < num_segments; ++i) { Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + i; uint32_t k2 = i; @@ -436,7 +436,7 @@ TEST_F(SegCompactionTest, SegCompactionInterleaveWithBig_ooooOOoOooooooooO) { uint32_t rows_per_segment = 4096; for (int i = 0; i < num_segments; ++i) { Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + i; uint32_t k2 = i; @@ -454,7 +454,7 @@ TEST_F(SegCompactionTest, SegCompactionInterleaveWithBig_ooooOOoOooooooooO) { rows_per_segment = 6400; for (int i = 0; i < num_segments; ++i) { Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = 
std::move(block).mutate_columns(); for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + i; uint32_t k2 = i; @@ -472,7 +472,7 @@ TEST_F(SegCompactionTest, SegCompactionInterleaveWithBig_ooooOOoOooooooooO) { rows_per_segment = 4096; for (int i = 0; i < num_segments; ++i) { Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + i; uint32_t k2 = i; @@ -490,7 +490,7 @@ TEST_F(SegCompactionTest, SegCompactionInterleaveWithBig_ooooOOoOooooooooO) { rows_per_segment = 6400; for (int i = 0; i < num_segments; ++i) { Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + i; uint32_t k2 = i; @@ -508,7 +508,7 @@ TEST_F(SegCompactionTest, SegCompactionInterleaveWithBig_ooooOOoOooooooooO) { rows_per_segment = 4096; for (int i = 0; i < num_segments; ++i) { Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + i; uint32_t k2 = i; @@ -527,7 +527,7 @@ TEST_F(SegCompactionTest, SegCompactionInterleaveWithBig_ooooOOoOooooooooO) { rows_per_segment = 6400; for (int i = 0; i < num_segments; ++i) { Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + i; uint32_t k2 = i; @@ -590,7 +590,7 @@ TEST_F(SegCompactionTest, SegCompactionInterleaveWithBig_OoOoO) { uint32_t rows_per_segment = 6400; for (int i = 0; i < num_segments; ++i) { Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = 
std::move(block).mutate_columns(); for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + i; uint32_t k2 = i; @@ -608,7 +608,7 @@ TEST_F(SegCompactionTest, SegCompactionInterleaveWithBig_OoOoO) { rows_per_segment = 4096; for (int i = 0; i < num_segments; ++i) { Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + i; uint32_t k2 = i; @@ -626,7 +626,7 @@ TEST_F(SegCompactionTest, SegCompactionInterleaveWithBig_OoOoO) { rows_per_segment = 6400; for (int i = 0; i < num_segments; ++i) { Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + i; uint32_t k2 = i; @@ -644,7 +644,7 @@ TEST_F(SegCompactionTest, SegCompactionInterleaveWithBig_OoOoO) { rows_per_segment = 4096; for (int i = 0; i < num_segments; ++i) { Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + i; uint32_t k2 = i; @@ -662,7 +662,7 @@ TEST_F(SegCompactionTest, SegCompactionInterleaveWithBig_OoOoO) { rows_per_segment = 6400; for (int i = 0; i < num_segments; ++i) { Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (int rid = 0; rid < rows_per_segment; ++rid) { uint32_t k1 = rid * 100 + i; uint32_t k2 = i; @@ -718,7 +718,7 @@ TEST_F(SegCompactionTest, SegCompactionThenReadUniqueTableSmall) { uint32_t k3 = 0; Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); // segment#0 k1 = k2 = 1; k3 = 1; @@ -984,7 +984,7 @@ 
TEST_F(SegCompactionTest, SegCompactionThenReadAggTableSmall) { uint32_t k3 = 0; Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); // segment#0 k1 = k2 = 1; diff --git a/be/test/storage/compaction/variant_doc_mode_compaction_test.cpp b/be/test/storage/compaction/variant_doc_mode_compaction_test.cpp index 58ce82f38cbc4c..f234cf8e34cc89 100644 --- a/be/test/storage/compaction/variant_doc_mode_compaction_test.cpp +++ b/be/test/storage/compaction/variant_doc_mode_compaction_test.cpp @@ -251,7 +251,7 @@ class VariantDocModeCompactionTest : public ::testing::Test { auto rowset_writer = std::move(res).value(); Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); auto* variant_col = assert_cast(columns[1].get()); auto raw_json_column = ColumnString::create(); raw_json_column->reserve(kRowsPerSegment); diff --git a/be/test/storage/compaction/vertical_compaction_test.cpp b/be/test/storage/compaction/vertical_compaction_test.cpp index 1b4abe72257e76..542c12fd91945a 100644 --- a/be/test/storage/compaction/vertical_compaction_test.cpp +++ b/be/test/storage/compaction/vertical_compaction_test.cpp @@ -110,7 +110,7 @@ class VerticalCompactionTest : public ::testing::Test { MutableColumns* columns) { block->set_columns(std::move(*columns)); auto st = rowset_writer->add_block(block); - *columns = block->mutate_columns(); + *columns = std::move(*block).mutate_columns(); return st; } @@ -249,7 +249,7 @@ class VerticalCompactionTest : public ::testing::Test { uint32_t num_rows = 0; for (int i = 0; i < rowset_data.size(); ++i) { Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (int rid = 0; rid < rowset_data[i].size(); ++rid) { int32_t c1 = std::get<0>(rowset_data[i][rid]); int32_t c2 = std::get<1>(rowset_data[i][rid]); @@ 
-1210,7 +1210,7 @@ TEST_F(VerticalCompactionTest, TestUniqueKeyVerticalMergeWithNullableSparseColum // Create block with nullable c2 column Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (int rid = 0; rid < rows_per_segment; ++rid) { int32_t c1 = i * rows_per_segment + rid; @@ -1389,7 +1389,7 @@ TEST_F(VerticalCompactionTest, TestFooterRawDataBytesAccuracy) { auto rowset_writer = std::move(res).value(); Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (int i = 0; i < kNumRows; i++) { int32_t int_val = i; columns[0]->insert_data(reinterpret_cast(&int_val), sizeof(int_val)); @@ -1485,7 +1485,7 @@ TEST_F(VerticalCompactionTest, TestFooterRawDataBytesNullableSparse) { auto rowset_writer = std::move(res).value(); Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (int i = 0; i < kNumRows; i++) { int32_t key_val = i; columns[0]->insert_data(reinterpret_cast(&key_val), sizeof(key_val)); diff --git a/be/test/storage/index/date_bloom_filter_test.cpp b/be/test/storage/index/date_bloom_filter_test.cpp index 261c49a92d6595..263f2a44f0177d 100644 --- a/be/test/storage/index/date_bloom_filter_test.cpp +++ b/be/test/storage/index/date_bloom_filter_test.cpp @@ -131,7 +131,7 @@ TEST_F(DateBloomFilterTest, query_index_test) { const auto& rowset_writer = res.value(); Block block = _tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); auto date = timestamp_from_date("2024-11-08"); auto datetime = timestamp_from_datetime("2024-11-08 09:00:00"); @@ -225,7 +225,7 @@ TEST_F(DateBloomFilterTest, in_list_predicate_test) { const auto& rowset_writer = res.value(); Block block = _tablet_schema->create_block(); - auto columns = block.mutate_columns(); 
+ auto columns = std::move(block).mutate_columns(); // Insert test data auto date = timestamp_from_date("2024-11-08"); diff --git a/be/test/storage/index/index_builder_test.cpp b/be/test/storage/index/index_builder_test.cpp index 96cc6839390e3a..dd36ba3ab33159 100644 --- a/be/test/storage/index/index_builder_test.cpp +++ b/be/test/storage/index/index_builder_test.cpp @@ -244,7 +244,7 @@ TEST_F(IndexBuilderTest, DropInvertedIndexTest) { // 5. Write data to the rowset { Block block = _tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); // Add data for k1 and k2 columns for (int i = 0; i < num_rows; ++i) { @@ -534,7 +534,7 @@ TEST_F(IndexBuilderTest, BuildInvertedIndexAfterWritingDataTest) { // 4. Write data to the rowset { Block block = _tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); // Add data for k1 and k2 columns according to the schema for (int i = 0; i < num_rows; ++i) { @@ -865,7 +865,7 @@ TEST_F(IndexBuilderTest, AddIndexWhenOneExistsTest) { // 5. Write data to rowset { Block block = _tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); // Add data for k1 and k2 columns for (int i = 0; i < num_rows; ++i) { @@ -1035,7 +1035,7 @@ TEST_F(IndexBuilderTest, AddIndexWhenOneExistsTestV1) { // 8. Write data to rowset { Block block = v1_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); // Add data for k1 and k2 columns for (int i = 0; i < num_rows; ++i) { @@ -1186,7 +1186,7 @@ TEST_F(IndexBuilderTest, MultiSegmentBuildIndexTest) { // 4. 
Write data to the rowset in multiple batches to ensure we get multiple segments for (int segment = 0; segment < num_segments; segment++) { Block block = _tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); // Add data for k1 and k2 columns for (int i = 0; i < rows_per_segment; ++i) { @@ -1338,7 +1338,7 @@ TEST_F(IndexBuilderTest, NonExistentColumnIndexTest) { // 4. Write data to the rowset { Block block = _tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); // Add data for k1 and k2 columns for (int i = 0; i < num_rows; ++i) { @@ -1514,7 +1514,7 @@ TEST_F(IndexBuilderTest, RenameColumnIndexTest) { // 5. Write data to the rowset { Block block = _tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); // Add data for k1 and k2 columns for (int i = 0; i < num_rows; ++i) { @@ -1670,7 +1670,7 @@ TEST_F(IndexBuilderTest, AddNonExistentColumnIndexWhenOneExistsTest) { // 5. Write data to the rowset { Block block = _tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); // Add data for k1 and k2 columns for (int i = 0; i < num_rows; ++i) { @@ -1844,7 +1844,7 @@ TEST_F(IndexBuilderTest, AddNonExistentColumnIndexWhenOneExistsTestV1) { // 9. Write data to rowset { Block block = v1_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); // Add data for k1 and k2 columns for (int i = 0; i < num_rows; ++i) { @@ -1996,7 +1996,7 @@ TEST_F(IndexBuilderTest, NonNullIndexDataTest) { // 4. 
Write non-null data to the rowset { Block block = _tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); // Add data for k1 and k2 columns with no null values for (int i = 0; i < num_rows; ++i) { @@ -2122,7 +2122,7 @@ TEST_F(IndexBuilderTest, NonExistentColumnUniqueIdTest) { // 4. Write data to the rowset { Block block = _tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); // Add data for k1 and k2 columns for (int i = 0; i < num_rows; ++i) { @@ -2255,7 +2255,7 @@ TEST_F(IndexBuilderTest, DropIndexV1FormatTest) { // 9. Write data to the rowset { Block block = v1_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); // Add data for k1 and k2 columns for (int i = 0; i < num_rows; ++i) { @@ -2381,7 +2381,7 @@ TEST_F(IndexBuilderTest, ResourceCleanupTest) { // 4. Write data to the rowset { Block block = _tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); // Add data for k1 and k2 columns for (int i = 0; i < num_rows; ++i) { @@ -2538,7 +2538,7 @@ TEST_F(IndexBuilderTest, ArrayTypeIndexTest) { // 7. 
Create data block and write data { Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); // Prepare columns for k1 and array_col for (int i = 0; i < 1000; i++) { @@ -2646,7 +2646,7 @@ TEST_F(IndexBuilderTest, UniqueKeysTableIndexTest) { { Block block = _tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); // Add data for k1 and k2 columns for (int i = 0; i < 1000; ++i) { @@ -2806,7 +2806,7 @@ TEST_F(IndexBuilderTest, HandleSingleRowsetErrorTest) { { Block block = _tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); // Add data for k1 and k2 columns for (int i = 0; i < 1000; ++i) { @@ -2928,7 +2928,7 @@ TEST_F(IndexBuilderTest, UpdateInvertedIndexInfoErrorTest) { // Write data { Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); // Add data for k1 and k2 columns for (int i = 0; i < 1000; ++i) { @@ -3045,7 +3045,7 @@ TEST_F(IndexBuilderTest, DropOneIndexNotAffectOtherIndexesOnSameColumnTest) { // 5. 
Write data to the rowset { Block block = _tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); // Add data for k1 and k2 columns for (int i = 0; i < num_rows; ++i) { diff --git a/be/test/storage/index/inverted/common/inverted_index_gc_binlogs_test.cpp b/be/test/storage/index/inverted/common/inverted_index_gc_binlogs_test.cpp index c9856eeaa53ec1..ea7a71510f9086 100644 --- a/be/test/storage/index/inverted/common/inverted_index_gc_binlogs_test.cpp +++ b/be/test/storage/index/inverted/common/inverted_index_gc_binlogs_test.cpp @@ -148,7 +148,7 @@ TEST_F(IndexGcBinglogsTest, gc_binlogs_test) { const auto& rowset_writer = res.value(); Block block = _tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); Field key = Field::create_field(10); Field v1 = Field::create_field("v1"); diff --git a/be/test/storage/index/inverted/compaction/util/index_compaction_utils.cpp b/be/test/storage/index/inverted/compaction/util/index_compaction_utils.cpp index b253a7f8d2d985..6ca726770cfb5d 100644 --- a/be/test/storage/index/inverted/compaction/util/index_compaction_utils.cpp +++ b/be/test/storage/index/inverted/compaction/util/index_compaction_utils.cpp @@ -658,7 +658,7 @@ class IndexCompactionUtils { const auto& rowset_writer = res.value(); Block block = schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (const auto& row : data[i]) { if constexpr (std::is_same_v) { Field key = Field::create_field(int32_t(row.key)); diff --git a/be/test/storage/iterator/block_reader_agg_flush_test.cpp b/be/test/storage/iterator/block_reader_agg_flush_test.cpp index a9c0a4a4818d82..77376285198fce 100644 --- a/be/test/storage/iterator/block_reader_agg_flush_test.cpp +++ b/be/test/storage/iterator/block_reader_agg_flush_test.cpp @@ -72,7 +72,8 @@ std::unique_ptr make_source_block(size_t n_rows, 
int64_t key_value) { // struct of the source block, pre-filled with `n_rows` default rows so that // non-variable-length agg columns can be written via replace_column_data. MutableColumns make_stored_columns(const Block& src_block, size_t n_rows) { - return src_block.create_same_struct_block(n_rows)->mutate_columns(); + auto block = src_block.create_same_struct_block(n_rows); + return std::move(*block).mutate_columns(); } MutableColumns make_target_columns() { diff --git a/be/test/storage/rowid_conversion_test.cpp b/be/test/storage/rowid_conversion_test.cpp index 0d470d1c7a7e74..c8b3bad9336a04 100644 --- a/be/test/storage/rowid_conversion_test.cpp +++ b/be/test/storage/rowid_conversion_test.cpp @@ -192,7 +192,7 @@ class TestRowIdConversion : public testing::TestWithParamcreate_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (int rid = 0; rid < rowset_data[i].size(); ++rid) { int32_t c1 = std::get<0>(rowset_data[i][rid]); int32_t c2 = std::get<1>(rowset_data[i][rid]); diff --git a/be/test/storage/segment/segment_cache_test.cpp b/be/test/storage/segment/segment_cache_test.cpp index 04b395f0089c89..a48c50eaa0216d 100644 --- a/be/test/storage/segment/segment_cache_test.cpp +++ b/be/test/storage/segment/segment_cache_test.cpp @@ -183,7 +183,7 @@ static TDescriptorTable create_descriptor_tablet_with_sequence_col() { } static void generate_data(Block* block, int8_t k1, int16_t k2, int32_t seq) { - auto columns = block->mutate_columns(); + auto columns = std::move(*block).mutate_columns(); int8_t c1 = k1; columns[0]->insert_data((const char*)&c1, sizeof(c1)); diff --git a/be/test/storage/segment/segments_key_bounds_truncation_test.cpp b/be/test/storage/segment/segments_key_bounds_truncation_test.cpp index b9cad3c63b3eb7..10ee96e1c8e5d1 100644 --- a/be/test/storage/segment/segments_key_bounds_truncation_test.cpp +++ b/be/test/storage/segment/segments_key_bounds_truncation_test.cpp @@ -182,7 +182,7 @@ class 
SegmentsKeyBoundsTruncationTest : public testing::Test { int const_value = 999; for (const auto& segment_rows : data) { Block block = tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); for (const auto& row : segment_rows) { columns[0]->insert_data(row.data(), row.size()); columns[1]->insert_data(reinterpret_cast(&const_value), diff --git a/be/test/storage/segment/variant_column_writer_reader_test.cpp b/be/test/storage/segment/variant_column_writer_reader_test.cpp index 37b6887ab54500..517469331c32e1 100644 --- a/be/test/storage/segment/variant_column_writer_reader_test.cpp +++ b/be/test/storage/segment/variant_column_writer_reader_test.cpp @@ -264,7 +264,7 @@ class VariantColumnWriterReaderTest : public testing::Test { for (const auto& batch : batches) { Block block = _tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); auto variant_col = ColumnVariant::create( _tablet_schema->column(0).variant_max_subcolumns_count(), false); auto json_col = ColumnString::create(); @@ -305,7 +305,7 @@ class VariantColumnWriterReaderTest : public testing::Test { } Block block = _tablet_schema->create_block(); - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); auto variant_col = ColumnVariant::create( _tablet_schema->column(0).variant_max_subcolumns_count(), false); auto json_col = ColumnString::create(); diff --git a/be/test/storage/tablet/tablet_cooldown_test.cpp b/be/test/storage/tablet/tablet_cooldown_test.cpp index acf16442537fbd..182274341acdb2 100644 --- a/be/test/storage/tablet/tablet_cooldown_test.cpp +++ b/be/test/storage/tablet/tablet_cooldown_test.cpp @@ -334,7 +334,7 @@ static void write_rowset(TabletSharedPtr* tablet, PUniqueId load_id, int64_t rep slot_desc->col_name())); } Status st; - auto columns = block.mutate_columns(); + auto columns = std::move(block).mutate_columns(); if 
(with_data) { int8_t c1 = 123; From 56d0cf4f9d4f5a93981f5ef0d31476571bb8de22 Mon Sep 17 00:00:00 2001 From: zhaochangle Date: Mon, 18 May 2026 12:31:18 +0800 Subject: [PATCH 03/11] [fix](be) Handle parquet fixed byte array object decoders ### What problem does this PR solve? Issue Number: close #xxx Related PR: #63001 Problem Summary: Parquet FIXED_LEN_BYTE_ARRAY physical columns are now read through ColumnFixedLengthObject for COW-safe reuse, but several decoder paths still assumed primitive fixed-size data types. Dictionary and byte-stream-split decoders called get_size_of_value_in_memory() on DataTypeFixedLengthObject, and delta byte array fixed-length decode still wrote through a ColumnInt8 owner. This could fail external scans with DataTypeFixedLengthObject size checks or write through the wrong column representation. This change teaches the fixed-length parquet decoders to use the ColumnFixedLengthObject item size when that physical column is used, keeps the legacy primitive/ColumnInt8 paths, and adds BE UT coverage for plain, dictionary, byte-stream-split, and delta byte array fixed-length object decoding including filter/null cases. 
### Release note None ### Check List (For Author) - Test: Unit Test - ./run-be-ut.sh --run --filter=FixLengthPlainDecoderTest.*:FixLengthDictDecoderTest.*:ByteStreamSplitDecoderTest.*:DeltaByteArrayDecoderTest.*:DeltaLengthByteArrayDecoderTest.*:ParquetColumnConvertTest.*:ParquetReaderTest.uuid_varbinary:ParquetReaderTest.varbinary_varbinary:ParquetReaderTest.varbinary_string:ParquetReaderTest.varbinary_string2 - Behavior changed: No - Does this need documentation: No --- .../parquet/byte_stream_split_decoder.cpp | 9 +- .../format/parquet/delta_bit_pack_decoder.h | 22 +++- .../parquet/fix_length_dict_decoder.hpp | 9 +- .../byte_stream_split_decoder_test.cpp | 84 +++++++++++- .../parquet/delta_byte_array_decoder_test.cpp | 122 ++++++++++++++++++ .../parquet/fix_length_dict_decoder_test.cpp | 77 +++++++++++ .../parquet/fix_length_plain_decoder_test.cpp | 76 +++++++++++ 7 files changed, 391 insertions(+), 8 deletions(-) diff --git a/be/src/format/parquet/byte_stream_split_decoder.cpp b/be/src/format/parquet/byte_stream_split_decoder.cpp index a2674d0995c433..30f0958d9c8cf5 100644 --- a/be/src/format/parquet/byte_stream_split_decoder.cpp +++ b/be/src/format/parquet/byte_stream_split_decoder.cpp @@ -19,6 +19,7 @@ #include +#include "core/column/column_fixed_length_object.h" #include "util/byte_stream_split.h" namespace doris { @@ -45,7 +46,13 @@ Status ByteStreamSplitDecoder::_decode_values(MutableColumnPtr& doris_column, _offset, non_null_size, _data->size); } - size_t primitive_length = remove_nullable(data_type)->get_size_of_value_in_memory(); + size_t primitive_length = _type_length; + if (const auto* fixed_length_column = + check_and_get_column(*doris_column)) { + DCHECK_EQ(fixed_length_column->item_size(), _type_length); + } else { + primitive_length = remove_nullable(data_type)->get_size_of_value_in_memory(); + } size_t data_index = doris_column->size() * primitive_length; size_t scale_size = (select_vector.num_values() - select_vector.num_filtered()) * 
(_type_length / primitive_length); diff --git a/be/src/format/parquet/delta_bit_pack_decoder.h b/be/src/format/parquet/delta_bit_pack_decoder.h index 52d45ea2297b33..6257e4f214a182 100644 --- a/be/src/format/parquet/delta_bit_pack_decoder.h +++ b/be/src/format/parquet/delta_bit_pack_decoder.h @@ -30,6 +30,8 @@ #include #include "common/status.h" +#include "core/column/column_fixed_length_object.h" +#include "core/column/column_vector.h" #include "core/data_type/data_type.h" #include "format/parquet/decoder.h" #include "format/parquet/fix_length_plain_decoder.h" @@ -84,11 +86,21 @@ class DeltaDecoder : public Decoder { Status decode_fixed_byte_array(const std::vector& decoded_vals, MutableColumnPtr& doris_column, DataTypePtr& data_type, ColumnSelectVector& select_vector) { - auto& column_data = reinterpret_cast(*doris_column).get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + _type_length * (select_vector.num_values() - - select_vector.num_filtered())); - auto* data = column_data.data(); + const size_t result_size = select_vector.num_values() - select_vector.num_filtered(); + size_t data_index = 0; + uint8_t* data = nullptr; + if (auto* fixed_length_column = + check_and_get_column(*doris_column)) { + DCHECK_EQ(fixed_length_column->item_size(), _type_length); + data_index = fixed_length_column->size() * _type_length; + fixed_length_column->resize(fixed_length_column->size() + result_size); + data = fixed_length_column->get_data().data(); + } else { + auto& column_data = assert_cast(*doris_column).get_data(); + data_index = column_data.size(); + column_data.resize(data_index + _type_length * result_size); + data = reinterpret_cast(column_data.data()); + } ColumnSelectVector::DataReadType read_type; int value_idx = 0; while (size_t run_length = select_vector.get_next_run(&read_type)) { diff --git a/be/src/format/parquet/fix_length_dict_decoder.hpp b/be/src/format/parquet/fix_length_dict_decoder.hpp index 
c0f0dd967a7dec..aef4e7e6a19bb5 100644 --- a/be/src/format/parquet/fix_length_dict_decoder.hpp +++ b/be/src/format/parquet/fix_length_dict_decoder.hpp @@ -18,6 +18,7 @@ #pragma once #include "core/column/column_dictionary.h" +#include "core/column/column_fixed_length_object.h" #include "core/column/column_nullable.h" #include "core/data_type/data_type_nullable.h" #include "format/parquet/decoder.h" @@ -107,7 +108,13 @@ class FixLengthDictDecoder final : public BaseDictDecoder { template Status _decode_fixed_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, ColumnSelectVector& select_vector) { - size_t primitive_length = remove_nullable(data_type)->get_size_of_value_in_memory(); + size_t primitive_length = _type_length; + if (const auto* fixed_length_column = + check_and_get_column(*doris_column)) { + DCHECK_EQ(fixed_length_column->item_size(), _type_length); + } else { + primitive_length = remove_nullable(data_type)->get_size_of_value_in_memory(); + } size_t data_index = doris_column->size() * primitive_length; size_t scale_size = (select_vector.num_values() - select_vector.num_filtered()) * (_type_length / primitive_length); diff --git a/be/test/format/parquet/byte_stream_split_decoder_test.cpp b/be/test/format/parquet/byte_stream_split_decoder_test.cpp index 379918d479b6bf..b332a13449a99d 100644 --- a/be/test/format/parquet/byte_stream_split_decoder_test.cpp +++ b/be/test/format/parquet/byte_stream_split_decoder_test.cpp @@ -19,7 +19,9 @@ #include +#include "core/column/column_fixed_length_object.h" #include "core/column/column_vector.h" +#include "core/data_type/data_type_fixed_length_object.h" #include "core/data_type/data_type_number.h" #include "util/slice.h" @@ -32,6 +34,24 @@ class ByteStreamSplitDecoderTest : public ::testing::Test { ByteStreamSplitDecoder _decoder; }; +static std::vector encode_byte_stream_split_fixed_length( + const std::vector& values, size_t type_length) { + std::vector encoded(values.size() * type_length); + for (size_t 
value_index = 0; value_index < values.size(); ++value_index) { + DCHECK_EQ(values[value_index].size(), type_length); + for (size_t byte_index = 0; byte_index < type_length; ++byte_index) { + encoded[byte_index * values.size() + value_index] = + static_cast(values[value_index][byte_index]); + } + } + return encoded; +} + +static std::string fixed_length_value(const ColumnFixedLengthObject& column, size_t row) { + const auto value = column.get_data_at(row); + return {value.data, value.size}; +} + //// Test basic decoding functionality for FLOAT type TEST_F(ByteStreamSplitDecoderTest, test_basic_decode_float) { // Prepare test data for FLOAT type @@ -118,6 +138,36 @@ TEST_F(ByteStreamSplitDecoderTest, test_basic_decode_double) { EXPECT_DOUBLE_EQ(result_column->get_data()[2], 3.0); } +TEST_F(ByteStreamSplitDecoderTest, test_basic_decode_fixed_length_object) { + const size_t type_length = 3; + const std::vector values = {"abc", "def", "ghi"}; + auto encoded = encode_byte_stream_split_fixed_length(values, type_length); + Slice data_slice(encoded.data(), encoded.size()); + + MutableColumnPtr column = ColumnFixedLengthObject::create(type_length); + DataTypePtr data_type = std::make_shared(); + + ASSERT_TRUE(_decoder.set_data(&data_slice).ok()); + _decoder.set_type_length(type_length); + + const size_t num_values = values.size(); + std::vector run_length_null_map(1, num_values); + std::vector filter_data(num_values, 1); + FilterMap filter_map; + ASSERT_TRUE(filter_map.init(filter_data.data(), filter_data.size(), false).ok()); + ColumnSelectVector select_vector; + ASSERT_TRUE(select_vector.init(run_length_null_map, num_values, nullptr, &filter_map, 0).ok()); + + ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok()); + + const auto* result_column = assert_cast(column.get()); + ASSERT_EQ(result_column->item_size(), type_length); + ASSERT_EQ(result_column->size(), num_values); + EXPECT_EQ(fixed_length_value(*result_column, 0), "abc"); + 
EXPECT_EQ(fixed_length_value(*result_column, 1), "def"); + EXPECT_EQ(fixed_length_value(*result_column, 2), "ghi"); +} + // Test decoding with filter for FLOAT type TEST_F(ByteStreamSplitDecoderTest, test_decode_with_filter_float) { // Prepare test data for FLOAT type @@ -258,6 +308,38 @@ TEST_F(ByteStreamSplitDecoderTest, test_decode_with_filter_and_null_float) { } } +TEST_F(ByteStreamSplitDecoderTest, test_decode_fixed_length_object_with_filter_and_null) { + const size_t type_length = 3; + const std::vector values = {"abc", "ghi"}; + auto encoded = encode_byte_stream_split_fixed_length(values, type_length); + Slice data_slice(encoded.data(), encoded.size()); + + MutableColumnPtr column = ColumnFixedLengthObject::create(type_length); + DataTypePtr data_type = std::make_shared(); + + ASSERT_TRUE(_decoder.set_data(&data_slice).ok()); + _decoder.set_type_length(type_length); + + const size_t num_values = 3; + std::vector run_length_null_map = {1, 1, 1}; // data: [abc, null, ghi] + std::vector filter_data = {0, 1, 1}; // output: [null, ghi] + FilterMap filter_map; + ASSERT_TRUE(filter_map.init(filter_data.data(), filter_data.size(), false).ok()); + ColumnSelectVector select_vector; + NullMap null_map; + ASSERT_TRUE( + select_vector.init(run_length_null_map, num_values, &null_map, &filter_map, 0).ok()); + + ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok()); + + const auto* result_column = assert_cast(column.get()); + ASSERT_EQ(result_column->item_size(), type_length); + ASSERT_EQ(result_column->size(), 2); + EXPECT_EQ(fixed_length_value(*result_column, 1), "ghi"); + EXPECT_TRUE(null_map[0]); + EXPECT_FALSE(null_map[1]); +} + // Test decoding with filter and null for DOUBLE type TEST_F(ByteStreamSplitDecoderTest, test_decode_with_filter_and_null_double) { // Prepare test data for DOUBLE type @@ -402,4 +484,4 @@ TEST_F(ByteStreamSplitDecoderTest, test_skip_value_double) { EXPECT_DOUBLE_EQ(result_column->get_data()[0], 3.0); } -} // 
namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/test/format/parquet/delta_byte_array_decoder_test.cpp b/be/test/format/parquet/delta_byte_array_decoder_test.cpp index 1b039da3d2344d..a0b88d38e43098 100644 --- a/be/test/format/parquet/delta_byte_array_decoder_test.cpp +++ b/be/test/format/parquet/delta_byte_array_decoder_test.cpp @@ -20,7 +20,9 @@ #include #include "arrow/api.h" +#include "core/column/column_fixed_length_object.h" #include "core/column/column_vector.h" +#include "core/data_type/data_type_fixed_length_object.h" #include "core/data_type/data_type_number.h" #include "core/data_type/data_type_string.h" #include "format/parquet/delta_bit_pack_decoder.h" @@ -38,6 +40,16 @@ class DeltaByteArrayDecoderTest : public ::testing::Test { std::unique_ptr _decoder; }; +static void expect_fixed_length_value(const ColumnFixedLengthObject& column, size_t row, + const std::vector& expected) { + const auto value = column.get_data_at(row); + ASSERT_EQ(value.size, expected.size()); + for (size_t i = 0; i < expected.size(); ++i) { + EXPECT_EQ(static_cast(value.data[i]), expected[i]) + << "Mismatch at row " << row << ", byte " << i; + } +} + // Test basic decoding byte array functionality TEST_F(DeltaByteArrayDecoderTest, test_basic_decode_byte_array) { // Create ColumnDescriptor @@ -340,6 +352,60 @@ TEST_F(DeltaByteArrayDecoderTest, test_basic_decode_fixed_len_byte_array) { } } +TEST_F(DeltaByteArrayDecoderTest, test_basic_decode_fixed_len_byte_array_object) { + const int32_t type_length = 16; + int precision = 10; + int scale = 2; + _decoder->set_type_length(type_length); + + auto node = parquet::schema::PrimitiveNode::Make( + "test_column", parquet::Repetition::REQUIRED, parquet::Type::FIXED_LEN_BYTE_ARRAY, + parquet::ConvertedType::DECIMAL, type_length, precision, scale); + auto descr = std::make_shared(node, 0, 0); + + std::vector> test_fixed_len_buffers = { + {0x1a, 0x05, 0x06, 0x1b, 0x00, 0x00, 0x00, 0x13, 0x1c, 0x00, 0x00, 0x00, 
0x00, 0xbc, + 0x61, 0x40}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF}}; + + std::vector byte_array_values; + for (const auto& buffer : test_fixed_len_buffers) { + byte_array_values.emplace_back( + parquet::ByteArray {static_cast(buffer.size()), buffer.data()}); + } + + auto encoder = MakeTypedEncoder(parquet::Encoding::DELTA_BYTE_ARRAY, + /*use_dictionary=*/false, descr.get()); + ASSERT_NO_THROW( + encoder->Put(byte_array_values.data(), static_cast(byte_array_values.size()))); + auto encoded_buffer = encoder->FlushValues(); + Slice data_slice(encoded_buffer->data(), encoded_buffer->size()); + ASSERT_TRUE(_decoder->set_data(&data_slice).ok()); + + MutableColumnPtr column = ColumnFixedLengthObject::create(type_length); + DataTypePtr data_type = std::make_shared(); + + const size_t num_values = test_fixed_len_buffers.size(); + std::vector run_length_null_map(1, num_values); + std::vector filter_data(num_values, 1); + FilterMap filter_map; + ASSERT_TRUE(filter_map.init(filter_data.data(), filter_data.size(), false).ok()); + ColumnSelectVector select_vector; + ASSERT_TRUE(select_vector.init(run_length_null_map, num_values, nullptr, &filter_map, 0).ok()); + + ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok()); + + const auto* result_column = assert_cast(column.get()); + ASSERT_EQ(result_column->item_size(), type_length); + ASSERT_EQ(result_column->size(), num_values); + for (size_t i = 0; i < num_values; ++i) { + expect_fixed_length_value(*result_column, i, test_fixed_len_buffers[i]); + } +} + // Test decoding fixed-length byte array with filter TEST_F(DeltaByteArrayDecoderTest, test_decode_fixed_len_byte_array_with_filter) { // Configure DECIMAL type parameters @@ -418,6 +484,62 @@ TEST_F(DeltaByteArrayDecoderTest, test_decode_fixed_len_byte_array_with_filter) } } 
+TEST_F(DeltaByteArrayDecoderTest, test_decode_fixed_len_byte_array_object_with_filter_and_null) { + const int32_t type_length = 16; + int precision = 10; + int scale = 2; + _decoder->set_type_length(type_length); + + auto node = parquet::schema::PrimitiveNode::Make( + "test_column", parquet::Repetition::REQUIRED, parquet::Type::FIXED_LEN_BYTE_ARRAY, + parquet::ConvertedType::DECIMAL, type_length, precision, scale); + auto descr = std::make_shared(node, 0, 0); + + std::vector> test_fixed_len_buffers = { + {0x1a, 0x05, 0x06, 0x1b, 0x00, 0x00, 0x00, 0x13, 0x1c, 0x00, 0x00, 0x00, 0x00, 0xbc, + 0x61, 0x40}, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00}, + {0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC, 0xDE, 0xF0, 0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC, + 0xDE, 0xF0}}; + + std::vector byte_array_values; + for (const auto& buffer : test_fixed_len_buffers) { + byte_array_values.emplace_back( + parquet::ByteArray {static_cast(buffer.size()), buffer.data()}); + } + + auto encoder = MakeTypedEncoder(parquet::Encoding::DELTA_BYTE_ARRAY, + /*use_dictionary=*/false, descr.get()); + ASSERT_NO_THROW( + encoder->Put(byte_array_values.data(), static_cast(byte_array_values.size()))); + auto encoded_buffer = encoder->FlushValues(); + Slice data_slice(encoded_buffer->data(), encoded_buffer->size()); + ASSERT_TRUE(_decoder->set_data(&data_slice).ok()); + + MutableColumnPtr column = ColumnFixedLengthObject::create(type_length); + DataTypePtr data_type = std::make_shared(); + + const size_t num_values = 4; + std::vector run_length_null_map = {2, 1, 1}; // data: [Data 1, Data 2, null, Data 4] + std::vector filter_data = {1, 0, 1, 0}; // output: [Data 1, null] + FilterMap filter_map; + ASSERT_TRUE(filter_map.init(filter_data.data(), filter_data.size(), false).ok()); + ColumnSelectVector select_vector; + NullMap null_map; + ASSERT_TRUE( + select_vector.init(run_length_null_map, num_values, &null_map, &filter_map, 0).ok()); + + 
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok()); + + const auto* result_column = assert_cast(column.get()); + ASSERT_EQ(result_column->item_size(), type_length); + ASSERT_EQ(result_column->size(), 2); + expect_fixed_length_value(*result_column, 0, test_fixed_len_buffers[0]); + EXPECT_FALSE(null_map[0]); + EXPECT_TRUE(null_map[1]); +} + // Test decoding fixed-length byte array with filter and null values TEST_F(DeltaByteArrayDecoderTest, test_decode_fixed_len_byte_array_with_filter_and_null) { // Configure DECIMAL type parameters diff --git a/be/test/format/parquet/fix_length_dict_decoder_test.cpp b/be/test/format/parquet/fix_length_dict_decoder_test.cpp index a8050663b43332..afd419c546954e 100644 --- a/be/test/format/parquet/fix_length_dict_decoder_test.cpp +++ b/be/test/format/parquet/fix_length_dict_decoder_test.cpp @@ -19,8 +19,10 @@ #include +#include "core/column/column_fixed_length_object.h" #include "core/column/column_vector.h" #include "core/custom_allocator.h" +#include "core/data_type/data_type_fixed_length_object.h" #include "core/data_type/data_type_number.h" #include "util/slice.h" @@ -48,6 +50,11 @@ class FixLengthDictDecoderTest : public ::testing::Test { size_t _type_length; }; +static std::string fixed_length_value(const ColumnFixedLengthObject& column, size_t row) { + const auto value = column.get_data_at(row); + return {value.data, value.size}; +} + // Test basic decoding functionality TEST_F(FixLengthDictDecoderTest, test_basic_decode) { MutableColumnPtr column = ColumnUInt8::create(); @@ -97,6 +104,39 @@ TEST_F(FixLengthDictDecoderTest, test_basic_decode) { EXPECT_EQ(decoded_strings[6], "banana"); } +TEST_F(FixLengthDictDecoderTest, test_decode_with_column_fixed_length_object) { + MutableColumnPtr column = ColumnFixedLengthObject::create(_type_length); + DataTypePtr data_type = std::make_shared(); + + // RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1] + std::vector rle_data 
= {2, 8, 0, 3, 0b00011001, 0}; + + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + ASSERT_TRUE(_decoder.set_data(&data_slice).ok()); + + const size_t num_values = 7; + std::vector run_length_null_map(1, num_values); + std::vector filter_data(num_values, 1); + FilterMap filter_map; + ASSERT_TRUE(filter_map.init(filter_data.data(), filter_data.size(), false).ok()); + ColumnSelectVector select_vector; + ASSERT_TRUE(select_vector.init(run_length_null_map, num_values, nullptr, &filter_map, 0).ok()); + + ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok()); + + const auto* result_column = assert_cast(column.get()); + ASSERT_EQ(result_column->item_size(), _type_length); + ASSERT_EQ(result_column->size(), num_values); + + EXPECT_EQ(fixed_length_value(*result_column, 0), "apple "); + EXPECT_EQ(fixed_length_value(*result_column, 1), "apple "); + EXPECT_EQ(fixed_length_value(*result_column, 2), "apple "); + EXPECT_EQ(fixed_length_value(*result_column, 3), "apple "); + EXPECT_EQ(fixed_length_value(*result_column, 4), "banana"); + EXPECT_EQ(fixed_length_value(*result_column, 5), "cherry"); + EXPECT_EQ(fixed_length_value(*result_column, 6), "banana"); +} + // Test decoding with filter TEST_F(FixLengthDictDecoderTest, test_decode_with_filter) { MutableColumnPtr column = ColumnUInt8::create(); @@ -144,6 +184,43 @@ TEST_F(FixLengthDictDecoderTest, test_decode_with_filter) { EXPECT_EQ(decoded_strings[4], "banana"); } +TEST_F(FixLengthDictDecoderTest, test_decode_fixed_length_object_with_filter_and_null) { + MutableColumnPtr column = ColumnFixedLengthObject::create(_type_length); + DataTypePtr data_type = std::make_shared(); + + // RLE encoded data: 4 zeros followed by 2, padded to 8 values, [0 0 0 0 2] + std::vector rle_data = {2, 8, 0, 3, 0b00000010, 0}; + + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + ASSERT_TRUE(_decoder.set_data(&data_slice).ok()); + + const size_t num_values = 7; + std::vector 
run_length_null_map {4, 1, 1, 1}; // data: [0 0 0 0 null 2 null] + std::vector filter_data = {1, 0, 1, 0, 1, 1, 1}; // output: [0 0 null 2 null] + + FilterMap filter_map; + ASSERT_TRUE(filter_map.init(filter_data.data(), filter_data.size(), false).ok()); + ColumnSelectVector select_vector; + NullMap null_map; + ASSERT_TRUE( + select_vector.init(run_length_null_map, num_values, &null_map, &filter_map, 0).ok()); + + ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok()); + + const auto* result_column = assert_cast(column.get()); + ASSERT_EQ(result_column->item_size(), _type_length); + ASSERT_EQ(result_column->size(), 5); + + EXPECT_EQ(fixed_length_value(*result_column, 0), "apple "); + EXPECT_EQ(fixed_length_value(*result_column, 1), "apple "); + EXPECT_EQ(fixed_length_value(*result_column, 3), "cherry"); + EXPECT_FALSE(null_map[0]); + EXPECT_FALSE(null_map[1]); + EXPECT_TRUE(null_map[2]); + EXPECT_FALSE(null_map[3]); + EXPECT_TRUE(null_map[4]); +} + // Test decoding with filter and null TEST_F(FixLengthDictDecoderTest, test_decode_with_filter_and_null) { MutableColumnPtr column = ColumnUInt8::create(); diff --git a/be/test/format/parquet/fix_length_plain_decoder_test.cpp b/be/test/format/parquet/fix_length_plain_decoder_test.cpp index 5228074b1e382e..78b992c2a36416 100644 --- a/be/test/format/parquet/fix_length_plain_decoder_test.cpp +++ b/be/test/format/parquet/fix_length_plain_decoder_test.cpp @@ -19,7 +19,9 @@ #include +#include "core/column/column_fixed_length_object.h" #include "core/column/column_vector.h" +#include "core/data_type/data_type_fixed_length_object.h" #include "core/data_type/data_type_number.h" #include "util/slice.h" @@ -34,6 +36,11 @@ class FixLengthPlainDecoderTest : public ::testing::Test { size_t _type_length; }; +static std::string fixed_length_value(const ColumnFixedLengthObject& column, size_t row) { + const auto& value = column.get_data_at(row); + return std::string(value.data, value.size); +} + // Test basic 
decoding functionality TEST_F(FixLengthPlainDecoderTest, test_basic_decode) { // Prepare test data: create fixed-length integer values @@ -74,6 +81,39 @@ TEST_F(FixLengthPlainDecoderTest, test_basic_decode) { EXPECT_EQ(result_column->get_data()[2], 789); } +TEST_F(FixLengthPlainDecoderTest, test_decode_with_column_fixed_length_object) { + std::string values = "abcdefghijkl"; + _data = std::make_unique(values.size()); + memcpy(_data.get(), values.data(), values.size()); + + _data_slice = Slice(_data.get(), values.size()); + _type_length = 4; + + FixLengthPlainDecoder decoder; + decoder.set_type_length(_type_length); + ASSERT_TRUE(decoder.set_data(&_data_slice).ok()); + + MutableColumnPtr column = ColumnFixedLengthObject::create(_type_length); + DataTypePtr data_type = std::make_shared(); + + size_t num_values = 3; + std::vector run_length_null_map(1, num_values); + std::vector filter_data(num_values, 1); + FilterMap filter_map; + ASSERT_TRUE(filter_map.init(filter_data.data(), filter_data.size(), false).ok()); + ColumnSelectVector select_vector; + ASSERT_TRUE(select_vector.init(run_length_null_map, num_values, nullptr, &filter_map, 0).ok()); + + ASSERT_TRUE(decoder.decode_values(column, data_type, select_vector, false).ok()); + + ASSERT_EQ(column->size(), num_values); + const auto* result_column = assert_cast(column.get()); + EXPECT_EQ(result_column->item_size(), _type_length); + EXPECT_EQ(fixed_length_value(*result_column, 0), "abcd"); + EXPECT_EQ(fixed_length_value(*result_column, 1), "efgh"); + EXPECT_EQ(fixed_length_value(*result_column, 2), "ijkl"); +} + // Test decoding with filter TEST_F(FixLengthPlainDecoderTest, test_decode_with_filter) { // Prepare test data: create fixed-length integer values @@ -113,6 +153,42 @@ TEST_F(FixLengthPlainDecoderTest, test_decode_with_filter) { EXPECT_EQ(result_column->get_data()[1], 789); } +TEST_F(FixLengthPlainDecoderTest, test_decode_fixed_length_object_with_filter_and_null) { + std::string values = "abcdefgh"; + _data = 
std::make_unique(values.size()); + memcpy(_data.get(), values.data(), values.size()); + + _data_slice = Slice(_data.get(), values.size()); + _type_length = 4; + + FixLengthPlainDecoder decoder; + decoder.set_type_length(_type_length); + ASSERT_TRUE(decoder.set_data(&_data_slice).ok()); + + MutableColumnPtr column = ColumnFixedLengthObject::create(_type_length); + DataTypePtr data_type = std::make_shared(); + + size_t num_values = 3; + std::vector run_length_null_map = {1, 1, 1}; // data: [abcd, null, efgh] + std::vector filter_data = {0, 1, 1}; // output: [null, efgh] + FilterMap filter_map; + ASSERT_TRUE(filter_map.init(filter_data.data(), filter_data.size(), false).ok()); + ColumnSelectVector select_vector; + NullMap null_map; + ASSERT_TRUE( + select_vector.init(run_length_null_map, num_values, &null_map, &filter_map, 0).ok()); + + ASSERT_TRUE(decoder.decode_values(column, data_type, select_vector, false).ok()); + + ASSERT_EQ(column->size(), 2); + const auto* result_column = assert_cast(column.get()); + EXPECT_EQ(result_column->item_size(), _type_length); + EXPECT_EQ(fixed_length_value(*result_column, 1), "efgh"); + EXPECT_EQ(null_map.size(), 2); + EXPECT_TRUE(null_map[0]); + EXPECT_FALSE(null_map[1]); +} + // Test decoding with filter and null TEST_F(FixLengthPlainDecoderTest, test_decode_with_filter_and_null) { // Prepare test data: create fixed-length integer values From d1297be292b63fbacf583ddd869ba04056f9eb04 Mon Sep 17 00:00:00 2001 From: zhaochangle Date: Mon, 18 May 2026 12:38:11 +0800 Subject: [PATCH 04/11] [fix](be) Restore block columns on COW mutation errors Issue Number: close #xxx Related PR: #63001 Problem Summary: A few COW mutation call sites still manually moved column owners out of a live Block and restored them only at the end of the local success path. In RowIdStorageReader::read_doris_format_row(), seek_and_read_by_rowid() may return an error after moving one result column, leaving result_block with a moved-from column. 
StreamingAggLocalState::_pre_agg_with_serialized_key() had the same restore-on-error risk in the mem-reuse output path when streaming_agg_serialize_to_column() returned an error. This change uses Block::mutate_column_scoped() for rowid single-column owner slots and Block::mutate_columns_scoped() for the streaming pre-agg mem-reuse output block, so every Status return path restores the live Block. Adjacent rowid single-column append paths were also moved to the same scoped owner-slot pattern for consistency. None - Test: Unit Test - ./run-be-ut.sh --run --filter=StreamingAggOperatorTest.*:BlockTest.ScopedMutableColumnRestoreOnErrorDetachSharedAndCreateMissingColumn:BlockTest.ScopedMutableColumnsRestoreOnErrorAndDetachSharedColumn - Behavior changed: No - Does this need documentation: No --- .../streaming_aggregation_operator.cpp | 53 +++++++++---------- be/src/exec/rowid_fetcher.cpp | 9 ++-- 2 files changed, 30 insertions(+), 32 deletions(-) diff --git a/be/src/exec/operator/streaming_aggregation_operator.cpp b/be/src/exec/operator/streaming_aggregation_operator.cpp index 49eba560e4bb19..b3e789e6e0f8a2 100644 --- a/be/src/exec/operator/streaming_aggregation_operator.cpp +++ b/be/src/exec/operator/streaming_aggregation_operator.cpp @@ -368,17 +368,26 @@ Status StreamingAggLocalState::_pre_agg_with_serialized_key(doris::Block* in_blo } bool mem_reuse = p._make_nullable_keys.empty() && out_block->mem_reuse(); + if (mem_reuse) { + auto columns_guard = out_block->mutate_columns_scoped(); + MutableColumns& columns = columns_guard.mutable_columns(); + for (int i = 0; i != _aggregate_evaluators.size(); ++i) { + SCOPED_TIMER(_insert_values_to_column_timer); + RETURN_IF_ERROR(_aggregate_evaluators[i]->streaming_agg_serialize_to_column( + in_block, columns[i + key_size], rows, _agg_arena_pool)); + } + for (int i = 0; i < key_size; ++i) { + columns[i]->insert_range_from(*key_columns[i], 0, rows); + } + return Status::OK(); + } + std::vector data_types; MutableColumns 
value_columns; for (int i = 0; i < _aggregate_evaluators.size(); ++i) { auto data_type = _aggregate_evaluators[i]->function()->get_serialized_type(); - if (mem_reuse) { - value_columns.emplace_back(IColumn::mutate( - std::move(out_block->get_by_position(i + key_size).column))); - } else { - value_columns.emplace_back( - _aggregate_evaluators[i]->function()->create_serialize_column()); - } + value_columns.emplace_back( + _aggregate_evaluators[i]->function()->create_serialize_column()); data_types.emplace_back(data_type); } @@ -388,28 +397,16 @@ Status StreamingAggLocalState::_pre_agg_with_serialized_key(doris::Block* in_blo in_block, value_columns[i], rows, _agg_arena_pool)); } - if (!mem_reuse) { - ColumnsWithTypeAndName columns_with_schema; - for (int i = 0; i < key_size; ++i) { - columns_with_schema.emplace_back(key_columns[i]->clone_resized(rows), - _probe_expr_ctxs[i]->root()->data_type(), - _probe_expr_ctxs[i]->root()->expr_name()); - } - for (int i = 0; i < value_columns.size(); ++i) { - columns_with_schema.emplace_back(std::move(value_columns[i]), data_types[i], ""); - } - out_block->swap(Block(columns_with_schema)); - } else { - MutableColumns columns(out_block->columns()); - for (int i = 0; i < key_size; ++i) { - columns[i] = IColumn::mutate(std::move(out_block->get_by_position(i).column)); - columns[i]->insert_range_from(*key_columns[i], 0, rows); - } - for (int i = 0; i < value_columns.size(); ++i) { - columns[key_size + i] = std::move(value_columns[i]); - } - out_block->set_columns(std::move(columns)); + ColumnsWithTypeAndName columns_with_schema; + for (int i = 0; i < key_size; ++i) { + columns_with_schema.emplace_back(key_columns[i]->clone_resized(rows), + _probe_expr_ctxs[i]->root()->data_type(), + _probe_expr_ctxs[i]->root()->expr_name()); + } + for (int i = 0; i < value_columns.size(); ++i) { + columns_with_schema.emplace_back(std::move(value_columns[i]), data_types[i], ""); } + out_block->swap(Block(columns_with_schema)); } else { bool need_agg = 
true; if (need_do_sort_limit != 1) { diff --git a/be/src/exec/rowid_fetcher.cpp b/be/src/exec/rowid_fetcher.cpp index 34e124c421967e..dc6d7822d354be 100644 --- a/be/src/exec/rowid_fetcher.cpp +++ b/be/src/exec/rowid_fetcher.cpp @@ -192,11 +192,11 @@ Status RowIDFetcher::_merge_rpc_results(const PMultiGetRequest& request, partial_block.dump_types()); } else { for (int i = 0; i < output_block->columns(); ++i) { - auto column = IColumn::mutate(std::move(output_block->get_by_position(i).column)); + auto column_guard = output_block->mutate_column_scoped(i); + MutableColumnPtr& column = column_guard.mutable_column(); column->insert_range_from( *partial_block.get_by_position(i).column->convert_to_full_column_if_const(), 0, partial_block.rows()); - output_block->replace_by_position(i, std::move(column)); } } return Status::OK(); @@ -370,9 +370,10 @@ struct DorisFormatReadBatch { static void scatter_scan_blocks_to_result_block( const std::vector>& row_id_block_idx, - std::vector& scan_blocks, Block& result_block) { + const std::vector& scan_blocks, Block& result_block) { for (size_t column_id = 0; column_id < result_block.columns(); ++column_id) { - auto dst_col = const_cast(result_block.get_by_position(column_id).column.get()); + auto dst_col_guard = result_block.mutate_column_scoped(column_id); + MutableColumnPtr& dst_col = dst_col_guard.mutable_column(); std::vector scan_src_columns; scan_src_columns.reserve(row_id_block_idx.size()); From a3fe26acbc8bc46e2353af464ee66345b58bf1fe Mon Sep 17 00:00:00 2001 From: zhaochangle Date: Mon, 18 May 2026 17:00:40 +0800 Subject: [PATCH 05/11] fix --- be/src/core/block/block.cpp | 91 ++++++++++++++-- be/src/core/block/block.h | 44 ++++++-- be/src/core/column/column.h | 7 ++ be/src/core/column/column_array.h | 4 +- be/src/core/column/column_map.h | 4 +- be/src/core/column/column_nullable.h | 4 +- be/src/core/cow.h | 7 +- be/test/core/block/block_test.cpp | 151 +++++++++++++++++++++++++++ 8 files changed, 287 insertions(+), 25 
deletions(-) diff --git a/be/src/core/block/block.cpp b/be/src/core/block/block.cpp index 887dd48a349679..99c8ece8b512e1 100644 --- a/be/src/core/block/block.cpp +++ b/be/src/core/block/block.cpp @@ -79,6 +79,51 @@ template void clear_blocks(moodycamel::ConcurrentQueue&, template void clear_blocks(moodycamel::ConcurrentQueue&, RuntimeProfile::Counter* memory_used_counter); +namespace { + +// The no-clone fast path is only safe when the whole column tree is uniquely +// owned. A composite column with shared children still needs COW detachment. +bool is_recursively_exclusive(const IColumn& column) { + if (!column.is_exclusive()) { + return false; + } + + bool exclusive = true; + IColumn::ColumnCallback callback = [&](IColumn::WrappedPtr& subcolumn) { + if (!exclusive) { + return; + } + const ColumnPtr& subcolumn_ptr = const_cast(subcolumn); + DCHECK(subcolumn_ptr); + exclusive = is_recursively_exclusive(*subcolumn_ptr); + }; + // `for_each_subcolumn` only exposes a mutable callback type. This callback + // only reads the wrapped pointers and never calls the non-const accessors. + const_cast(column).for_each_subcolumn(callback); + return exclusive; +} + +// Acquire one live Block slot transactionally. Shared columns are detached while +// the original slot is still intact, so a clone failure cannot leave Block with +// a moved-from/null column. Exclusive column trees keep the stealing fast path. 
+MutableColumnPtr scoped_mutate_column(ColumnPtr& column, const DataTypePtr& type) { + DCHECK(type); + if (!column) { + return type->create_column(); + } + + MutableColumnPtr mutable_column; + if (is_recursively_exclusive(*column)) { + mutable_column = std::move(*column).mutate(); + } else { + mutable_column = IColumn::mutate(column); + } + column = nullptr; + return mutable_column; +} + +} // namespace + Block::Block(std::initializer_list il) : data {il} {} Block::Block(ColumnsWithTypeAndName data_) : data {std::move(data_)} {} @@ -576,8 +621,24 @@ Columns Block::get_columns_and_convert() { return columns; } -Block::ScopedMutableColumns::ScopedMutableColumns(Block& block) - : _block(&block), _columns(std::move(block).mutate_columns()) {} +Block::ScopedMutableColumns::ScopedMutableColumns(Block& block) : _block(&block) { + const size_t num_columns = block.data.size(); + _columns.resize(num_columns); + size_t acquired_columns = 0; + try { + for (; acquired_columns < num_columns; ++acquired_columns) { + auto& column_with_type_and_name = block.data[acquired_columns]; + _columns[acquired_columns] = scoped_mutate_column(column_with_type_and_name.column, + column_with_type_and_name.type); + } + } catch (...) 
{ + for (size_t i = 0; i < acquired_columns; ++i) { + block.data[i].column = std::move(_columns[i]); + } + _block = nullptr; + throw; + } +} Block::ScopedMutableColumns::~ScopedMutableColumns() { restore(); @@ -606,6 +667,12 @@ const std::string& Block::ScopedMutableColumns::get_name_by_position(size_t posi return _block->get_by_position(position).name; } +MutableColumns Block::ScopedMutableColumns::release() { + DCHECK(_block != nullptr); + _block = nullptr; + return std::move(_columns); +} + void Block::ScopedMutableColumns::restore() { if (_block != nullptr) { _block->set_columns(std::move(_columns)); @@ -618,9 +685,8 @@ Block::ScopedMutableColumn::ScopedMutableColumn(Block& block, size_t position) DCHECK_LT(_position, _block->data.size()); auto& column_with_type_and_name = _block->data[_position]; DCHECK(column_with_type_and_name.type); - _column = column_with_type_and_name.column - ? IColumn::mutate(std::move(column_with_type_and_name.column)) - : column_with_type_and_name.type->create_column(); + _column = + scoped_mutate_column(column_with_type_and_name.column, column_with_type_and_name.type); } Block::ScopedMutableColumn::~ScopedMutableColumn() { @@ -659,6 +725,17 @@ Block::ScopedMutableColumn Block::mutate_column_scoped(size_t position) & { return ScopedMutableColumn(*this, position); } +ScopedMutableBlock::ScopedMutableBlock(Block* block) { + DCHECK(block != nullptr); + DataTypes data_types = block->get_data_types(); + std::vector names = block->get_names(); + auto columns_guard = block->mutate_columns_scoped(); + _mutable_block.data_types() = std::move(data_types); + _mutable_block.get_names() = std::move(names); + _mutable_block.set_mutable_columns(columns_guard.release()); + _block = block; +} + MutableColumns Block::mutate_columns() && { size_t num_columns = data.size(); MutableColumns columns(num_columns); @@ -727,7 +804,7 @@ void Block::clear() { data.clear(); } -void Block::clear_column_data(int64_t column_size) noexcept { +void 
Block::clear_column_data(int64_t column_size) { SCOPED_SKIP_MEMORY_CHECK(); // data.size() greater than column_size, means here have some // function exec result in block, need erase it here @@ -747,7 +824,7 @@ void Block::clear_column_data(int64_t column_size) noexcept { } } -void Block::clear_column_data(const std::vector& columns_to_clear) noexcept { +void Block::clear_column_data(const std::vector& columns_to_clear) { SCOPED_SKIP_MEMORY_CHECK(); for (auto col : columns_to_clear) { DCHECK_LT(col, data.size()); diff --git a/be/src/core/block/block.h b/be/src/core/block/block.h index ef05274cf636a6..3b97cc0fcf86ee 100644 --- a/be/src/core/block/block.h +++ b/be/src/core/block/block.h @@ -212,6 +212,10 @@ class Block { /** Get empty columns with the same types as in block. */ MutableColumns clone_empty_columns() const; + // RAII owner for mutating columns borrowed from a live Block. While the + // guard is alive, the Block's column slots are moved out and column data + // must be accessed through mutable_columns(). The guard restores columns on + // destruction, so use it when the caller may exit early after detaching. class ScopedMutableColumns { public: explicit ScopedMutableColumns(Block& block); @@ -227,6 +231,11 @@ class Block { const DataTypePtr& get_datatype_by_position(size_t position) const; const std::string& get_name_by_position(size_t position) const; + // Transfer the borrowed owners to another RAII object that will restore + // them. After release(), the original Block remains without columns + // until that owner restores them. Normal callers should let this guard + // restore on destruction. + MutableColumns release(); void restore(); private: @@ -234,6 +243,8 @@ class Block { MutableColumns _columns; }; + // Single-column variant for localized mutation of a live Block slot. The + // selected slot is unavailable from the Block until this guard restores it. 
class ScopedMutableColumn { public: ScopedMutableColumn(Block& block, size_t position); @@ -259,11 +270,13 @@ class Block { MutableColumns mutate_columns() &&; MutableColumns mutate_columns() & = delete; - /** Get columns from a live block for mutation and restore them on every exit path. */ + /** Temporarily mutate a live Block's columns. The returned guard owns the columns and + * restores them on destruction; prefer this over manual move/writeback. + */ ScopedMutableColumns mutate_columns_scoped() &; ScopedMutableColumns mutate_columns_scoped() && = delete; - /** Get one column from a live block for mutation and restore it on every exit path. */ + /** Temporarily mutate one live Block column; use when only one slot needs ownership. */ ScopedMutableColumn mutate_column_scoped(size_t position) &; ScopedMutableColumn mutate_column_scoped(size_t position) && = delete; @@ -276,10 +289,11 @@ class Block { // Shuffle columns in place based on the result_column_ids void shuffle_columns(const std::vector& result_column_ids); - // Default column size = -1 means clear all column in block - // Else clear column [0, column_size) delete column [column_size, data.size) - void clear_column_data(int64_t column_size = -1) noexcept; - void clear_column_data(const std::vector& columns_to_clear) noexcept; + // column_size == -1 clears all columns; otherwise clear [0, column_size) + // and drop the rest. Shared columns are detached through clone_empty(), so + // allocation or clone failures propagate. + void clear_column_data(int64_t column_size = -1); + void clear_column_data(const std::vector& columns_to_clear); MOCK_FUNCTION bool mem_reuse() { return !data.empty(); } @@ -434,6 +448,10 @@ class MutableBlock { std::vector _names; public: + // Build from a consumed Block. This has no restore contract: the source + // Block is left without columns and must not be used as a live output block. 
+ // For caller-owned live Blocks, use ScopedMutableBlock or + // mutate_columns_scoped() instead. static MutableBlock build_mutable_block(Block&& block) { return MutableBlock(std::move(block)); } @@ -448,6 +466,8 @@ class MutableBlock { _data_types(std::move(m_block._data_types)), _names(std::move(m_block._names)) {} + // Consumes block columns and converts them to mutable columns recursively. + // This constructor is for temporary/owned Blocks only. MutableBlock(Block&& block) : _columns(std::move(block).mutate_columns()), _data_types(block.get_data_types()), @@ -643,7 +663,8 @@ class MutableBlock { _names.clear(); } - // columns resist. columns' inner data removed. + // Clear owned mutable columns in place. MutableBlock already owns its + // columns exclusively, so this does not perform COW detaching or cloning. void clear_column_data() noexcept; size_t allocated_bytes() const; @@ -663,13 +684,14 @@ class MutableBlock { std::string dump_names() const; }; +// RAII adapter for code that wants the MutableBlock API over a live Block. It +// owns only the temporary mutable columns and restores them to the Block on +// destruction. While the adapter is alive, read/write column data through +// mutable_block()/mutable_columns(); the Block's column slots are moved out. class ScopedMutableBlock { public: ScopedMutableBlock() = delete; - explicit ScopedMutableBlock(Block* block) : _block(block) { - DCHECK(_block != nullptr); - _mutable_block = MutableBlock(std::move(*_block)); - } + explicit ScopedMutableBlock(Block* block); ~ScopedMutableBlock() { restore(); } ScopedMutableBlock(const ScopedMutableBlock&) = delete; diff --git a/be/src/core/column/column.h b/be/src/core/column/column.h index c48c7a55da84cd..6a443b6cac6d92 100644 --- a/be/src/core/column/column.h +++ b/be/src/core/column/column.h @@ -579,6 +579,9 @@ class IColumn : public COW { return false; } + // Recursively make a mutable column tree. 
Use this rvalue member when the + // current column object is being consumed. Shared nodes are cloned, while + // exclusive nodes are reused through the COW fast path. MutablePtr mutate() const&& { MutablePtr res = shallow_mutate(); res->for_each_subcolumn([](WrappedPtr& subcolumn) { @@ -588,6 +591,10 @@ class IColumn : public COW { return res; } + // COW entry point for a ColumnPtr. Passing the pointer by value keeps the + // original owner alive until the top-level detach succeeds; passing + // std::move(ptr) explicitly consumes that owner. Subcolumns are still + // recursively detached as needed. static MutablePtr mutate(Ptr ptr) { MutablePtr res = ptr->shallow_mutate(); /// Now use_count is 2. ptr.reset(); /// Reset use_count to 1. diff --git a/be/src/core/column/column_array.h b/be/src/core/column/column_array.h index dba4f046ec2350..06eb3d2123e6a5 100644 --- a/be/src/core/column/column_array.h +++ b/be/src/core/column/column_array.h @@ -96,8 +96,8 @@ class ColumnArray final : public COWHelper { Offsets64; public: - /** Create immutable column using immutable arguments. This arguments may be shared with other columns. - * Use IColumn::mutate in order to make mutable column and mutate shared nested columns. + /** Create a column from immutable/shared subcolumns without cloning them. + * Call IColumn::mutate before modifying the returned column tree. */ using Base = COWHelper; diff --git a/be/src/core/column/column_map.h b/be/src/core/column/column_map.h index 25ce7cfbbd4c2e..fa67caa654e25a 100644 --- a/be/src/core/column/column_map.h +++ b/be/src/core/column/column_map.h @@ -53,8 +53,8 @@ class Arena; */ class ColumnMap final : public COWHelper { public: - /** Create immutable column using immutable arguments. This arguments may be shared with other columns. - * Use IColumn::mutate in order to make mutable column and mutate shared nested columns. + /** Create a column from immutable/shared subcolumns without cloning them. 
+ * Call IColumn::mutate before modifying the returned column tree. */ using Base = COWHelper; using COffsets = ColumnArray::ColumnOffsets; diff --git a/be/src/core/column/column_nullable.h b/be/src/core/column/column_nullable.h index 01cdddf776effe..a81e3e6e1c54a4 100644 --- a/be/src/core/column/column_nullable.h +++ b/be/src/core/column/column_nullable.h @@ -60,8 +60,8 @@ class ColumnNullable final : public COWHelper { ColumnNullable(const ColumnNullable&) = default; public: - /** Create immutable column using immutable arguments. This arguments may be shared with other columns. - * Use IColumn::mutate in order to make mutable column and mutate shared nested columns. + /** Create a column from immutable/shared subcolumns without cloning them. + * Call IColumn::mutate before modifying the returned column tree. */ using Base = COWHelper; static MutablePtr create(const ColumnPtr& nested_column_, const ColumnPtr& null_map_) { diff --git a/be/src/core/cow.h b/be/src/core/cow.h index a0dd93bf545d20..4fb6059a1fc111 100644 --- a/be/src/core/cow.h +++ b/be/src/core/cow.h @@ -316,6 +316,10 @@ class COW { public: MutablePtr mutate() const&& { return shallow_mutate(); } + // Ownership assertion for callers that have already proved this object is + // uniquely owned. This does not detach shared owners; use a type-specific + // COW entry point (for example IColumn::mutate) when the pointer may be + // shared. MutablePtr assume_mutable() const { if (this->use_count() > 1) { throw Exception(ErrorCode::INTERNAL_ERROR, "COW::assume_mutable: use_count() > 1"); @@ -323,6 +327,7 @@ class COW { return const_cast(this)->get_ptr(); } + // Reference variant of assume_mutable(), with the same ownership contract. 
Derived& assume_mutable_ref() const { if (this->use_count() > 1) { throw Exception(ErrorCode::INTERNAL_ERROR, "COW::assume_mutable: use_count() > 1"); @@ -461,4 +466,4 @@ class COWHelper : public Base { return MutablePtr(static_cast(Base::shallow_mutate().get())); } }; -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/test/core/block/block_test.cpp b/be/test/core/block/block_test.cpp index 6d9389f26c9947..bdceb868cf53de 100644 --- a/be/test/core/block/block_test.cpp +++ b/be/test/core/block/block_test.cpp @@ -35,12 +35,14 @@ #include "agent/be_exec_version_manager.h" #include "common/config.h" +#include "common/exception.h" #include "common/object_pool.h" #include "core/column/column.h" #include "core/column/column_array.h" #include "core/column/column_complex.h" #include "core/column/column_const.h" #include "core/column/column_decimal.h" +#include "core/column/column_dummy.h" #include "core/column/column_nullable.h" #include "core/column/column_string.h" #include "core/column/column_vector.h" @@ -67,6 +69,47 @@ namespace doris { +namespace { + +class ThrowOnCloneColumn final : public COWHelper { +private: + friend class COWHelper; + + ThrowOnCloneColumn(size_t size, bool throw_on_clone, bool throw_on_clone_empty) + : _throw_on_clone(throw_on_clone), _throw_on_clone_empty(throw_on_clone_empty) { + s = size; + } + + ThrowOnCloneColumn(const ThrowOnCloneColumn&) = default; + + MutableColumnPtr clone() const override { + if (_throw_on_clone) { + throw Exception(ErrorCode::INTERNAL_ERROR, "injected clone failure"); + } + return MutableColumnPtr(new ThrowOnCloneColumn(*this)); + } + +public: + std::string get_name() const override { return "ThrowOnClone"; } + + MutableColumnPtr clone_dummy(size_t size) const override { + if (_throw_on_clone_empty) { + throw Exception(ErrorCode::INTERNAL_ERROR, "injected clone_empty failure"); + } + return ThrowOnCloneColumn::create(size, _throw_on_clone, _throw_on_clone_empty); + } + + bool 
structure_equals(const IColumn& rhs) const override { + return typeid(rhs) == typeid(ThrowOnCloneColumn); + } + +private: + bool _throw_on_clone = false; + bool _throw_on_clone_empty = false; +}; + +} // namespace + void block_to_pb( const Block& block, PBlock* pblock, segment_v2::CompressionTypePB compression_type = segment_v2::CompressionTypePB::SNAPPY) { @@ -1317,6 +1360,36 @@ TEST(BlockTest, ClearSelectedColumnDataClonesSharedColumn) { EXPECT_EQ(block.get_by_position(1).column.get(), old_col1.get()); } +TEST(BlockTest, ClearColumnDataPropagatesSharedCloneEmptyFailure) { + auto type = std::make_shared(); + auto mutable_col = ThrowOnCloneColumn::create(2, false, true); + ColumnPtr old_col = mutable_col->get_ptr(); + + Block block; + block.insert({std::move(mutable_col), type, "c0"}); + + EXPECT_THROW(block.clear_column_data(), Exception); + ASSERT_NE(block.get_by_position(0).column.get(), nullptr); + EXPECT_EQ(block.get_by_position(0).column.get(), old_col.get()); + EXPECT_EQ(block.get_by_position(0).column->size(), 2); + EXPECT_EQ(old_col->size(), 2); +} + +TEST(BlockTest, ClearSelectedColumnDataPropagatesSharedCloneEmptyFailure) { + auto type = std::make_shared(); + auto mutable_col = ThrowOnCloneColumn::create(2, false, true); + ColumnPtr old_col = mutable_col->get_ptr(); + + Block block; + block.insert({std::move(mutable_col), type, "c0"}); + + EXPECT_THROW(block.clear_column_data(std::vector {0}), Exception); + ASSERT_NE(block.get_by_position(0).column.get(), nullptr); + EXPECT_EQ(block.get_by_position(0).column.get(), old_col.get()); + EXPECT_EQ(block.get_by_position(0).column->size(), 2); + EXPECT_EQ(old_col->size(), 2); +} + TEST(BlockTest, ScopedMutableColumnsRestoreOnErrorAndDetachSharedColumn) { auto type = std::make_shared(); auto mutable_col = ColumnInt32::create(); @@ -1355,6 +1428,35 @@ TEST(BlockTest, ScopedMutableColumnsReadSchemaFromLiveBlock) { EXPECT_EQ(columns_guard.get_name_by_position(0), "c0"); } +TEST(BlockTest, 
ScopedMutableColumnsConstructorFailureRestoresAcquiredColumns) { + auto type = std::make_shared(); + auto mutable_col = ColumnInt32::create(); + mutable_col->insert_value(1); + mutable_col->insert_value(2); + const IColumn* old_col = mutable_col.get(); + + auto throwing_col = ThrowOnCloneColumn::create(2, true, false); + ColumnPtr old_throwing_col = throwing_col->get_ptr(); + + Block block; + block.insert({std::move(mutable_col), type, "c0"}); + block.insert({std::move(throwing_col), type, "throwing"}); + + EXPECT_THROW( + [&]() { + auto columns_guard = block.mutate_columns_scoped(); + static_cast(columns_guard); + }(), + Exception); + + ASSERT_NE(block.get_by_position(0).column.get(), nullptr); + ASSERT_NE(block.get_by_position(1).column.get(), nullptr); + EXPECT_EQ(block.get_by_position(0).column.get(), old_col); + EXPECT_EQ(block.get_by_position(0).column->size(), 2); + EXPECT_EQ(block.get_by_position(1).column.get(), old_throwing_col.get()); + EXPECT_EQ(block.get_by_position(1).column->size(), 2); +} + TEST(BlockTest, ScopedMutableColumnRestoreOnErrorDetachSharedAndCreateMissingColumn) { auto type = std::make_shared(); auto mutable_col = ColumnInt32::create(); @@ -1388,6 +1490,26 @@ TEST(BlockTest, ScopedMutableColumnRestoreOnErrorDetachSharedAndCreateMissingCol EXPECT_EQ(block.get_by_position(1).column->size(), 1); } +TEST(BlockTest, ScopedMutableColumnConstructorFailureKeepsOriginalColumn) { + auto type = std::make_shared(); + auto throwing_col = ThrowOnCloneColumn::create(2, true, false); + ColumnPtr old_throwing_col = throwing_col->get_ptr(); + + Block block; + block.insert({std::move(throwing_col), type, "throwing"}); + + EXPECT_THROW( + [&]() { + auto column_guard = block.mutate_column_scoped(0); + static_cast(column_guard); + }(), + Exception); + + ASSERT_NE(block.get_by_position(0).column.get(), nullptr); + EXPECT_EQ(block.get_by_position(0).column.get(), old_throwing_col.get()); + EXPECT_EQ(block.get_by_position(0).column->size(), 2); +} + 
TEST(BlockTest, ScopedMutableBlockRestoreOnErrorAndDetachSharedColumn) { auto type = std::make_shared(); auto mutable_col = ColumnInt32::create(); @@ -1410,4 +1532,33 @@ TEST(BlockTest, ScopedMutableBlockRestoreOnErrorAndDetachSharedColumn) { EXPECT_NE(block.get_by_position(0).column.get(), old_col.get()); } +TEST(BlockTest, ScopedMutableBlockConstructorFailureRestoresBlockColumns) { + auto type = std::make_shared(); + auto mutable_col = ColumnInt32::create(); + mutable_col->insert_value(1); + mutable_col->insert_value(2); + const IColumn* old_col = mutable_col.get(); + + auto throwing_col = ThrowOnCloneColumn::create(2, true, false); + ColumnPtr old_throwing_col = throwing_col->get_ptr(); + + Block block; + block.insert({std::move(mutable_col), type, "c0"}); + block.insert({std::move(throwing_col), type, "throwing"}); + + EXPECT_THROW( + [&]() { + ScopedMutableBlock scoped_block(&block); + static_cast<void>(scoped_block); + }(), + Exception); + + ASSERT_NE(block.get_by_position(0).column.get(), nullptr); + ASSERT_NE(block.get_by_position(1).column.get(), nullptr); + EXPECT_EQ(block.get_by_position(0).column.get(), old_col); + EXPECT_EQ(block.get_by_position(0).column->size(), 2); + EXPECT_EQ(block.get_by_position(1).column.get(), old_throwing_col.get()); + EXPECT_EQ(block.get_by_position(1).column->size(), 2); +} + } // namespace doris From 6372fb6d4f0cf63ed8ebab63532d3459c9d62e6c Mon Sep 17 00:00:00 2001 From: zhaochangle Date: Mon, 18 May 2026 21:07:48 +0800 Subject: [PATCH 06/11] [fix](be) Use scoped mutable block in merge tests Issue Number: close #xxx Related PR: #63001 Problem Summary: BE UT merges PR head into the latest master before building tests. After master added string-overflow MutableBlock merge tests, those tests still used the removed MutableBlock(Block*) live-block constructor. That constructor is intentionally unavailable in the new COW model because live output blocks need scoped restore-on-error ownership. 
This updates the tests to use ScopedMutableBlock while preserving the checked overflow and ignore-overflow assertions. None - Test: Unit Test - ./run-be-ut.sh --run --filter=BlockTest.merge_returns_error_when_checked_string_append_exceeds_limit:BlockTest.merge_ignore_overflow_keeps_owned_accumulation_convertible -j100 - Behavior changed: No - Does this need documentation: No --- be/test/core/block/block_test.cpp | 42 +++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/be/test/core/block/block_test.cpp b/be/test/core/block/block_test.cpp index bdceb868cf53de..e02e4ac43cd9ab 100644 --- a/be/test/core/block/block_test.cpp +++ b/be/test/core/block/block_test.cpp @@ -998,6 +998,48 @@ TEST(BlockTest, clear_blocks) { } } +TEST(BlockTest, merge_returns_error_when_checked_string_append_exceeds_limit) { + auto input_block = create_string_block({"abcde", "fghij"}); + auto output_block = create_string_block({}); + + auto string_overflow_size = config::string_overflow_size; + config::string_overflow_size = 9; + Defer defer([string_overflow_size]() { config::string_overflow_size = string_overflow_size; }); + + auto status = [&]() { + ScopedMutableBlock scoped_mutable_block(&output_block); + return scoped_mutable_block.mutable_block().merge(input_block); + }(); + ASSERT_FALSE(status.ok()); + EXPECT_NE(status.to_string().find("string column length is too large"), std::string::npos) + << status.to_string(); + + ASSERT_EQ(output_block.rows(), 0); + ASSERT_FALSE(output_block.get_by_position(0).column->is_column_string64()); +} + +TEST(BlockTest, merge_ignore_overflow_keeps_owned_accumulation_convertible) { + auto input_block = create_string_block({"abcde", "fghij"}); + auto output_block = create_string_block({}); + + auto string_overflow_size = config::string_overflow_size; + config::string_overflow_size = 9; + Defer defer([string_overflow_size]() { config::string_overflow_size = string_overflow_size; }); + + ColumnPtr converted_column; + { + 
ScopedMutableBlock scoped_mutable_block(&output_block); + auto& mutable_block = scoped_mutable_block.mutable_block(); + auto status = mutable_block.merge_ignore_overflow(input_block); + ASSERT_TRUE(status.ok()) << status.to_string(); + converted_column = mutable_block.get_column_by_position(0)->convert_column_if_overflow(); + } + ASSERT_TRUE(converted_column->is_column_string64()); + ASSERT_EQ(converted_column->size(), 2); + EXPECT_EQ(converted_column->get_data_at(0).to_string(), "abcde"); + EXPECT_EQ(converted_column->get_data_at(1).to_string(), "fghij"); +} + TEST(BlockTest, replace_by_position) { auto block = ColumnHelper::create_block({1, 2, 3}); block.insert(0, ColumnHelper::create_column_with_name({"a", "b", "c"})); From 17cb5ea586ed71961b688781500ac8d26e3d760b Mon Sep 17 00:00:00 2001 From: zhaochangle Date: Wed, 20 May 2026 21:32:57 +0800 Subject: [PATCH 07/11] [fix](be) Remove local COW audit doc from branch ### What problem does this PR solve? Issue Number: close #xxx Related PR: #63001 Problem Summary: The branch accidentally included a local COW audit document under docs/dev. The document is useful as local working notes, but it should not be part of the BE COW implementation diff. Remove it from the tracked branch state while leaving local notes outside the commit. ### Release note None ### Check List (For Author) - Test: No need to test (document removal only) - Behavior changed: No - Does this need documentation: No --- docs/dev/be-cow-assume-mutable-audit.md | 397 ------------------------ 1 file changed, 397 deletions(-) delete mode 100644 docs/dev/be-cow-assume-mutable-audit.md diff --git a/docs/dev/be-cow-assume-mutable-audit.md b/docs/dev/be-cow-assume-mutable-audit.md deleted file mode 100644 index bd7e01267b5a7f..00000000000000 --- a/docs/dev/be-cow-assume-mutable-audit.md +++ /dev/null @@ -1,397 +0,0 @@ -# BE COW assume_mutable audit - -Snapshot: 2026-05-09, branch `cow`. - -`assume_mutable()` is now an ownership assertion. 
A valid call must have a -nearby proof that the referenced `ColumnPtr` is exclusive. If exclusivity is not -local and obvious, mutate the owning handle and write it back: - -```cpp -auto column = IColumn::mutate(std::move(block.get_by_position(i).column)); -// mutate column -block.replace_by_position(i, std::move(column)); -``` - -For hot paths that append on every row, do not call `mutate()` per row. Keep a -real mutable owner such as `MutableBlock`/`MutableColumnPtr`, then materialize a -`Block` view only at the boundary. - -## Scan Commands - -Active call-site inventory: - -```bash -rg -n '\bassume_mutable(_ref)?\s*\(' be/src -S -``` - -Current result: 127 raw matches. These include API definitions, comments, and -the active call sites listed below. The current unique path list from the command -is fully covered by the table in this document. - -High-risk alias scan: - -```bash -rg -n 'get_columns\(\).*assume_mutable|assume_mutable\(\).*get_columns|const_cast<.*IColumn.*>\([^\n]*get_columns|get_columns\(\)\[[^\n]*\]\.get\(\)' be/src be/test -S -``` - -Current result: no remaining direct hit. This pattern matters because -`Block::get_columns()` copies `ColumnPtr`s and can introduce a temporary alias in -the same function before the mutable assertion. - -The direct owner-slot pattern was also scanned: - -```bash -rg -n '\.get_by_position\(.*\)\.column->assume_mutable' be/src -S -``` - -Those calls are not automatically safe; each one is classified below by its -real ownership evidence. - -## Lessons From The 2026-05-09 Recheck - -- The earlier file-level `OWNED_BLOCK` classification was too coarse. A current - scanner output block is not itself a proof if the same function first copies - its `ColumnPtr`s. 
-- `be/src/format/json/new_json_reader.cpp` had exactly that bug, confirmed by - the `test_hive_openx_json` external regression on the OpenX - `ignore.malformed.json=true` table: - the malformed-json helper iterated `block.get_columns()`, creating aliases, - then called `assume_mutable()`. It is now `append_null_for_malformed_json()`, - which mutates each owner slot and writes it back. Rollback paths now use - `truncate_block_to_rows()` with the same owner-writeback pattern. -- `be/src/exec/rowid_fetcher.cpp` had two unsafe patterns: appending into an - externally supplied output block with `assume_mutable()`, and writing into - `result_block.get_columns()` through `const_cast`. Both were changed to mutate - the owning slot and write back. Source scan-block columns are copied only into - stable read-only vectors. -- `be/src/core/block/block.cpp` should not be described as `OWNED_BLOCK`. - These helpers are safe because they branch on `is_exclusive()` or explicitly - clone/mutate before writeback. - -## Classification Legend - -- `LOCAL_RESULT`: the column is created locally or freshly cloned before the - call and is not published before mutation. -- `MUTATED_FIRST`: the owning `ColumnPtr` was moved through `IColumn::mutate()` - or another COW-safe producer before typed mutable access. -- `OWNED_OUTPUT`: the call writes into a scanner/operator/internal output block - whose caller contract is exclusive ownership, and this function does not - introduce another local alias before the call. -- `SUBCOLUMN_EXCLUSIVE`: the parent complex column is already exclusive, so its - nested columns are exclusive as part of the parent mutation path. -- `EXCLUSIVE_BRANCH`: the call is guarded by `is_exclusive()` or `use_count()==1` - and has a clone/mutate fallback for the shared case. -- `HELPER_CONTRACT`: API/helper accessors. Callers must prove ownership or use a - COW-safe mutate/writeback path. -- `COMMENT_ONLY`: not an active call site. 
-- `CHECKLIST_ONLY`: local checklist text, not code. - -## Active Call-Site Audit - -| File | Lines | Classification | Evidence / action | -| --- | --- | --- | --- | -| `be/src/core/AGENTS.md` | 18 | CHECKLIST_ONLY | Local review checklist. | -| `be/src/core/cow.h` | 312, 319, 326, 349, 355 | HELPER_CONTRACT | COW primitive API definitions and proxy operators. | -| `be/src/core/column/column_nullable.h` | 283, 391 | HELPER_CONTRACT | Mutable nested/null-map accessors assert subcolumn exclusivity. Callers that only own the parent through a shared `ColumnPtr` must mutate/write back first. | -| `be/src/core/column/column_array.cpp` | 66 | COMMENT_ONLY | Documents const access to avoid mutable assertion during construction. | -| `be/src/core/column/column_map.cpp` | 553 | COMMENT_ONLY | Documents const access after offsets writeback. | -| `be/src/core/column/column_nullable.cpp` | 380 | COMMENT_ONLY | Documents const nested access to avoid mutable assertion. | -| `be/src/core/block/block.cpp` | 659, 674, 737, 745, 775, 804, 823, 1106 | EXCLUSIVE_BRANCH | In-place block helpers clear/filter/shrink in place only when the column is exclusive; otherwise they clone, filter-return, or mutate/write back. This is the correct pattern for shared blocks. | - -| File | Lines | Classification | Evidence / action | -| --- | --- | --- | --- | -| `be/src/exprs/vruntimefilter_wrapper.cpp` | 126 | LOCAL_RESULT | `filter_column` is the runtime-filter result column passed to `change_null_to_true`. | -| `be/src/exprs/vtopn_pred.h` | 121 | LOCAL_RESULT | `result_column` is a freshly produced predicate column. | -| `be/src/exprs/vexpr_context.cpp` | 332, 375 | LOCAL_RESULT | Temporary expression result columns are cleared before reuse in the expression context. | -| `be/src/exprs/lambda_function/varray_sort_function.cpp` | 145 | OWNED_OUTPUT | The lambda block is built for the current lambda evaluation and no local `ColumnPtr` alias is introduced before mutation. 
| -| `be/src/exprs/lambda_function/varray_map_function.cpp` | 233, 242 | OWNED_OUTPUT | Lambda evaluation columns are local to the lambda block for this call. | -| `be/src/exprs/function/function_other_types_to_date.cpp` | 150, 154, 164, 168, 301, 305, 545, 615, 620, 1058, 1133 | LOCAL_RESULT | Function result columns are created by the execute path and filled before being returned. | -| `be/src/exprs/function/cast/cast_to_variant.h` | 97, 109, 121, 124 | LOCAL_RESULT | `col_to` is created by this cast path before defaults/null wrapping are inserted. | -| `be/src/exprs/function/function_variant_element.cpp` | 266, 268, 293 | LOCAL_RESULT | `result` is the newly created variant extraction output. | -| `be/src/exprs/function/function_variadic_arguments.h` | 64, 68, 74 | LOCAL_RESULT | `column` is created locally and assigned to the result only after writes complete. | -| `be/src/exprs/function/dictionary_util.h` | 63 | EXCLUSIVE_BRANCH | In-place filter is used only after `column->is_exclusive()`; otherwise the function replaces `column` with the clone-returning `filter()` result. | -| `be/src/exprs/function/array/function_array_with_constant.cpp` | 102 | LOCAL_RESULT | `clone` comes from `value->clone_empty()` and is filled before publication. | -| `be/src/exprs/function/array/function_array_aggregation.cpp` | 219, 231, 237, 443, 456, 462 | LOCAL_RESULT | Aggregate result column is the local destination passed by the array aggregation function. | -| `be/src/exprs/aggregate/aggregate_function_null_v2.h` | 217, 276 | LOCAL_RESULT | Destination nullable columns are newly created serialize/aggregate output. A former read-only source nested-column assertion was removed in this branch. | - -| File | Lines | Classification | Evidence / action | -| --- | --- | --- | --- | -| `be/src/exec/operator/operator.cpp` | 351, 358 | EXCLUSIVE_BRANCH | Projection helper steals/mutates input columns only after checking the source is exclusive; otherwise it materializes a replacement. 
| -| `be/src/exec/operator/aggregation_source_operator.cpp` | 549 | LOCAL_RESULT | `ptr = make_nullable(ptr, ...)` creates the nullable result before moving it into the output column list. | -| `be/src/exec/operator/distinct_streaming_aggregation_operator.cpp` | 210, 243 | LOCAL_RESULT | Key columns are locally materialized aggregate output columns. | -| `be/src/exec/operator/hashjoin_build_sink.cpp` | 189, 591 | LOCAL_RESULT | Join build helper constructs nullable columns locally before null-map mutation. | -| `be/src/exec/operator/join/process_hash_table_probe_impl.h` | 883 | OWNED_OUTPUT | Hash-join probe writes into its current output block. No `get_columns()` alias is introduced in this mutation path. | -| `be/src/exec/operator/nested_loop_join_probe_operator.h` | 52 | OWNED_OUTPUT | Macro clears local nested-loop join probe output columns. | -| `be/src/exec/operator/assert_num_rows_operator.cpp` | 94 | LOCAL_RESULT | Assertion operator creates and fills its output column locally. | -| `be/src/exec/sort/sorter.cpp` | 235, 240 | OWNED_OUTPUT | Sorter appends into its internal unsorted block. This is a long-lived sorter-owned block, not a per-row mutate path. | -| `be/src/exec/common/util.hpp` | 248 | SUBCOLUMN_EXCLUSIVE | Recursive helper receives a `MutableColumnPtr`; the `ColumnConst` wrapper is already mutable, so its data column is part of the same exclusive path. | -| `be/src/exec/rowid_fetcher.cpp` | 464 | LOCAL_RESULT | `result_block` is built locally from `Block(slots, request.row_locs().size())` before rowid reads. Former shared-output merge and `get_columns()`/`const_cast` paths were fixed to mutate/write back. | - -| File | Lines | Classification | Evidence / action | -| --- | --- | --- | --- | -| `be/src/format/json/new_json_reader.cpp` | 1016, 1022, 1062, 1500, 1610, 1625 | OWNED_OUTPUT | Normal JSON write/skip-bitmap paths append to the current scanner output block. The function does not copy the owner column before these calls. 
The previous malformed/rollback paths were changed to owner mutate/writeback helpers. | -| `be/src/format/json/new_json_reader.cpp` | 1270, 1278 | SUBCOLUMN_EXCLUSIVE | Map keys/values are subcolumns of the current exclusive map column passed into `_simdjson_write_data_to_column()`. | -| `be/src/format/column_type_convert.cpp` | 117 | LOCAL_RESULT | `_cached_src_column` is converter-owned cache state and is cleared before reuse. | -| `be/src/format/parquet/vparquet_column_reader.cpp` | 334, 377, 416, 424, 665, 673, 721, 729, 798, 806, 995 | MUTATED_FIRST | Parquet column readers mutate the destination handle or operate on reader-owned destination columns before typed mutable access. | -| `be/src/format/parquet/vparquet_column_reader.h` | 486 | OWNED_OUTPUT | Nested map reader destination column is owned by the active parquet read path. | -| `be/src/format/parquet/vparquet_group_reader.cpp` | 1061, 1071 | LOCAL_RESULT | Dictionary temporary block is local to the group reader. | -| `be/src/format/parquet/vparquet_reader.h` | 190 | OWNED_OUTPUT | TopN rowid synthesized column writes into the current parquet reader output block. | -| `be/src/format/parquet/parquet_column_convert.h` | 199, 239, 350 | MUTATED_FIRST | Conversion helpers mutate owning destination handles or converter-owned columns before typed access. The nullable null-map slice bug was fixed separately by copying from the appended source slice. | -| `be/src/format/parquet/parquet_column_convert.cpp` | 121 | LOCAL_RESULT | `_cached_src_physical_column` is converter-owned cache state. | -| `be/src/format/orc/vorc_reader.h` | 229 | OWNED_OUTPUT | TopN rowid synthesized column writes into the current ORC reader output block. | -| `be/src/format/orc/vorc_reader.cpp` | 2254 | MUTATED_FIRST | Schema-change conversion operates on the destination column for the current conversion path. | -| `be/src/format/orc/vorc_reader.cpp` | 3092, 3100 | LOCAL_RESULT | Dictionary temporary block is local to the ORC reader. 
| -| `be/src/format/table/table_format_reader.h` | 71, 106 | OWNED_OUTPUT | Partition/missing columns are filled into the current scanner output block. No local `get_columns()` alias is introduced. | -| `be/src/format/table/table_format_reader.h` | 113 | EXCLUSIVE_BRANCH | Default expression result is mutated only when `use_count()==1`; the shared case is not passed to `assume_mutable()`. | -| `be/src/format/table/es/es_http_reader.cpp` | 153 | OWNED_OUTPUT | ES reader materializes directly into the current output block column slots. | -| `be/src/format/table/iceberg_reader_mixin.h` | 162, 183 | OWNED_OUTPUT | Position delete helper writes synthesized columns into the current delete/output block. The old equality-delete `MutableBlock(&block)` missing-writeback bug has no remaining `assume_mutable()` call and is handled by `to_block()` writeback. | - -| File | Lines | Classification | Evidence / action | -| --- | --- | --- | --- | -| `be/src/storage/schema_change/schema_change.cpp` | 176, 183, 233, 383, 408 | LOCAL_RESULT | New schema-change columns/blocks are created locally before insertion or conversion. | -| `be/src/storage/partial_update_info.cpp` | 1010 | LOCAL_RESULT | `tmp_block` is freshly created for the sequence column before moving it into the target block. | -| `be/src/storage/segment/vertical_segment_writer.cpp` | 887, 890 | LOCAL_RESULT | Encoded default sequence value block is created locally and filled immediately. | -| `be/src/storage/segment/segment_iterator.h` | 269 | OWNED_OUTPUT | Segment iterator writes selected rows into the caller-provided read output block. This path does not create a local `ColumnPtr` alias before mutation; type-cast branch writes through `cast_column()` replacement instead. | -| `be/src/storage/segment/segment_iterator.cpp` | 2647, 2650 | LOCAL_RESULT | `_current_return_columns` is iterator-owned current batch state; converted columns replace that state after `cast_column()`. 
| -| `be/src/storage/segment/virtual_column_iterator.cpp` | 157 | LOCAL_RESULT | `res_col` is produced by `filter()` and immediately becomes the destination column. | -| `be/src/storage/iterator/vcollect_iterator.cpp` | 422 | LOCAL_RESULT | A clone-empty column is created from the source block before being pushed into the local mutable block. | -| `be/src/storage/iterator/vgeneric_iterators.cpp` | 174 | OWNED_OUTPUT | Merge iterator appends into its destination block for the current read. No local alias is introduced. | -| `be/src/storage/iterator/vertical_merge_iterator.cpp` | 330, 353 | OWNED_OUTPUT | Vertical merge appends into its destination block for the current read. No local alias is introduced. | -| `be/src/storage/iterator/olap_data_convertor.h` | 184 | LOCAL_RESULT | Padding column is the local conversion destination. | -| `be/src/storage/segment/variant/variant_column_writer_impl.cpp` | 1224 | COMMENT_ONLY | Documents avoiding a repeated mutable assertion. | -| `be/src/storage/segment/variant/variant_column_reader.cpp` | 1535 | COMMENT_ONLY | Commented-out code only. | - -## Mutate / MutableBlock Recheck - -Baseline command: - -```bash -rg -n -C 3 'MutableBlock::build_mutable_block\(|MutableBlock\s+\w+\s*\(&' be/src -S -``` - -Current conclusions: - -- `be/src/exprs/aggregate/aggregate_function_sort.h`: aggregate sort state owns - a long-lived `MutableBlock`. `add()` and `merge()` append directly to mutable - columns; `Block` is materialized only for `serialize()` and `sort_block()`. - This is the correct hot-path pattern. -- `be/src/format/table/iceberg_reader_mixin.h`: equality-delete cache merge now - writes back with `eq_file_block = mutable_block.to_block()`. -- `be/src/information_schema/*_scanner.cpp`, scanner helpers, group commit, - partial update, and tablet helper paths write back with `set_columns()`, - `swap(...to_block())`, or equivalent owner replacement. 
-- `be/src/load/memtable/memtable.cpp`, iterator setup, and hash/set build paths - use local clones or long-lived mutable owners; moved-from local blocks are not - read afterward. - -No second definite `MutableBlock(&block)` missing-writeback bug was found in the -current scan. - -Owner-slot mutate command: - -```bash -rg -n -C 2 'IColumn::mutate\(std::move\([^\n]*(block|Block|_result_block|result_block|out_block|output_block|in_block|tmp_block).*get_by_position' be/src -S -``` - -Current conclusions: - -- Direct helper cases such as `schema_scanner_helper.cpp`, - `schema_scanner.cpp`, skip-bitmap helpers, rowid fetcher, point query, and - file-reader resize paths mutate the owner slot and then write it back with - `replace_by_position()` or direct owner assignment. -- Mem-reuse operator paths that move several columns out at once collect them - into `MutableColumns` and then restore the block with `set_columns()` or - `swap(Block(...))`. -- Long-lived mutable-owner paths such as set-source local state deliberately - move the block columns into state-owned mutable columns; the moved-from block - is not read as the owner afterward in that function. -- No definite `IColumn::mutate(std::move(block->get_by_position(...).column))` - site was found where the mutated owner is later forgotten. - -## Hot-Path Mutate Audit - -Baseline commands: - -```bash -rg -n '\bIColumn::mutate\s*\(|\.mutate\s*\(\)' be/src -S -rg -n '(->|\.)mutate_columns\s*\(' be/src -S -rg -n '\bIColumn::mutate\s*\(|\.mutate\s*\(\)|(->|\.)mutate_columns\s*\(' be/src -S -``` - -Current result: 171 `IColumn::mutate()` / `std::move(*column).mutate()` -matches, plus 47 `Block::mutate_columns()` matches. `be/src/exprs/aggregate` -has no remaining COW `mutate()` matches; aggregate `add()` hot paths therefore -do not mutate per row. 
`AggregateFunctionSortData` remains the intended model: -keep a long-lived `MutableBlock`, append directly in `add()`, and materialize a -`Block` view only when sorting or serializing. - -The only definite hot-path issue found in this pass was JSONB row-store -deserialization: - -- `JsonbSerializeUtil::jsonb_to_block(char*)` was a single-row helper that - moved/mutated destination block columns per JSONB field. -- It is called from point query and rowid fetcher row loops. -- It now exposes `jsonb_to_columns(...)`, so hot callers mutate the destination - block once into `MutableColumns` outside the row loop and append rows through - those mutable owners. -- The old single-row `jsonb_to_block(char*)` wrapper remains for non-hot callers, - but current hot call sites use `jsonb_to_columns(...)`. - -`be/src/format/json/new_json_reader.cpp` also keeps Hive duplicate-key rollback -on an owner-slot mutate/writeback helper. This branch only executes after the -same key has already been written once in the current JSON object, so it is not -part of the normal per-column write path, and it avoids relying on a scanner -output-block ownership assumption across a rollback/rewrite operation. - -Detailed grouped audit: - -| File | Lines | Hot-path conclusion | -| --- | --- | --- | -| `be/src/core/block/block.cpp` | 584, 1108 | Block-level helper. `mutate_columns()` is the API boundary; shrink path mutates only after exclusivity branch. Not row-by-row. | -| `be/src/core/block/block.h` | 392, 396, 529 | `MutableBlock` constructors and const-column recursion helper. Block-level ownership transfer, not row-by-row. | -| `be/src/core/block/column_with_type_and_name.cpp` | 131 | Column wrapper conversion helper. Not row-by-row. | -| `be/src/core/column/column.h` | 191, 586, 596 | Column COW helper implementations. API boundary. | -| `be/src/core/column/column_const.cpp` | 113 | Const-column internal mutation of owned data. Not row-by-row. 
| -| `be/src/core/column/column_const.h` | 298, 326 | Const-column internal materialization helpers. Not row-by-row. | -| `be/src/core/column/column_map.cpp` | 522, 523, 526, 529, 539, 540, 541, 560, 563, 566, 662, 665 | Map internal filter helpers mutate subcolumns once per column operation. Recursive value-map dedup detaches the value owner before mutation. Not row-by-row. | -| `be/src/core/column/column_map.h` | 65, 66 | Map shared-column factory preserves immutable subcolumns and validates through const access. Not row-by-row. | -| `be/src/core/column/column_nullable.cpp` | 118 | Nullable constructor internal subcolumn ownership. Not row-by-row. | -| `be/src/core/column/column_nullable.h` | 68, 70 | Nullable shared-column factory preserves immutable subcolumns and validates through const access. Not row-by-row. | -| `be/src/core/column/column_variant.cpp` | 319, 487, 495, 502, 504, 2071, 2129, 2348, 2356, 2816, 2836, 2837 | Variant internal finalize/filter/serialization helpers mutate subcolumns during a column operation. Not row-by-row COW. | -| `be/src/core/column/column_variant.h` | 328, 444 | Variant helper/finalize path. Not row-by-row. | -| `be/src/core/cow.h` | 71 | COW primitive example/comment path. API boundary. | -| `be/src/core/data_type/data_type_array.cpp` | 123 | Array type helper mutates nested column once. Not row-by-row. | -| `be/src/core/data_type/data_type_map.cpp` | 138, 139 | Map type helper mutates key/value nested columns once. Not row-by-row. | -| `be/src/core/data_type/data_type_struct.cpp` | 217 | Struct type helper mutates children once per column operation. Not row-by-row. | -| `be/src/exprs/function/array/function_array_utils.cpp` | 64 | Function execution block-level variant/nullable conversion. Not row-by-row. | -| `be/src/exprs/function/cast/cast_to_variant.h` | 41, 44, 170 | Cast execution mutates result/source once per vectorized block. Not aggregate-row hot. 
| -| `be/src/exprs/function/comparison_equal_for_null.cpp` | 194, 233 | Temporary block result extraction. Not row-by-row. | -| `be/src/exprs/function/function.cpp` | 70 | Function null-map merge mutates result null-map once per execute. Not row-by-row. | -| `be/src/exprs/function/function_bitmap.cpp` | 684 | Bitmap function mutates a local/source column once per vectorized execute. Not row-by-row. | -| `be/src/exprs/function/function_variant_element.cpp` | 325 | Variant element execution creates mutable root once. Not row-by-row. | -| `be/src/exprs/function/if.cpp` | 252, 283 | IF function reuses one selected result column per vectorized execute. Not row-by-row. | -| `be/src/exprs/table_function/udf_table_function.cpp` | 127 | UDF table-function result column ownership transfer once per produced block. Not row-by-row. | -| `be/src/exprs/table_function/vexplode.cpp` | 48 | Table-function variant column conversion once per input column. Not row-by-row. | -| `be/src/exprs/table_function/vexplode_v2.cpp` | 54 | Same as `vexplode.cpp`. | -| `be/src/exprs/vcompound_pred.h` | 212, 233, 237 | Compound predicate reuses one input/result column per vectorized execute. Not row-by-row. | -| `be/src/exprs/vcondition_expr.cpp` | 206, 235 | CASE/condition expression reuses selected result column per vectorized execute. Not row-by-row. | -| `be/src/exec/common/arrow_column_to_doris_column.cpp` | 103 | Arrow conversion mutates the destination column once per converted column. Not row-by-row. | -| `be/src/exec/common/data_gen_functions/vnumbers_tvf.cpp` | 52 | Data-gen source mutates output columns once per output block when memory is reused. Not row-by-row. | -| `be/src/exec/common/partition_sort_utils.cpp` | 32 | Partition-sort utility converts one stored block to mutable columns before appending. Not row-by-row. | -| `be/src/exec/common/variant_util.cpp` | 438, 2157, 2219, 2222 | Variant utility column conversion/finalization paths. Not row-by-row COW. 
| -| `be/src/exec/exchange/vdata_stream_sender.cpp` | 332 | Exchange sender keeps a mutable block owner while serializing/sending one block. Not row-by-row. | -| `be/src/exec/operator/aggregation_sink_operator.cpp` | 311, 500 | Aggregation sink normalizes key float values once per input block/key column. Not aggregate `add()`. | -| `be/src/exec/operator/aggregation_source_operator.cpp` | 116, 141, 246, 304, 313 | Aggregation source output path mutates reused output columns once per output batch. Not row-by-row. | -| `be/src/exec/operator/bucketed_aggregation_sink_operator.cpp` | 179 | Bucketed aggregation sink normalizes key float values once per input block/key column. Not aggregate `add()`. | -| `be/src/exec/operator/bucketed_aggregation_source_operator.cpp` | 332, 387, 475, 558 | Bucketed aggregation source mutates reused output columns once per output batch. Not row-by-row. | -| `be/src/exec/operator/dict_sink_operator.cpp` | 47 | Sink block column overflow conversion once per column. Not row-by-row. | -| `be/src/exec/operator/distinct_streaming_aggregation_operator.cpp` | 166, 220, 225, 232 | Distinct streaming aggregation mutates expression/output/cache columns once per processed block/split. Not row-by-row. | -| `be/src/exec/operator/exchange_sink_operator.cpp` | 513 | Exchange sink transfers current block into mutable block for serialization. Not row-by-row. | -| `be/src/exec/operator/hashjoin_build_sink.cpp` | 575, 577 | Hash-join build converts/finalizes one block column before build. Not row-by-row. | -| `be/src/exec/operator/join/process_hash_table_probe_impl.h` | 168, 657, 726 | Hash-join probe mutates output/lazy materialized columns once per output block. Not row-by-row. | -| `be/src/exec/operator/nested_loop_join_probe_operator.cpp` | 82, 104, 145, 401, 515 | Nested-loop join probe creates mutable output columns once per output block/batch section. Not row-by-row. 
| -| `be/src/exec/operator/partitioned_aggregation_sink_operator.cpp` | 515, 516 | Partitioned aggregation state owns key/value mutable blocks for later appends. Correct long-lived owner pattern. | -| `be/src/exec/operator/schema_scan_operator.cpp` | 261 | Schema scan copies source columns into output columns once per block. Low-volume metadata path. | -| `be/src/exec/operator/set_sink_operator.cpp` | 134 | Set sink overflow conversion once per block column. Not row-by-row. | -| `be/src/exec/operator/set_source_operator.cpp` | 117 | Set source transfers output block columns into local mutable columns once per block. Not row-by-row. | -| `be/src/exec/operator/streaming_aggregation_operator.cpp` | 334, 376, 405, 472, 496, 599 | Streaming aggregation source/sink output paths mutate reused block columns once per block/output batch. Not aggregate `add()`. | -| `be/src/exec/rowid_fetcher.cpp` | 167, 1082 | Fixed in this pass: row-store JSONB loops now mutate result columns once outside the row loop and append through `jsonb_to_columns(...)`. | -| `be/src/exec/rowid_fetcher.cpp` | 196, 943, 1103 | Non-row-store merge/read paths mutate once per destination column, then append many rows. Not row-by-row. | -| `be/src/exec/scan/file_scanner.cpp` | 441, 785 | File scanner mutates partition-prune/skip-bitmap helper columns once per block. Not row-by-row. | -| `be/src/exec/scan/meta_scanner.cpp` | 115 | Meta scanner output columns once per block. Low-volume metadata path. | -| `be/src/exec/scan/scanner.cpp` | 219 | Scanner materialization mutates a prepared column pointer once per projection column. Not row-by-row. | -| `be/src/exec/sink/vtablet_block_convertor.cpp` | 285, 289 | Tablet sink conversion mutates temporary/result columns once per block. Not row-by-row. | -| `be/src/exec/sink/writer/vtablet_writer.cpp` | 1765 | Restores a temporary block after merge. Not row-by-row. | -| `be/src/exec/sink/writer/vtablet_writer_v2.cpp` | 625 | Same as v1 writer. 
| -| `be/src/format/arrow/arrow_stream_reader.cpp` | 97 | Arrow stream reader mutates output block once per batch. Not row-by-row. | -| `be/src/format/count_reader.h` | 61 | Count reader creates/mutates output columns once per batch. Not row-by-row. | -| `be/src/format/csv/csv_reader.cpp` | 446, 452 | CSV reader mutates output columns once per batch when filling rows. Not row-by-row. | -| `be/src/format/jni/jni_data_bridge.cpp` | 108 | JNI bridge mutates one destination column before writing a batch. Not row-by-row. | -| `be/src/format/json/new_json_reader.cpp` | 462, 472, 482 | Malformed-row append, row rollback, and Hive duplicate-key rollback use owner-slot mutate/writeback helpers. The duplicate-key helper is a rare rollback path, not the normal per-field write path. | -| `be/src/format/lance/lance_rust_reader.cpp` | 233 | Lance reader mutates output block once per batch. Not row-by-row. | -| `be/src/format/orc/vorc_reader.cpp` | 2055, 2073, 2145, 2216, 2218, 2253, 2263, 2860 | ORC complex/schema conversion and block resize paths mutate once per column/batch. Not row-by-row. | -| `be/src/format/parquet/parquet_column_convert.h` | 198, 238, 346 | Parquet conversion helpers mutate destination handles once per converted column. Not row-by-row. | -| `be/src/format/parquet/vparquet_column_reader.cpp` | 331, 413, 663, 719, 796, 994 | Parquet column reader mutates destination handles once per column chunk/batch. Not row-by-row. | -| `be/src/format/parquet/vparquet_column_reader.h` | 485 | Nested parquet column reader mutates destination once per read call. Not row-by-row. | -| `be/src/format/parquet/vparquet_group_reader.cpp` | 668 | Parquet group reader temporary resize once per block. Not row-by-row. | -| `be/src/format/table/paimon_cpp_reader.cpp` | 77, 120 | Paimon reader mutates output columns once per batch. Not row-by-row. | -| `be/src/format/table/paimon_jni_reader.cpp` | 108 | Paimon JNI reader mutates output columns once per batch. Not row-by-row. 
| -| `be/src/format/table/parquet_metadata_reader.cpp` | 815 | Metadata reader output reuse once per batch. Low-volume metadata path. | -| `be/src/format/table/remote_doris_reader.cpp` | 75 | Remote Doris reader mutates output columns once per batch. Not row-by-row. | -| `be/src/format/transformer/merge_partitioner.cpp` | 213 | Merge partitioner mutates a block once before partitioning. Not row-by-row. | -| `be/src/information_schema/schema_scanner.cpp` | 104, 314, 473 | Information-schema insertion helpers can run per cell, but this is a metadata path, not BE data hot path. They write back correctly. | -| `be/src/information_schema/schema_scanner_helper.cpp` | 36, 47, 59, 75, 85, 95, 105 | Same metadata-path per-cell helper pattern as `schema_scanner.cpp`; writeback is present. | -| `be/src/runtime/result_block_buffer.cpp` | 217 | Result buffer merges/mutates one block when appending query results. Not row-by-row. | -| `be/src/service/point_query_executor.cpp` | 503 | Fixed in this pass: point query now mutates result columns once outside the row loop and appends row-store/missing-column values through those mutable owners. | -| `be/src/storage/iterator/block_reader.cpp` | 171, 347, 480, 537, 587 | Storage reader prepares target/delete-filter columns per batch. Not row-by-row. | -| `be/src/storage/iterator/olap_data_convertor.h` | 310, 314 | OLAP convertor captures column data once per conversion batch. Not row-by-row. | -| `be/src/storage/iterator/vcollect_iterator.cpp` | 881 | Collect iterator mutates target columns once per batch. Not row-by-row. | -| `be/src/storage/iterator/vertical_block_reader.cpp` | 190, 401, 487, 488, 555 | Vertical reader prepares target/delete-filter columns per batch. Not row-by-row. | -| `be/src/storage/iterator/vgeneric_iterators.cpp` | 67 | Generic iterator mutates output columns once per batch. Not row-by-row. 
| -| `be/src/storage/partial_update_info.cpp` | 45, 342, 389, 418, 497, 565 | Partial-update block construction/merge mutates columns once per block. Not row-by-row. | -| `be/src/storage/segment/column_reader.cpp` | 997, 1012, 1013, 1084, 1169, 1170, 1417, 1782, 1794 | Segment complex-column readers mutate offsets/items/subcolumns once per read batch. Not row-by-row. | -| `be/src/storage/segment/segment_iterator.cpp` | 2185, 2907 | Segment iterator mutates current return columns / temporary mock column once per batch. Not row-by-row. | -| `be/src/storage/segment/variant/hierarchical_data_iterator.cpp` | 206, 228, 249, 290, 546 | Variant hierarchical reader mutates subcolumns once per read/finalize batch. Not row-by-row. | -| `be/src/storage/segment/variant/hierarchical_data_iterator.h` | 141 | Variant iterator helper mutates destination once per helper call. Not row-by-row. | -| `be/src/storage/segment/variant/variant_column_writer_impl.cpp` | 1229 | Variant writer finalization helper. Not row-by-row. | -| `be/src/storage/segment/variant/variant_streaming_compaction_writer.cpp` | 146 | Variant streaming compaction finalization helper. Not row-by-row. | -| `be/src/storage/segment/vertical_segment_writer.cpp` | 96 | Skip-bitmap helper mutates one block column then writes back. Not row-by-row. | -| `be/src/storage/tablet/base_tablet.cpp` | 962, 986, 1194 | Tablet row reconstruction/partial update mutates full output columns once per block. Not row-by-row. | -| `be/src/storage/tablet_info.cpp` | 563 | Partition-key helper mutates a temporary column once. Not row-by-row. | -| `be/src/util/jsonb/serialize.cpp` | 83, 159 | Wrapper paths now mutate destination columns once, call `jsonb_to_columns(...)`, and restore with `set_columns()`. Known hot callers bypass the single-row wrapper and hold `MutableColumns` across the row loop. 
| - -## Tests Added For Real COW Violations - -- `NewJsonReaderCowTest.AppendNullForMalformedJsonMutatesOwnerColumn` builds a - nullable column with an extra `ColumnPtr` alias, calls the malformed-json - helper, and verifies the block receives a new mutated owner while the original - shared column remains unchanged. -- `NewJsonReaderCowTest.TruncateBlockToRowsMutatesOwnerColumn` builds a shared - two-row nullable column, truncates the block, and verifies the original alias - still has two rows. -- `NewJsonReaderCowTest.PopBackLastInsertedValueMutatesOwnerColumn` builds a - shared destination column, removes the last inserted value through the JSON - rollback helper, and verifies the block gets a mutated owner while the - original alias still has both rows. -- `BlockSerializeCowTest.JsonbToBlockMutatesDestinationOwnerColumn` builds a - shared destination column, decodes JSONB rows, and verifies the destination - block gets its own mutated owner while the original shared column remains - empty. - -These tests cover the exact alias mode that external JSON regression exposed: -mutating a block-owned column while another `ColumnPtr` reference to the same -column still exists. - -## Fixed During This Audit Series - -- `AggregateFunctionSortData::{add,merge}`: removed hot-path - `assume_mutable()` calls by making the aggregate state own a `MutableBlock`. -- `AggregateFunctionNullUnary::streaming_agg_serialize_to_column`: replaced a - read-only source nested-column mutable assertion with const access. -- ORC schema-change nullable converter: `align_orc_null_map` now copies from the - appended source null-map slice instead of offset `0`. -- Parquet schema-change nullable converter: logical-source null maps with an old - destination prefix now copy from the appended logical-source slice. 
-- `ColumnArray::create(ColumnPtr...)`: shared-column construction keeps immutable - subcolumns shared but now reuses the same const-safe offset type and - nested-size validation as the mutable constructor. -- `ColumnNullable::create(ColumnPtr...)` and `ColumnMap::create(ColumnPtr...)`: - shared-column construction no longer deep-mutates input subcolumns, avoiding - unnecessary clones in block wrapping paths while keeping invariant checks. -- `ColumnMap::deduplicate_keys(true)`: recursive nested-map value dedup now - detaches and writes back the value owner instead of const-casting through a - shared nullable/value subcolumn. -- `ColumnMap::filter(const Filter&, ...)` and `ColumnMap::permute(...)`: these - return new columns and therefore keep input subcolumns shared/const instead of - pre-cloning whole key/value/offset columns. -- Variant materialization for nullable scalar variants: the nested variant is - taken from the already-detached nullable owner, so root finalization/conversion - cannot mutate aliases of the original nullable wrapper. -- JSON malformed/rollback paths: changed from `get_columns()` plus - `assume_mutable()` to owner-slot mutate/writeback helpers with focused BE UTs. -- Rowid fetcher merge and external-row readback: changed shared-output mutation - and `get_columns()`/`const_cast` destination writes to owner-slot - mutate/writeback. -- JSONB row-store decode: changed point-query and rowid-fetcher row loops from - per-row/per-field block mutation to a `MutableColumns` owner held across the - loop, with a COW unit test for shared destination columns. From 49f794755f356fc15194c35660d404af461e5768 Mon Sep 17 00:00:00 2001 From: zhaochangle Date: Wed, 20 May 2026 22:08:09 +0800 Subject: [PATCH 08/11] [fix](be) Restore scoped COW mutation after master rebase ### What problem does this PR solve? 
Issue Number: close #xxx Related PR: #63001 Problem Summary: After rebasing the COW branch onto latest master, several newly introduced or newly exposed code paths still called Block::mutate_columns() on live Block objects. The COW API now intentionally makes mutate_columns() an rvalue-only stealing operation, so live blocks must use scoped owner APIs that restore columns on every exit path. This updates nested-loop join lazy materialization, row binlog block filling, historical row retrieval, and affected tests to use mutate_columns_scoped() when the Block remains live. It also updates the block overflow tests to use the current string overflow debug point instead of removed test helpers/configuration. ### Release note None ### Check List (For Author) - Test: Unit Test - ./build.sh --be - ./run-be-ut.sh - Behavior changed: No - Does this need documentation: No --- .../nested_loop_join_probe_operator.cpp | 109 ++++++++++-------- .../segment/historical_row_retriever.cpp | 39 ++++--- .../segment/row_binlog_segment_writer.cpp | 92 ++++++++------- be/test/core/block/block_test.cpp | 41 +++++-- .../memtable/memtable_flush_executor_test.cpp | 17 +-- .../olap/rowset/group_rowset_writer_test.cpp | 13 ++- 6 files changed, 174 insertions(+), 137 deletions(-) diff --git a/be/src/exec/operator/nested_loop_join_probe_operator.cpp b/be/src/exec/operator/nested_loop_join_probe_operator.cpp index 90d580e2234733..c0203a74f6f186 100644 --- a/be/src/exec/operator/nested_loop_join_probe_operator.cpp +++ b/be/src/exec/operator/nested_loop_join_probe_operator.cpp @@ -261,31 +261,33 @@ Status NestedLoopJoinProbeLocalState::_append_lazy_rows(const IColumn::Filter& f const size_t old_rows = _join_block.rows(); const size_t new_rows = old_rows + selected_rows; - auto dst_columns = _join_block.mutate_columns(); - for (int column_id : p._materialize_column_ids) { - const auto column_idx = cast_set(column_id); - if (column_idx < p._num_probe_side_columns) { - const auto& src_column = 
probe_block.get_by_position(column_idx); - if (fixed_side_probe) { - append_many_from_source(dst_columns[column_idx], src_column, fixed_side_pos, - selected_rows); - } else { - append_filtered_from_source(dst_columns[column_idx], src_column, filter, - selected_rows); - } - } else { - const auto build_column_idx = column_idx - p._num_probe_side_columns; - const auto& src_column = build_block.get_by_position(build_column_idx); - if (fixed_side_probe) { - append_filtered_from_source(dst_columns[column_idx], src_column, filter, + { + auto dst_columns_guard = _join_block.mutate_columns_scoped(); + auto& dst_columns = dst_columns_guard.mutable_columns(); + for (int column_id : p._materialize_column_ids) { + const auto column_idx = cast_set(column_id); + if (column_idx < p._num_probe_side_columns) { + const auto& src_column = probe_block.get_by_position(column_idx); + if (fixed_side_probe) { + append_many_from_source(dst_columns[column_idx], src_column, fixed_side_pos, selected_rows); + } else { + append_filtered_from_source(dst_columns[column_idx], src_column, filter, + selected_rows); + } } else { - append_many_from_source(dst_columns[column_idx], src_column, fixed_side_pos, - selected_rows); + const auto build_column_idx = column_idx - p._num_probe_side_columns; + const auto& src_column = build_block.get_by_position(build_column_idx); + if (fixed_side_probe) { + append_filtered_from_source(dst_columns[column_idx], src_column, filter, + selected_rows); + } else { + append_many_from_source(dst_columns[column_idx], src_column, fixed_side_pos, + selected_rows); + } } } } - _join_block.set_columns(std::move(dst_columns)); _replace_lazy_placeholder_columns(new_rows); DCHECK_EQ(_join_block.rows(), new_rows); return Status::OK(); @@ -296,17 +298,19 @@ Status NestedLoopJoinProbeLocalState::_append_lazy_probe_row_with_build_defaults auto& p = _parent->cast(); const size_t new_rows = _join_block.rows() + 1; - auto dst_columns = _join_block.mutate_columns(); - for (int column_id : 
p._materialize_column_ids) { - const auto column_idx = cast_set(column_id); - if (column_idx < p._num_probe_side_columns) { - const auto& src_column = probe_block.get_by_position(column_idx); - append_many_from_source(dst_columns[column_idx], src_column, probe_row_pos, 1); - } else { - dst_columns[column_idx]->insert_many_defaults(1); + { + auto dst_columns_guard = _join_block.mutate_columns_scoped(); + auto& dst_columns = dst_columns_guard.mutable_columns(); + for (int column_id : p._materialize_column_ids) { + const auto column_idx = cast_set(column_id); + if (column_idx < p._num_probe_side_columns) { + const auto& src_column = probe_block.get_by_position(column_idx); + append_many_from_source(dst_columns[column_idx], src_column, probe_row_pos, 1); + } else { + dst_columns[column_idx]->insert_many_defaults(1); + } } } - _join_block.set_columns(std::move(dst_columns)); _replace_lazy_placeholder_columns(new_rows); DCHECK_EQ(_join_block.rows(), new_rows); return Status::OK(); @@ -318,19 +322,21 @@ Status NestedLoopJoinProbeLocalState::_append_lazy_mark_probe_row_with_build_def const size_t mark_column_id = p._num_probe_side_columns + p._num_build_side_columns; const size_t new_rows = _join_block.rows() + 1; - auto dst_columns = _join_block.mutate_columns(); - for (int column_id : p._materialize_column_ids) { - const auto column_idx = cast_set(column_id); - if (column_idx < p._num_probe_side_columns) { - const auto& src_column = probe_block.get_by_position(column_idx); - append_many_from_source(dst_columns[column_idx], src_column, probe_row_pos, 1); - } else if (column_idx == mark_column_id) { - append_mark_value(dst_columns[column_idx], mark_value); - } else { - dst_columns[column_idx]->insert_many_defaults(1); + { + auto dst_columns_guard = _join_block.mutate_columns_scoped(); + auto& dst_columns = dst_columns_guard.mutable_columns(); + for (int column_id : p._materialize_column_ids) { + const auto column_idx = cast_set(column_id); + if (column_idx < 
p._num_probe_side_columns) { + const auto& src_column = probe_block.get_by_position(column_idx); + append_many_from_source(dst_columns[column_idx], src_column, probe_row_pos, 1); + } else if (column_idx == mark_column_id) { + append_mark_value(dst_columns[column_idx], mark_value); + } else { + dst_columns[column_idx]->insert_many_defaults(1); + } } } - _join_block.set_columns(std::move(dst_columns)); _replace_lazy_placeholder_columns(new_rows); DCHECK_EQ(_join_block.rows(), new_rows); return Status::OK(); @@ -341,18 +347,21 @@ Status NestedLoopJoinProbeLocalState::_append_lazy_build_rows_with_probe_default auto& p = _parent->cast(); const size_t new_rows = _join_block.rows() + selected_rows; - auto dst_columns = _join_block.mutate_columns(); - for (int column_id : p._materialize_column_ids) { - const auto column_idx = cast_set(column_id); - if (column_idx < p._num_probe_side_columns) { - dst_columns[column_idx]->insert_many_defaults(selected_rows); - } else { - const auto build_column_idx = column_idx - p._num_probe_side_columns; - const auto& src_column = build_block.get_by_position(build_column_idx); - append_filtered_from_source(dst_columns[column_idx], src_column, filter, selected_rows); + { + auto dst_columns_guard = _join_block.mutate_columns_scoped(); + auto& dst_columns = dst_columns_guard.mutable_columns(); + for (int column_id : p._materialize_column_ids) { + const auto column_idx = cast_set(column_id); + if (column_idx < p._num_probe_side_columns) { + dst_columns[column_idx]->insert_many_defaults(selected_rows); + } else { + const auto build_column_idx = column_idx - p._num_probe_side_columns; + const auto& src_column = build_block.get_by_position(build_column_idx); + append_filtered_from_source(dst_columns[column_idx], src_column, filter, + selected_rows); + } } } - _join_block.set_columns(std::move(dst_columns)); _replace_lazy_placeholder_columns(new_rows); DCHECK_EQ(_join_block.rows(), new_rows); return Status::OK(); diff --git 
a/be/src/storage/segment/historical_row_retriever.cpp b/be/src/storage/segment/historical_row_retriever.cpp index 45ed91b281c9e9..ec8c8f9e466f34 100644 --- a/be/src/storage/segment/historical_row_retriever.cpp +++ b/be/src/storage/segment/historical_row_retriever.cpp @@ -198,29 +198,30 @@ Status PrimaryKeyModelRowRetriever::build_before_block(Block* before_block, old_value_block, &read_index, false, nullptr)); - auto mutable_before_columns = before_block->mutate_columns(); - // Fill each row in before_block. - for (uint32_t idx = 0; idx < num_rows; ++idx) { - auto it = read_index.find(idx); - if (it == read_index.end()) { - // No historical row, fill BEFORE with NULL. - for (size_t i = 0; i < value_cids.size(); ++i) { - auto* nullable_column = - assert_cast(mutable_before_columns[i].get()); - nullable_column->insert_many_defaults(1); + { + auto mutable_before_columns_guard = before_block->mutate_columns_scoped(); + auto& mutable_before_columns = mutable_before_columns_guard.mutable_columns(); + // Fill each row in before_block. + for (uint32_t idx = 0; idx < num_rows; ++idx) { + auto it = read_index.find(idx); + if (it == read_index.end()) { + // No historical row, fill BEFORE with NULL. 
+ for (size_t i = 0; i < value_cids.size(); ++i) { + auto* nullable_column = + assert_cast(mutable_before_columns[i].get()); + nullable_column->insert_many_defaults(1); + } + continue; } - continue; - } - uint32_t pos_in_old_block = it->second; - for (size_t i = 0; i < value_cids.size(); ++i) { - insert_value_to_nullable_column(mutable_before_columns[i].get(), - *old_value_block.get_by_position(i).column, - pos_in_old_block); + uint32_t pos_in_old_block = it->second; + for (size_t i = 0; i < value_cids.size(); ++i) { + insert_value_to_nullable_column(mutable_before_columns[i].get(), + *old_value_block.get_by_position(i).column, + pos_in_old_block); + } } } - - before_block->set_columns(std::move(mutable_before_columns)); return Status::OK(); } diff --git a/be/src/storage/segment/row_binlog_segment_writer.cpp b/be/src/storage/segment/row_binlog_segment_writer.cpp index 11dafa3a78a4b2..ccfc53d9fe9223 100644 --- a/be/src/storage/segment/row_binlog_segment_writer.cpp +++ b/be/src/storage/segment/row_binlog_segment_writer.cpp @@ -297,50 +297,55 @@ Status RowBinlogSegmentWriter::_fill_binlog_columns(size_t num_rows, std::vector binlog_cids = {_binlog_col_start_id, _binlog_col_start_id + 1, _binlog_col_start_id + 2}; Block binlog_prefix_block = _tablet_schema->create_block_by_cids(binlog_cids); - MutableColumns binlog_prefix_columns = binlog_prefix_block.mutate_columns(); - // we can't get correct lsn number before commit, because we can't get the version before commit, - // but we can fill auto-inc lsn to ensure the order first, then fill version when read single rowset. 
- IColumn* lsn_col_ptr = binlog_prefix_columns[0].get(); - CHECK(_lsn_ids->size() >= num_rows) << _lsn_ids->size() << " vs " << num_rows; - for (int i = 0; i < num_rows; i++) { - assert_cast(lsn_col_ptr) - ->insert_value(static_cast(_lsn_ids->at(i))); - } - - // wrong op only happens when partial-update, it will be fixed by delete bitmap when publish - const FieldType op_col_type = _tablet_schema->column(binlog_cids[1]).type(); - IColumn* op_col_ptr = binlog_prefix_columns[1].get(); - auto* op_nullable_column = typeid_cast(op_col_ptr); - IColumn* op_nested_column = - op_nullable_column != nullptr ? &op_nullable_column->get_nested_column() : op_col_ptr; - - CHECK(op_types.size() >= num_rows) << op_types.size() << " vs " << num_rows; - CHECK(op_col_type == FieldType::OLAP_FIELD_TYPE_BIGINT) - << "row binlog op column type must be BIGINT, actual=" << static_cast(op_col_type); - auto* op_int64_column = assert_cast(op_nested_column); - for (int i = 0; i < num_rows; i++) { - op_int64_column->insert_value(op_types[i]); - } - - // we can't get correct timestamp when commit - IColumn* ts_col_ptr = binlog_prefix_columns[2].get(); - auto timestamp = UnixMillis(); - auto* ts_nullable_column = typeid_cast(ts_col_ptr); - if (ts_nullable_column != nullptr) { - assert_cast(&ts_nullable_column->get_nested_column()) - ->insert_many_vals(timestamp, num_rows); - } else { - assert_cast(ts_col_ptr)->insert_many_vals(timestamp, num_rows); - } + { + auto binlog_prefix_columns_guard = binlog_prefix_block.mutate_columns_scoped(); + auto& binlog_prefix_columns = binlog_prefix_columns_guard.mutable_columns(); + // we can't get correct lsn number before commit, because we can't get the version before commit, + // but we can fill auto-inc lsn to ensure the order first, then fill version when read single rowset. 
+ IColumn* lsn_col_ptr = binlog_prefix_columns[0].get(); + CHECK(_lsn_ids->size() >= num_rows) << _lsn_ids->size() << " vs " << num_rows; + for (int i = 0; i < num_rows; i++) { + assert_cast(lsn_col_ptr) + ->insert_value(static_cast(_lsn_ids->at(i))); + } - // finally update null map - for (int i = 0; i < num_rows; i++) { - //lsn_column->get_null_map_data().emplace_back(0); - if (op_nullable_column != nullptr) { - op_nullable_column->get_null_map_data().emplace_back(0); + // wrong op only happens when partial-update, it will be fixed by delete bitmap when publish + const FieldType op_col_type = _tablet_schema->column(binlog_cids[1]).type(); + IColumn* op_col_ptr = binlog_prefix_columns[1].get(); + auto* op_nullable_column = typeid_cast(op_col_ptr); + IColumn* op_nested_column = op_nullable_column != nullptr + ? &op_nullable_column->get_nested_column() + : op_col_ptr; + + CHECK(op_types.size() >= num_rows) << op_types.size() << " vs " << num_rows; + CHECK(op_col_type == FieldType::OLAP_FIELD_TYPE_BIGINT) + << "row binlog op column type must be BIGINT, actual=" + << static_cast(op_col_type); + auto* op_int64_column = assert_cast(op_nested_column); + for (int i = 0; i < num_rows; i++) { + op_int64_column->insert_value(op_types[i]); } + + // we can't get correct timestamp when commit + IColumn* ts_col_ptr = binlog_prefix_columns[2].get(); + auto timestamp = UnixMillis(); + auto* ts_nullable_column = typeid_cast(ts_col_ptr); if (ts_nullable_column != nullptr) { - ts_nullable_column->get_null_map_data().emplace_back(0); + assert_cast(&ts_nullable_column->get_nested_column()) + ->insert_many_vals(timestamp, num_rows); + } else { + assert_cast(ts_col_ptr)->insert_many_vals(timestamp, num_rows); + } + + // finally update null map + for (int i = 0; i < num_rows; i++) { + //lsn_column->get_null_map_data().emplace_back(0); + if (op_nullable_column != nullptr) { + op_nullable_column->get_null_map_data().emplace_back(0); + } + if (ts_nullable_column != nullptr) { + 
ts_nullable_column->get_null_map_data().emplace_back(0); + } } } @@ -389,13 +394,12 @@ Status RowBinlogSegmentWriter::_fill_before_columns(size_t num_rows) { // Compatibility path: only fill empty BEFORE values. if (_fill_empty_before_value) { - MutableColumns before_mutable_columns = before_block.mutate_columns(); - for (auto& before_mutable_column : before_mutable_columns) { + auto before_mutable_columns_guard = before_block.mutate_columns_scoped(); + for (auto& before_mutable_column : before_mutable_columns_guard.mutable_columns()) { auto* before_nullable_column = reinterpret_cast(before_mutable_column.get()); before_nullable_column->insert_many_defaults(num_rows); } - before_block.set_columns(std::move(before_mutable_columns)); } else { DCHECK(_historical_data_writer != nullptr); diff --git a/be/test/core/block/block_test.cpp b/be/test/core/block/block_test.cpp index e02e4ac43cd9ab..22b9916c0ced9a 100644 --- a/be/test/core/block/block_test.cpp +++ b/be/test/core/block/block_test.cpp @@ -66,11 +66,16 @@ #include "runtime/descriptor_helper.h" #include "runtime/descriptors.h" #include "testutil/column_helper.h" +#include "util/debug_points.h" +#include "util/defer_op.h" namespace doris { namespace { +static constexpr auto CONVERT_COLUMN_IF_OVERFLOW_DEBUG_POINT = + "ColumnStr.convert_column_if_overflow.max_string_size"; + class ThrowOnCloneColumn final : public COWHelper { private: friend class COWHelper; @@ -999,12 +1004,18 @@ TEST(BlockTest, clear_blocks) { } TEST(BlockTest, merge_returns_error_when_checked_string_append_exceeds_limit) { - auto input_block = create_string_block({"abcde", "fghij"}); - auto output_block = create_string_block({}); - - auto string_overflow_size = config::string_overflow_size; - config::string_overflow_size = 9; - Defer defer([string_overflow_size]() { config::string_overflow_size = string_overflow_size; }); + auto input_block = + ColumnHelper::create_block(std::vector {"abcde", "fghij"}); + auto output_block = 
ColumnHelper::create_block(std::vector {}); + + const auto origin_enable_debug_points = config::enable_debug_points; + config::enable_debug_points = true; + DebugPoints::instance()->add_with_params(CONVERT_COLUMN_IF_OVERFLOW_DEBUG_POINT, + {{"max_string_size", "9"}}); + Defer defer([origin_enable_debug_points]() { + DebugPoints::instance()->remove(CONVERT_COLUMN_IF_OVERFLOW_DEBUG_POINT); + config::enable_debug_points = origin_enable_debug_points; + }); auto status = [&]() { ScopedMutableBlock scoped_mutable_block(&output_block); @@ -1019,12 +1030,18 @@ TEST(BlockTest, merge_returns_error_when_checked_string_append_exceeds_limit) { } TEST(BlockTest, merge_ignore_overflow_keeps_owned_accumulation_convertible) { - auto input_block = create_string_block({"abcde", "fghij"}); - auto output_block = create_string_block({}); - - auto string_overflow_size = config::string_overflow_size; - config::string_overflow_size = 9; - Defer defer([string_overflow_size]() { config::string_overflow_size = string_overflow_size; }); + auto input_block = + ColumnHelper::create_block(std::vector {"abcde", "fghij"}); + auto output_block = ColumnHelper::create_block(std::vector {}); + + const auto origin_enable_debug_points = config::enable_debug_points; + config::enable_debug_points = true; + DebugPoints::instance()->add_with_params(CONVERT_COLUMN_IF_OVERFLOW_DEBUG_POINT, + {{"max_string_size", "9"}}); + Defer defer([origin_enable_debug_points]() { + DebugPoints::instance()->remove(CONVERT_COLUMN_IF_OVERFLOW_DEBUG_POINT); + config::enable_debug_points = origin_enable_debug_points; + }); ColumnPtr converted_column; { diff --git a/be/test/load/memtable/memtable_flush_executor_test.cpp b/be/test/load/memtable/memtable_flush_executor_test.cpp index 7916d94db409c1..375d0daab40ec0 100644 --- a/be/test/load/memtable/memtable_flush_executor_test.cpp +++ b/be/test/load/memtable/memtable_flush_executor_test.cpp @@ -253,13 +253,16 @@ class MemTableFlushExecutorGroupFlushTest : public testing::Test { 
block.insert(ColumnWithTypeAndName(slot->get_empty_mutable_column(), slot->type(), slot->col_name())); } - auto cols = block.mutate_columns(); - int8_t k1 = -127; - int16_t k2 = -32767; - int32_t k3 = -2147483647; - cols[0]->insert_data((const char*)&k1, sizeof(k1)); - cols[1]->insert_data((const char*)&k2, sizeof(k2)); - cols[2]->insert_data((const char*)&k3, sizeof(k3)); + { + auto cols_guard = block.mutate_columns_scoped(); + auto& cols = cols_guard.mutable_columns(); + int8_t k1 = -127; + int16_t k2 = -32767; + int32_t k3 = -2147483647; + cols[0]->insert_data((const char*)&k1, sizeof(k1)); + cols[1]->insert_data((const char*)&k2, sizeof(k2)); + cols[2]->insert_data((const char*)&k3, sizeof(k3)); + } ASSERT_TRUE(ctx->memtable->insert(&block, {0}).ok()); } diff --git a/be/test/olap/rowset/group_rowset_writer_test.cpp b/be/test/olap/rowset/group_rowset_writer_test.cpp index c1aa21c6b4547d..46e4a74f8ad047 100644 --- a/be/test/olap/rowset/group_rowset_writer_test.cpp +++ b/be/test/olap/rowset/group_rowset_writer_test.cpp @@ -96,12 +96,15 @@ class GroupRowsetWriterTest : public testing::Test { Block create_block(int start_key, int num_rows) const { Block block = _tablet->tablet_schema()->create_block(); - auto columns = block.mutate_columns(); - for (int i = 0; i < num_rows; ++i) { - columns[0]->insert(Field::create_field(start_key + i)); - columns[1]->insert(Field::create_field((start_key + i) * 10)); + { + auto columns_guard = block.mutate_columns_scoped(); + auto& columns = columns_guard.mutable_columns(); + for (int i = 0; i < num_rows; ++i) { + columns[0]->insert(Field::create_field(start_key + i)); + columns[1]->insert( + Field::create_field((start_key + i) * 10)); + } } - block.set_columns(std::move(columns)); return block; } From a0c7d621341b63f81051e05c2b9b61c0c66636b8 Mon Sep 17 00:00:00 2001 From: zhaochangle Date: Thu, 21 May 2026 01:12:47 +0800 Subject: [PATCH 09/11] [fix](be) Detach nullable deserialize subcolumns ### What problem does this PR solve? 
Issue Number: N/A Related PR: #63001 Problem Summary: BE UT on PR #63001 failed in BlockTest.merge_returns_error_when_checked_string_append_exceeds_limit and aborted in ComplexTypeTest.DeserializeArrayWritesBackSharedNestedColumn. The block test relied on a string-overflow debug point that is not used by MutableBlock::merge(), so it expected an error on a path that legitimately succeeded. The complex-type test also constructed the destination array with a raw Int32 nested column, while DataTypeArray canonicalizes nested values as nullable and therefore calls DataTypeNullable::deserialize(). That exposed a real COW gap: DataTypeNullable::deserialize() wrote directly into the nested column and null map even when those subcolumns were still shared after a shallow COW clone. This patch detaches nullable nested and null-map owner slots before deserializing and writes them back through ColumnNullable::replace_columns(). It also updates the array COW deserialize test to use shared nullable subcolumns and changes the scoped block merge test to use a deterministic schema mismatch error. 
### Release note None ### Check List (For Author) - Test: Unit Test - PATH=/tmp/codex-clang-format-16:$PATH build-support/clang-format.sh - 
PATH=/tmp/codex-clang-format-16:/mnt/disk3/zhaochangle/.opencode/bin:/mnt/disk3/zhaochangle/.local/bin:/mnt/disk6/common/apache-maven-3.9.14/bin:/mnt/disk6/common/ldb_toolchain_028/bin:/mnt/disk6/common/jdk-17.0.16/bin:/mnt/disk6/common/node-v24.14.1-linux-x64/bin:/mnt/disk3/zhaochangle/.bun/bin:/mnt/disk3/zhaochangle/.opencode/bin:/mnt/disk3/zhaochangle/.local/bin:/mnt/disk6/common/apache-maven-3.9.14/bin:/mnt/disk6/common/ldb_toolchain_028/bin:/mnt/disk6/common/jdk-17.0.16/bin:/mnt/disk6/common/node-v24.14.1-linux-x64/bin:/mnt/disk6/common/apache-maven-3.9.14/bin:/mnt/disk6/common/ldb_toolchain_028/bin:/mnt/disk6/common/jdk-17.0.16/bin:/mnt/disk6/common/node-v24.14.1-linux-x64/bin:/mnt/disk3/zhaochangle/.codex/tmp/arg0/codex-arg0qpYuvr:/mnt/disk6/common/node-v24.14.1-linux-x64/lib/node_modules/@openai/codex/node_modules/@openai/codex-linux-x64/vendor/x86_64-unknown-linux-musl/path:/mnt/disk3/zhaochangle/.bun/bin:/mnt/disk3/zhaochangle/.opencode/bin:/mnt/disk3/zhaochangle/.local/bin:/mnt/disk6/common/apache-maven-3.9.14/bin:/mnt/disk6/common/ldb_toolchain_028/bin:/mnt/disk6/common/jdk-17.0.16/bin:/mnt/disk6/common/node-v24.14.1-linux-x64/bin:/mnt/disk6/common/apache-maven-3.9.14/bin:/mnt/disk6/common/ldb_toolchain_028/bin:/mnt/disk6/common/jdk-17.0.16/bin:/mnt/disk6/common/node-v24.14.1-linux-x64/bin:/mnt/disk3/zhaochangle/.opencode/bin:/mnt/disk3/zhaochangle/.local/bin:/mnt/disk3/zhaochangle/.local/bin:/mnt/disk3/zhaochangle/bin:/mnt/disk6/common/apache-maven-3.9.14/bin:/mnt/disk6/common/ldb_toolchain_028/bin:/mnt/disk6/common/jdk-17.0.16/bin:/mnt/disk6/common/node-v24.14.1-linux-x64/bin:/usr/share/Modules/bin:/usr/lib64/ccache:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin build-support/check-format.sh - ./run-be-ut.sh --run --filter=BlockTest.merge_returns_error_and_restores_output_block:BlockTest.merge_ignore_overflow_keeps_owned_accumulation_convertible:ComplexTypeTest.DeserializeArrayWritesBackSharedNestedColumn - ./run-be-ut.sh --run 
--filter=ComplexTypeTest.Deserialize* - Behavior changed: No - Does this need documentation: No --- be/src/core/column/column_nullable.cpp | 7 +++++++ be/src/core/column/column_nullable.h | 2 ++ be/src/core/data_type/data_type_nullable.cpp | 20 +++++++++++++------- be/test/core/block/block_test.cpp | 14 +++----------- be/test/core/data_type/complex_type_test.cpp | 14 +++++++++++--- 5 files changed, 36 insertions(+), 21 deletions(-) diff --git a/be/src/core/column/column_nullable.cpp b/be/src/core/column/column_nullable.cpp index 12ee5aff6d5740..ed6f5865543621 100644 --- a/be/src/core/column/column_nullable.cpp +++ b/be/src/core/column/column_nullable.cpp @@ -98,6 +98,13 @@ ColumnNullable::ColumnNullable(SharedTag, ColumnPtr nested_column_, ColumnPtr nu *static_cast(_null_map)); } +void ColumnNullable::replace_columns(ColumnPtr nested_column, ColumnPtr null_map) { + check_nullable_sizes(*nested_column, *null_map); + static_cast(_nested_column) = std::move(nested_column); + static_cast(_null_map) = std::move(null_map); + check_const_only_in_top_level(); +} + void ColumnNullable::shrink_padding_chars() { get_nested_column_ptr()->shrink_padding_chars(); } diff --git a/be/src/core/column/column_nullable.h b/be/src/core/column/column_nullable.h index a81e3e6e1c54a4..025e37976732e3 100644 --- a/be/src/core/column/column_nullable.h +++ b/be/src/core/column/column_nullable.h @@ -270,6 +270,8 @@ class ColumnNullable final : public COWHelper { // used in schema change void change_nested_column(ColumnPtr& other) { ((ColumnPtr&)_nested_column) = other; } + void replace_columns(ColumnPtr nested_column, ColumnPtr null_map); + /// Return the column that represents values. 
IColumn& get_nested_column() { return *_nested_column; } const IColumn& get_nested_column() const { return *_nested_column; } diff --git a/be/src/core/data_type/data_type_nullable.cpp b/be/src/core/data_type/data_type_nullable.cpp index 365dd86c4ee154..77250f5cdbe54a 100644 --- a/be/src/core/data_type/data_type_nullable.cpp +++ b/be/src/core/data_type/data_type_nullable.cpp @@ -107,24 +107,30 @@ const char* DataTypeNullable::deserialize(const char* buf, MutableColumnPtr* col size_t real_have_saved_num = 0; buf = deserialize_const_flag_and_row_num(buf, column, &real_have_saved_num); - auto* col = assert_cast(origin_column); - // null flags auto mem_size = real_have_saved_num * sizeof(bool); - col->get_null_map_data().resize(real_have_saved_num); + auto* col = assert_cast(origin_column); + // A nullable column can be exclusive while its subcolumns are still shared + // after a shallow COW clone. Detach both owner slots before writing into them. + const auto& const_col = *col; + auto nested = std::move(*const_col.get_nested_column_ptr()).mutate(); + auto null_map = std::move(*const_col.get_null_map_column_ptr()).mutate(); + auto& null_map_data = assert_cast(*null_map).get_data(); + + null_map_data.resize(real_have_saved_num); if (mem_size <= SERIALIZED_MEM_SIZE_LIMIT) { - memcpy(col->get_null_map_data().data(), buf, mem_size); + memcpy(null_map_data.data(), buf, mem_size); buf += mem_size; } else { size_t encode_size = unaligned_load(buf); buf += sizeof(size_t); // Throw exception if mem_size is large than UINT32_MAX - streamvbyte_decode((const uint8_t*)buf, (uint32_t*)(col->get_null_map_data().data()), + streamvbyte_decode((const uint8_t*)buf, (uint32_t*)(null_map_data.data()), cast_set(upper_int32(mem_size))); buf += encode_size; } - // column data values - auto nested = col->get_nested_column_ptr(); + buf = nested_data_type->deserialize(buf, &nested, be_exec_version); + col->replace_columns(std::move(nested), std::move(null_map)); return buf; } diff --git 
a/be/test/core/block/block_test.cpp b/be/test/core/block/block_test.cpp index 22b9916c0ced9a..bb60f2fabf7136 100644 --- a/be/test/core/block/block_test.cpp +++ b/be/test/core/block/block_test.cpp @@ -1003,26 +1003,18 @@ TEST(BlockTest, clear_blocks) { } } -TEST(BlockTest, merge_returns_error_when_checked_string_append_exceeds_limit) { +TEST(BlockTest, merge_returns_error_and_restores_output_block) { auto input_block = ColumnHelper::create_block(std::vector {"abcde", "fghij"}); + input_block.insert(ColumnHelper::create_column_with_name({1, 2})); auto output_block = ColumnHelper::create_block(std::vector {}); - const auto origin_enable_debug_points = config::enable_debug_points; - config::enable_debug_points = true; - DebugPoints::instance()->add_with_params(CONVERT_COLUMN_IF_OVERFLOW_DEBUG_POINT, - {{"max_string_size", "9"}}); - Defer defer([origin_enable_debug_points]() { - DebugPoints::instance()->remove(CONVERT_COLUMN_IF_OVERFLOW_DEBUG_POINT); - config::enable_debug_points = origin_enable_debug_points; - }); - auto status = [&]() { ScopedMutableBlock scoped_mutable_block(&output_block); return scoped_mutable_block.mutable_block().merge(input_block); }(); ASSERT_FALSE(status.ok()); - EXPECT_NE(status.to_string().find("string column length is too large"), std::string::npos) + EXPECT_NE(status.to_string().find("Merge block not match"), std::string::npos) << status.to_string(); ASSERT_EQ(output_block.rows(), 0); diff --git a/be/test/core/data_type/complex_type_test.cpp b/be/test/core/data_type/complex_type_test.cpp index ff9f66c36128d3..f4f9654a15a05b 100644 --- a/be/test/core/data_type/complex_type_test.cpp +++ b/be/test/core/data_type/complex_type_test.cpp @@ -27,6 +27,7 @@ #include "core/column/column.h" #include "core/column/column_array.h" #include "core/column/column_map.h" +#include "core/column/column_nullable.h" #include "core/column/column_string.h" #include "core/column/column_struct.h" #include "core/column/column_vector.h" @@ -108,23 +109,30 @@ 
TEST(ComplexTypeTest, DeserializeArrayWritesBackSharedNestedColumn) { src_column->insert(Field::create_field(Array {Field::create_field(3)})); auto buf = serialize_column(array_type, src_column->get_ptr()); - ColumnPtr shared_nested_column = ColumnInt32::create(); + ColumnPtr shared_nested_data_column = ColumnInt32::create(); + ColumnPtr shared_nested_null_map_column = ColumnUInt8::create(); + ColumnPtr shared_nested_column = + ColumnNullable::create(shared_nested_data_column, shared_nested_null_map_column); ColumnPtr shared_offsets_column = ColumnArray::ColumnOffsets::create(); MutableColumnPtr dst_column = ColumnArray::create(shared_nested_column, shared_offsets_column); deserialize_column(array_type, buf, &dst_column); const auto& array_column = assert_cast(*dst_column); EXPECT_EQ(2, array_column.size()); - EXPECT_EQ(0, shared_nested_column->size()); + EXPECT_EQ(0, shared_nested_data_column->size()); + EXPECT_EQ(0, shared_nested_null_map_column->size()); EXPECT_EQ(0, shared_offsets_column->size()); EXPECT_EQ(3, array_column.get_data().size()); EXPECT_EQ(2, array_column.get_offsets()[0]); EXPECT_EQ(3, array_column.get_offsets()[1]); - const auto& data = assert_cast(array_column.get_data()).get_data(); + const auto& nullable_data = assert_cast(array_column.get_data()); + const auto& data = + assert_cast(nullable_data.get_nested_column()).get_data(); EXPECT_EQ(1, data[0]); EXPECT_EQ(2, data[1]); EXPECT_EQ(3, data[2]); + EXPECT_FALSE(nullable_data.has_null()); } TEST(ComplexTypeTest, DeserializeMapWritesBackSharedKeyAndValueColumns) { From 9a6bd604931bf06c61fe0d245be31642592605ba Mon Sep 17 00:00:00 2001 From: zhaochangle Date: Thu, 21 May 2026 02:05:08 +0800 Subject: [PATCH 10/11] [fix](be) Handle nullable array elements in foreach aggregate ### What problem does this PR solve? Issue Number: None Related PR: #63001 Problem Summary: DataTypeArray now normalizes array elements to nullable physical columns. 
AggregateFunctionForEach still passed the array data column directly into the nested aggregate when writing results, so nested aggregates such as array_agg could receive ColumnNullable where they expected ColumnArray. This commit unwraps the array element nullable wrapper for non-nullable nested aggregate results, maintains the element null map, and updates tests that still built DataTypeArray columns with the old raw nested shape. ### Release note None ### Check List (For Author) - Test: Unit Test - ./run-be-ut.sh --run --filter='FunctionVariantCast.*:AggregateFunctionArrayAggTest.*:VRetentionTest.*:SchemaUtilTest.TestArrayDimensions:SchemaUtilTest.TestCastColumnEdgeCases:DataTypeArrayTest.CreateColumnUsesNullableNestedColumn:AIFunctionTest.AIMaskTest:AIFunctionTest.AIExtractTest:AIFunctionTest.AIClassifyTest' - ./run-be-ut.sh --run --filter='AggGroupArrayIntersectTest.*:TableFunctionOperatorTest.block_fast_path_explode*' - PATH=/tmp/codex-clang-format-16:$PATH build-support/check-format.sh - Behavior changed: No - Does this need documentation: No --- .../aggregate/aggregate_function_foreach.h | 12 +++- be/test/ai/ai_function_test.cpp | 68 +++++++------------ .../core/data_type/data_type_array_test.cpp | 24 +++++++ be/test/exec/common/schema_util_test.cpp | 4 +- .../exprs/aggregate/agg_array_agg_test.cpp | 12 +++- .../exprs/aggregate/vec_retention_test.cpp | 67 ++++++++++-------- .../cast/function_variant_cast_test.cpp | 23 +++---- 7 files changed, 118 insertions(+), 92 deletions(-) diff --git a/be/src/exprs/aggregate/aggregate_function_foreach.h b/be/src/exprs/aggregate/aggregate_function_foreach.h index 8afd5b93eaaba8..a00dd9b9397944 100644 --- a/be/src/exprs/aggregate/aggregate_function_foreach.h +++ b/be/src/exprs/aggregate/aggregate_function_foreach.h @@ -210,11 +210,19 @@ class AggregateFunctionForEach : public AggregateFunctionNonFinalBase, auto& arr_to = assert_cast(to); auto& offsets_to = arr_to.get_offsets(); - IColumn& elems_to = arr_to.get_data(); + 
IColumn* elems_to = &arr_to.get_data(); + ColumnNullable* nullable_elems_to = nullptr; + if (!nested_function->get_return_type()->is_nullable()) { + nullable_elems_to = assert_cast(elems_to); + elems_to = nullable_elems_to->get_nested_column_ptr().get(); + } char* nested_state = state.array_of_aggregate_datas; for (size_t i = 0; i < state.dynamic_array_size; ++i) { - nested_function->insert_result_into(nested_state, elems_to); + nested_function->insert_result_into(nested_state, *elems_to); + if (nullable_elems_to != nullptr) { + nullable_elems_to->get_null_map_data().push_back(0); + } nested_state += nested_size_of_data; } diff --git a/be/test/ai/ai_function_test.cpp b/be/test/ai/ai_function_test.cpp index 7a1ecdc0c9a7fe..23855611861733 100644 --- a/be/test/ai/ai_function_test.cpp +++ b/be/test/ai/ai_function_test.cpp @@ -27,6 +27,7 @@ #include "core/block/block.h" #include "core/column/column_array.h" +#include "core/column/column_nullable.h" #include "core/column/column_string.h" #include "core/column/column_vector.h" #include "core/data_type/data_type_array.h" @@ -181,6 +182,28 @@ class OneShotHttpServer { std::thread _thread; }; +namespace { +MutableColumnPtr create_string_array_column(const std::vector>& rows) { + auto nested_column = ColumnString::create(); + auto null_map_column = ColumnUInt8::create(); + auto offsets_column = ColumnOffset64::create(); + + IColumn::Offset offset = 0; + for (const auto& row : rows) { + for (const auto& value : row) { + nested_column->insert_data(value.data(), value.size()); + null_map_column->insert_value(0); + } + offset += row.size(); + offsets_column->insert_value(offset); + } + + return ColumnArray::create( + ColumnNullable::create(std::move(nested_column), std::move(null_map_column)), + std::move(offsets_column)); +} +} // namespace + TEST(AIFunctionTest, AISummarizeTest) { FunctionAISummarize function; @@ -233,20 +256,7 @@ TEST(AIFunctionTest, AIMaskTest) { auto col_resource = ColumnHelper::create_column(resources); 
auto col_text = ColumnHelper::create_column(texts); - - auto nested_column = ColumnString::create(); - auto offsets_column = ColumnOffset64::create(); - - IColumn::Offset offset = 0; - for (const auto& row : labels) { - for (const auto& value : row) { - nested_column->insert_data(value.data(), value.size()); - } - offset += row.size(); - offsets_column->insert_value(offset); - } - - auto array_column = ColumnArray::create(std::move(nested_column), std::move(offsets_column)); + auto array_column = create_string_array_column(labels); Block block; block.insert({std::move(col_resource), std::make_shared(), "resource"}); @@ -315,20 +325,7 @@ TEST(AIFunctionTest, AIExtractTest) { auto col_resource = ColumnHelper::create_column(resources); auto col_text = ColumnHelper::create_column(texts); - - auto nested_column = ColumnString::create(); - auto offsets_column = ColumnOffset64::create(); - - IColumn::Offset offset = 0; - for (const auto& row : labels) { - for (const auto& value : row) { - nested_column->insert_data(value.data(), value.size()); - } - offset += row.size(); - offsets_column->insert_value(offset); - } - - auto array_column = ColumnArray::create(std::move(nested_column), std::move(offsets_column)); + auto array_column = create_string_array_column(labels); Block block; block.insert({std::move(col_resource), std::make_shared(), "resource"}); @@ -355,20 +352,7 @@ TEST(AIFunctionTest, AIClassifyTest) { auto col_resource = ColumnHelper::create_column(resources); auto col_text = ColumnHelper::create_column(texts); - - auto nested_column = ColumnString::create(); - auto offsets_column = ColumnOffset64::create(); - - IColumn::Offset offset = 0; - for (const auto& row : labels) { - for (const auto& value : row) { - nested_column->insert_data(value.data(), value.size()); - } - offset += row.size(); - offsets_column->insert_value(offset); - } - - auto array_column = ColumnArray::create(std::move(nested_column), std::move(offsets_column)); + auto array_column = 
create_string_array_column(labels); Block block; block.insert({std::move(col_resource), std::make_shared(), "resource"}); diff --git a/be/test/core/data_type/data_type_array_test.cpp b/be/test/core/data_type/data_type_array_test.cpp index ebc6f3eedb8d42..819bf33f227fa2 100644 --- a/be/test/core/data_type/data_type_array_test.cpp +++ b/be/test/core/data_type/data_type_array_test.cpp @@ -27,6 +27,9 @@ #include #include "core/column/column.h" +#include "core/column/column_array.h" +#include "core/column/column_nullable.h" +#include "core/column/column_vector.h" #include "core/data_type/common_data_type_serder_test.h" #include "core/data_type/common_data_type_test.h" #include "core/data_type/data_type.h" @@ -420,6 +423,27 @@ TEST_F(DataTypeArrayTest, CreateColumnTest) { } } +TEST_F(DataTypeArrayTest, CreateColumnUsesNullableNestedColumn) { + auto nested_type = std::make_shared(); + auto array_type = std::make_shared(nested_type); + EXPECT_TRUE(array_type->get_nested_type()->is_nullable()); + + auto column = array_type->create_column(); + auto& array_column = assert_cast(*column); + auto& nested_column = assert_cast(array_column.get_data()); + array_column.insert(Field::create_field( + Array {Field::create_field(1), Field::create_field(2)})); + + EXPECT_EQ(1, array_column.size()); + EXPECT_EQ(2, nested_column.size()); + EXPECT_FALSE(nested_column.has_null()); + EXPECT_TRUE(array_type->check_column(*column).ok()); + + auto old_shape_column = + ColumnArray::create(ColumnInt32::create(), ColumnArray::ColumnOffsets::create()); + EXPECT_FALSE(array_type->check_column(*old_shape_column).ok()); +} + TEST_F(DataTypeArrayTest, GetFieldTest) { TExprNode node; node.node_type = TExprNodeType::ARRAY_LITERAL; diff --git a/be/test/exec/common/schema_util_test.cpp b/be/test/exec/common/schema_util_test.cpp index 0416311bd0c2a8..3599036fe2f76e 100644 --- a/be/test/exec/common/schema_util_test.cpp +++ b/be/test/exec/common/schema_util_test.cpp @@ -791,9 +791,7 @@ TEST_F(SchemaUtilTest, 
TestCastColumnEdgeCases) { auto variant_type = std::make_shared(10, false); auto nullable_array_type = make_nullable(std::make_shared(std::make_shared())); - auto array_column = - ColumnArray::create(ColumnInt32::create(), ColumnArray::ColumnOffsets::create()); - auto nullable_array_column = make_nullable(array_column->get_ptr()); + ColumnPtr nullable_array_column = nullable_array_type->create_column()->get_ptr(); ColumnWithTypeAndName array_col; array_col.type = nullable_array_type; diff --git a/be/test/exprs/aggregate/agg_array_agg_test.cpp b/be/test/exprs/aggregate/agg_array_agg_test.cpp index 6b27a2b55b03fd..d65565a4c99b18 100644 --- a/be/test/exprs/aggregate/agg_array_agg_test.cpp +++ b/be/test/exprs/aggregate/agg_array_agg_test.cpp @@ -150,9 +150,13 @@ TEST_F(AggregateFunctionArrayAggTest, test_array_agg_astr_foreach) { auto array_array_data_type = std::make_shared(array_data_type); auto array_array_off_column = ColumnOffset64::create(); array_array_off_column->insert_value(4); + auto nested_array_column = ColumnArray::create(data_column->clone(), off_column2->clone()); + auto nested_array_size = nested_array_column->size(); auto array_array_column = - ColumnArray::create(ColumnArray::create(data_column->clone(), off_column2->clone()), + ColumnArray::create(ColumnNullable::create(std::move(nested_array_column), + ColumnUInt8::create(nested_array_size, 0)), array_array_off_column->clone()); + ASSERT_TRUE(array_array_data_type->check_column(*array_array_column).ok()); execute(Block({ColumnWithTypeAndName(array_column->clone(), array_data_type, "")}), ColumnWithTypeAndName(std::move(array_array_column), array_array_data_type, "column")); @@ -185,9 +189,13 @@ TEST_F(AggregateFunctionArrayAggTest, test_array_agg_aint64_foreach) { auto array_array_data_type = std::make_shared(array_data_type); auto array_array_off_column = ColumnOffset64::create(); array_array_off_column->insert_value(4); + auto nested_array_column = ColumnArray::create(data_column->clone(), 
off_column2->clone()); + auto nested_array_size = nested_array_column->size(); auto array_array_column = - ColumnArray::create(ColumnArray::create(data_column->clone(), off_column2->clone()), + ColumnArray::create(ColumnNullable::create(std::move(nested_array_column), + ColumnUInt8::create(nested_array_size, 0)), array_array_off_column->clone()); + ASSERT_TRUE(array_array_data_type->check_column(*array_array_column).ok()); execute(Block({ColumnWithTypeAndName(array_column->clone(), array_data_type, "")}), ColumnWithTypeAndName(std::move(array_array_column), array_array_data_type, "column")); diff --git a/be/test/exprs/aggregate/vec_retention_test.cpp b/be/test/exprs/aggregate/vec_retention_test.cpp index ea22645fb327a5..21966fb7f986f8 100644 --- a/be/test/exprs/aggregate/vec_retention_test.cpp +++ b/be/test/exprs/aggregate/vec_retention_test.cpp @@ -25,6 +25,7 @@ #include "common/logging.h" #include "core/assert_cast.h" #include "core/column/column_array.h" +#include "core/column/column_nullable.h" #include "core/column/column_string.h" #include "core/column/column_vector.h" #include "core/data_type/data_type.h" @@ -43,6 +44,18 @@ namespace doris { void register_aggregate_function_retention(AggregateFunctionSimpleFactory& factory); +namespace { +ColumnArray& retention_result_array(IColumn& column) { + return assert_cast(column); +} + +ColumnUInt8::Container& retention_result_data(IColumn& column) { + auto& array = retention_result_array(column); + auto& nested = assert_cast(array.get_data()); + return assert_cast(nested.get_nested_column()).get_data(); +} +} // namespace + class VRetentionTest : public testing::Test { public: AggregateFunctionPtr agg_function; @@ -83,25 +96,23 @@ TEST_F(VRetentionTest, testEmpty) { agg_function->create(place2); agg_function->merge(place, place2, arena); - auto column_result = - ColumnArray::create(((DataTypePtr)std::make_shared())->create_column()); + auto column_result = agg_function->get_return_type()->create_column(); 
agg_function->insert_result_into(place, *column_result); - auto& result = assert_cast(column_result->get_data()).get_data(); + auto& result = retention_result_data(*column_result); for (int i = 0; i < result.size(); i++) { EXPECT_EQ(result[i], 0); } - auto column_result2 = - ColumnArray::create(((DataTypePtr)std::make_shared())->create_column()); + auto column_result2 = agg_function->get_return_type()->create_column(); agg_function->insert_result_into(place2, *column_result2); - auto& result2 = assert_cast(column_result2->get_data()).get_data(); + auto& result2 = retention_result_data(*column_result2); for (int i = 0; i < result2.size(); i++) { EXPECT_EQ(result2[i], 0); } - EXPECT_EQ(column_result2->get_offsets()[-1], 0); - EXPECT_EQ(column_result2->get_offsets()[0], 3); - EXPECT_EQ(column_result2->get_offsets().size(), 1); + EXPECT_EQ(retention_result_array(*column_result2).get_offsets()[-1], 0); + EXPECT_EQ(retention_result_array(*column_result2).get_offsets()[0], 3); + EXPECT_EQ(retention_result_array(*column_result2).get_offsets().size(), 1); agg_function->destroy(place); agg_function->destroy(place2); } @@ -141,17 +152,16 @@ TEST_F(VRetentionTest, testSample) { agg_function->merge(place2, place, arena); - auto column_result2 = - ColumnArray::create(((DataTypePtr)std::make_shared())->create_column()); + auto column_result2 = agg_function->get_return_type()->create_column(); agg_function->insert_result_into(place2, *column_result2); - auto& result2 = assert_cast(column_result2->get_data()).get_data(); + auto& result2 = retention_result_data(*column_result2); for (int i = 0; i < result2.size(); i++) { EXPECT_EQ(result2[i], 1); } - EXPECT_EQ(column_result2->get_offsets()[-1], 0); - EXPECT_EQ(column_result2->get_offsets()[0], 3); - EXPECT_EQ(column_result2->get_offsets().size(), 1); + EXPECT_EQ(retention_result_array(*column_result2).get_offsets()[-1], 0); + EXPECT_EQ(retention_result_array(*column_result2).get_offsets()[0], 3); + 
EXPECT_EQ(retention_result_array(*column_result2).get_offsets().size(), 1); agg_function->destroy(place2); } @@ -184,16 +194,15 @@ TEST_F(VRetentionTest, testNoMerge) { agg_function->add(place, column, i, arena); } - auto column_result = - ColumnArray::create(((DataTypePtr)std::make_shared())->create_column()); + auto column_result = agg_function->get_return_type()->create_column(); agg_function->insert_result_into(place, *column_result); - auto& result = assert_cast(column_result->get_data()).get_data(); + auto& result = retention_result_data(*column_result); for (int i = 0; i < result.size(); i++) { EXPECT_EQ(result[i], 1); } - EXPECT_EQ(column_result->get_offsets()[-1], 0); - EXPECT_EQ(column_result->get_offsets()[0], 3); - EXPECT_EQ(column_result->get_offsets().size(), 1); + EXPECT_EQ(retention_result_array(*column_result).get_offsets()[-1], 0); + EXPECT_EQ(retention_result_array(*column_result).get_offsets()[0], 3); + EXPECT_EQ(retention_result_array(*column_result).get_offsets().size(), 1); agg_function->destroy(place); } @@ -233,10 +242,9 @@ TEST_F(VRetentionTest, testSerialize) { VectorBufferReader buf_reader(buf.get_data_at(0)); agg_function->deserialize(place2, buf_reader, arena); - auto column_result = - ColumnArray::create(((DataTypePtr)std::make_shared())->create_column()); + auto column_result = agg_function->get_return_type()->create_column(); agg_function->insert_result_into(place2, *column_result); - auto& result = assert_cast(column_result->get_data()).get_data(); + auto& result = retention_result_data(*column_result); for (int i = 0; i < result.size(); i++) { if (i == 0) { EXPECT_EQ(result[i], 1); @@ -267,10 +275,9 @@ TEST_F(VRetentionTest, testSerialize) { agg_function->merge(place2, place3, arena); - auto column_result2 = - ColumnArray::create(((DataTypePtr)std::make_shared())->create_column()); + auto column_result2 = agg_function->get_return_type()->create_column(); agg_function->insert_result_into(place2, *column_result2); - auto& result2 = 
assert_cast(column_result2->get_data()).get_data(); + auto& result2 = retention_result_data(*column_result2); for (int i = 0; i < result2.size(); i++) { if (i == result2.size() - 1) { EXPECT_EQ(result2[i], 0); @@ -279,9 +286,9 @@ TEST_F(VRetentionTest, testSerialize) { } } - EXPECT_EQ(column_result2->get_offsets()[-1], 0); - EXPECT_EQ(column_result2->get_offsets()[0], 3); - EXPECT_EQ(column_result2->get_offsets().size(), 1); + EXPECT_EQ(retention_result_array(*column_result2).get_offsets()[-1], 0); + EXPECT_EQ(retention_result_array(*column_result2).get_offsets()[0], 3); + EXPECT_EQ(retention_result_array(*column_result2).get_offsets().size(), 1); agg_function->destroy(place2); agg_function->destroy(place3); diff --git a/be/test/exprs/function/cast/function_variant_cast_test.cpp b/be/test/exprs/function/cast/function_variant_cast_test.cpp index f48b213f86e524..960637bf1507d0 100644 --- a/be/test/exprs/function/cast/function_variant_cast_test.cpp +++ b/be/test/exprs/function/cast/function_variant_cast_test.cpp @@ -20,6 +20,7 @@ #include "common/status.h" #include "core/column/column_array.h" #include "core/column/column_decimal.h" +#include "core/column/column_nullable.h" #include "core/column/column_variant.h" #include "core/data_type/data_type_array.h" #include "core/data_type/data_type_decimal.h" @@ -132,15 +133,10 @@ TEST(FunctionVariantCast, CastToVariant) { { auto array_type = std::make_shared(std::make_shared()); auto variant_type = std::make_shared(); - auto array_col = - ColumnArray::create(ColumnInt32::create(), ColumnArray::ColumnOffsets::create()); - auto& data = assert_cast(array_col->get_data()); - auto& offsets = array_col->get_offsets(); - - data.insert(Field::create_field(1)); - data.insert(Field::create_field(2)); - data.insert(Field::create_field(3)); - offsets.push_back(3); + auto array_col = array_type->create_column(); + array_col->insert(Field::create_field( + Array {Field::create_field(1), Field::create_field(2), + Field::create_field(3)})); 
ColumnsWithTypeAndName arguments {{array_col->get_ptr(), array_type, "array_col"}, {nullptr, variant_type, "variant_type"}}; @@ -248,9 +244,7 @@ TEST(FunctionVariantCast, CastFromVariant) { auto variant_col = ColumnVariant::create(0, false); // Create a variant column with array values - variant_col->create_root( - array_type, - ColumnArray::create(ColumnInt32::create(), ColumnArray::ColumnOffsets::create())); + variant_col->create_root(array_type, array_type->create_column()); MutableColumnPtr data = variant_col->get_root(); Field a = Field::create_field(Array {Field::create_field(1), @@ -279,11 +273,14 @@ TEST(FunctionVariantCast, CastFromVariant) { const auto* array_result = assert_cast(remove_nullable(result_col).get()); ASSERT_EQ(array_result->size(), 1); - const auto& result_data = assert_cast(array_result->get_data()); + const auto& result_nullable = assert_cast(array_result->get_data()); + const auto& result_data = + assert_cast(result_nullable.get_nested_column()); ASSERT_EQ(result_data.size(), 3); ASSERT_EQ(result_data.get_element(0), 1); ASSERT_EQ(result_data.get_element(1), 2); ASSERT_EQ(result_data.get_element(2), 3); + ASSERT_FALSE(result_nullable.has_null()); } } From 53830dfdbd1b4b809065283a86171558ca74a451 Mon Sep 17 00:00:00 2001 From: zhaochangle Date: Sat, 23 May 2026 01:18:49 +0800 Subject: [PATCH 11/11] [fix](be) Adapt scoped rowid reads to batch API ### What problem does this PR solve? Issue Number: close #xxx Related PR: #63001 Problem Summary: After rebasing onto upstream master, Segment::seek_and_read_by_rowid expects a sorted vector of row ids instead of a single row id. The COW conflict resolution kept scoped block/column ownership but still called the old single-row form in rowid fetch and point query paths, causing BE compilation to fail. This updates both call sites to pass the batched row id vector while preserving scoped column ownership and restore-on-error behavior. 
### Release note None ### Check List (For Author) - Test: Unit Test - ./build.sh --be - PATH=/tmp/codex-clang-format-16:/mnt/disk3/zhaochangle/.opencode/bin:/mnt/disk3/zhaochangle/.local/bin:/mnt/disk6/common/apache-maven-3.9.14/bin:/mnt/disk6/common/ldb_toolchain_028/bin:/mnt/disk6/common/jdk-17.0.16/bin:/mnt/disk6/common/node-v24.14.1-linux-x64/bin:/mnt/disk3/zhaochangle/.bun/bin:/mnt/disk3/zhaochangle/.opencode/bin:/mnt/disk3/zhaochangle/.local/bin:/mnt/disk6/common/apache-maven-3.9.14/bin:/mnt/disk6/common/ldb_toolchain_028/bin:/mnt/disk6/common/jdk-17.0.16/bin:/mnt/disk6/common/node-v24.14.1-linux-x64/bin:/mnt/disk6/common/apache-maven-3.9.14/bin:/mnt/disk6/common/ldb_toolchain_028/bin:/mnt/disk6/common/jdk-17.0.16/bin:/mnt/disk6/common/node-v24.14.1-linux-x64/bin:/mnt/disk3/zhaochangle/.codex/tmp/arg0/codex-arg0DFZQLQ:/mnt/disk6/common/node-v24.14.1-linux-x64/lib/node_modules/@openai/codex/node_modules/@openai/codex-linux-x64/vendor/x86_64-unknown-linux-musl/codex-path:/mnt/disk3/zhaochangle/.bun/bin:/mnt/disk3/zhaochangle/.opencode/bin:/mnt/disk3/zhaochangle/.local/bin:/mnt/disk6/common/apache-maven-3.9.14/bin:/mnt/disk6/common/ldb_toolchain_028/bin:/mnt/disk6/common/jdk-17.0.16/bin:/mnt/disk6/common/node-v24.14.1-linux-x64/bin:/mnt/disk6/common/apache-maven-3.9.14/bin:/mnt/disk6/common/ldb_toolchain_028/bin:/mnt/disk6/common/jdk-17.0.16/bin:/mnt/disk6/common/node-v24.14.1-linux-x64/bin:/mnt/disk3/zhaochangle/.opencode/bin:/mnt/disk3/zhaochangle/.local/bin:/mnt/disk3/zhaochangle/.local/bin:/mnt/disk3/zhaochangle/bin:/mnt/disk6/common/apache-maven-3.9.14/bin:/mnt/disk6/common/ldb_toolchain_028/bin:/mnt/disk6/common/jdk-17.0.16/bin:/mnt/disk6/common/node-v24.14.1-linux-x64/bin:/usr/share/Modules/bin:/usr/lib64/ccache:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/opt/tableau/tableau_server/packages/customer-bin.20251.25.0520.1026 build-support/check-format.sh - ./run-be-ut.sh --run 
--filter='FunctionVariantCast.*:AggregateFunctionArrayAggTest.*:VRetentionTest.*:SchemaUtilTest.TestArrayDimensions:SchemaUtilTest.TestCastColumnEdgeCases:DataTypeArrayTest.CreateColumnUsesNullableNestedColumn:AIFunctionTest.AIMaskTest:AIFunctionTest.AIExtractTest:AIFunctionTest.AIClassifyTest' - ./run-be-ut.sh --run --filter='AggGroupArrayIntersectTest.*:TableFunctionOperatorTest.block_fast_path_explode*' - ./run-be-ut.sh --run --filter='BlockTest.ClearSelectedColumnDataClonesSharedColumn:BlockTest.ClearColumnDataPropagatesSharedCloneEmptyFailure:BlockTest.ClearSelectedColumnDataPropagatesSharedCloneEmptyFailure:BlockTest.ScopedMutable*' - Behavior changed: No - Does this need documentation: No --- be/src/exec/rowid_fetcher.cpp | 8 +++----- be/src/service/point_query_executor.cpp | 9 +++++---- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/be/src/exec/rowid_fetcher.cpp b/be/src/exec/rowid_fetcher.cpp index dc6d7822d354be..4322842792b9f6 100644 --- a/be/src/exec/rowid_fetcher.cpp +++ b/be/src/exec/rowid_fetcher.cpp @@ -1157,11 +1157,9 @@ Status RowIdStorageReader::read_doris_format_row( iterator_item.storage_read_options.stats = &stats; iterator_item.storage_read_options.io_ctx.reader_type = ReaderType::READER_QUERY; } - for (auto row_id : row_ids) { - RETURN_IF_ERROR(segment->seek_and_read_by_rowid( - full_read_schema, &slots[x], row_id, column, - iterator_item.storage_read_options, iterator_item.iterator)); - } + RETURN_IF_ERROR(segment->seek_and_read_by_rowid( + full_read_schema, &slots[x], row_ids, column, + iterator_item.storage_read_options, iterator_item.iterator)); } } return Status::OK(); diff --git a/be/src/service/point_query_executor.cpp b/be/src/service/point_query_executor.cpp index 5cc80919107632..433cabb777defe 100644 --- a/be/src/service/point_query_executor.cpp +++ b/be/src/service/point_query_executor.cpp @@ -560,16 +560,17 @@ Status PointQueryExecutor::_lookup_row_data() { const auto& segment = *it; for (int cid : 
_reusable->missing_col_uids()) { int pos = _reusable->get_col_uid_to_idx().at(cid); - auto row_id = static_cast(row_loc.row_id); + std::vector row_ids { + static_cast(row_loc.row_id)}; auto& column = result_columns[pos]; std::unique_ptr iter; SlotDescriptor* slot = _reusable->tuple_desc()->slots()[pos]; StorageReadOptions storage_read_options; storage_read_options.stats = &_read_stats; storage_read_options.io_ctx.reader_type = ReaderType::READER_QUERY; - auto st = - segment->seek_and_read_by_rowid(*_tablet->tablet_schema(), slot, row_id, - column, storage_read_options, iter); + auto st = segment->seek_and_read_by_rowid(*_tablet->tablet_schema(), slot, + row_ids, column, storage_read_options, + iter); if (st.ok() && _tablet->tablet_schema() ->column_by_uid(slot->col_unique_id()) .has_char_type()) {