Skip to content

Commit ac6b984

Browse files
committed
Fix some test bugs for external tables.
1 parent fbfcc16 commit ac6b984

File tree

8 files changed

+274
-168
lines changed

8 files changed

+274
-168
lines changed

be/src/vec/exec/format/orc/vorc_reader.cpp

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1984,12 +1984,9 @@ Status OrcReader::_fill_doris_data_column(const std::string& col_name,
19841984
}
19851985
} else {
19861986
// Normal processing: convert ORC column to Doris column
1987-
auto status = _orc_column_to_doris_column<false>(
1987+
RETURN_IF_ERROR(_orc_column_to_doris_column<false>(
19881988
key_col_name, doris_key_column, doris_key_type, root_node->get_key_node(),
1989-
orc_key_type, orc_map->keys.get(), element_size);
1990-
if (!status.ok()) {
1991-
return status;
1992-
}
1989+
orc_key_type, orc_map->keys.get(), element_size));
19931990
}
19941991

19951992
// Handle value column: if still missing, fill with default values
@@ -2005,14 +2002,14 @@ Status OrcReader::_fill_doris_data_column(const std::string& col_name,
20052002
} else {
20062003
mutable_value_column->insert_many_defaults(element_size);
20072004
}
2008-
return Status::OK();
20092005
} else {
20102006
// Normal processing: convert ORC column to Doris column
2011-
return _orc_column_to_doris_column<false>(value_col_name, doris_value_column,
2012-
doris_value_type, root_node->get_value_node(),
2013-
orc_value_type, orc_map->elements.get(),
2014-
element_size);
2007+
RETURN_IF_ERROR(_orc_column_to_doris_column<false>(
2008+
value_col_name, doris_value_column, doris_value_type,
2009+
root_node->get_value_node(), orc_value_type, orc_map->elements.get(),
2010+
element_size));
20152011
}
2012+
return doris_map.deduplicate_keys();
20162013
}
20172014
case PrimitiveType::TYPE_STRUCT: {
20182015
if (orc_column_type->getKind() != orc::TypeKind::STRUCT) {

be/src/vec/exec/format/table/hive/hive_orc_nested_column_utils.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,27 @@ void HiveOrcNestedColumnUtils::extract_nested_column_ids(
9595
} else if (i == 1) {
9696
child_field_name = "VALUES";
9797
}
98+
99+
// Special handling for Orc MAP structure:
100+
// When accessing only VALUES, we still need KEY structure for levels
101+
// Check if we're at key child (i==0) and only VALUES is requested (no KEYS)
102+
if (i == 0) {
103+
bool has_keys_access = child_paths_by_table_col_name.find("KEYS") !=
104+
child_paths_by_table_col_name.end();
105+
bool has_values_access = child_paths_by_table_col_name.find("VALUES") !=
106+
child_paths_by_table_col_name.end();
107+
108+
// If only VALUES is accessed (not KEYS), still include key structure for deduplicate_keys
109+
if (!has_keys_access && has_values_access) {
110+
uint64_t key_start_id = child->getColumnId();
111+
uint64_t key_max_id = child->getMaximumColumnId();
112+
for (uint64_t id = key_start_id; id <= key_max_id; ++id) {
113+
column_ids.insert(id);
114+
}
115+
has_child_columns = true;
116+
continue; // Skip further processing of key child
117+
}
118+
}
98119
break;
99120
default:
100121
child_field_name = "";

be/src/vec/exec/format/table/hive_reader.cpp

Lines changed: 17 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -166,8 +166,6 @@ ColumnIdResult HiveOrcReader::_create_column_ids(const orc::Type* orc_type,
166166
}
167167
const orc::Type* orc_field = it->second;
168168

169-
const auto& all_access_paths = slot->all_access_paths();
170-
171169
// primitive (non-nested) types: direct mapping by name
172170
if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
173171
slot->col_type() != TYPE_MAP)) {
@@ -179,13 +177,13 @@ ColumnIdResult HiveOrcReader::_create_column_ids(const orc::Type* orc_type,
179177
}
180178

181179
// complex types:
182-
183-
// collect and process all_access_paths -> column_ids
180+
const auto& all_access_paths = slot->all_access_paths();
184181
process_access_paths(orc_field, all_access_paths, column_ids);
185182

186-
// collect and process predicate_access_paths -> filter_column_ids
187183
const auto& predicate_access_paths = slot->predicate_access_paths();
188-
process_access_paths(orc_field, predicate_access_paths, filter_column_ids);
184+
if (!predicate_access_paths.empty()) {
185+
process_access_paths(orc_field, predicate_access_paths, filter_column_ids);
186+
}
189187
}
190188

191189
return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
@@ -261,8 +259,6 @@ ColumnIdResult HiveOrcReader::_create_column_ids_by_top_level_col_index(
261259
}
262260
const orc::Type* orc_field = it->second;
263261

264-
const auto& all_access_paths = slot->all_access_paths();
265-
266262
// primitive (non-nested) types: direct mapping by pos
267263
if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
268264
slot->col_type() != TYPE_MAP)) {
@@ -273,14 +269,14 @@ ColumnIdResult HiveOrcReader::_create_column_ids_by_top_level_col_index(
273269
continue;
274270
}
275271

272+
const auto& all_access_paths = slot->all_access_paths();
276273
// complex types
277-
278-
// collect and process all_access_paths -> column_ids
279274
process_access_paths(orc_field, all_access_paths, column_ids);
280275

281-
// collect and process predicate_access_paths -> filter_column_ids
282276
const auto& predicate_access_paths = slot->predicate_access_paths();
283-
process_access_paths(orc_field, predicate_access_paths, filter_column_ids);
277+
if (!predicate_access_paths.empty()) {
278+
process_access_paths(orc_field, predicate_access_paths, filter_column_ids);
279+
}
284280
}
285281

286282
return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
@@ -438,8 +434,6 @@ ColumnIdResult HiveParquetReader::_create_column_ids(const FieldDescriptor* fiel
438434
}
439435
auto field_schema = it->second;
440436

441-
const auto& all_access_paths = slot->all_access_paths();
442-
443437
// primitive (non-nested) types: direct mapping by name
444438
if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
445439
slot->col_type() != TYPE_MAP)) {
@@ -452,13 +446,13 @@ ColumnIdResult HiveParquetReader::_create_column_ids(const FieldDescriptor* fiel
452446
}
453447

454448
// complex types:
455-
456-
// collect and process all_access_paths -> column_ids
449+
const auto& all_access_paths = slot->all_access_paths();
457450
process_access_paths(field_schema, all_access_paths, column_ids);
458451

459-
// collect and process predicate_access_paths -> filter_column_ids
460452
const auto& predicate_access_paths = slot->predicate_access_paths();
461-
process_access_paths(field_schema, predicate_access_paths, filter_column_ids);
453+
if (!predicate_access_paths.empty()) {
454+
process_access_paths(field_schema, predicate_access_paths, filter_column_ids);
455+
}
462456
}
463457

464458
return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
@@ -538,8 +532,6 @@ ColumnIdResult HiveParquetReader::_create_column_ids_by_top_level_col_index(
538532
}
539533
auto field_schema = it->second;
540534

541-
const auto& all_access_paths = slot->all_access_paths();
542-
543535
// primitive (non-nested) types: direct mapping by position
544536
if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
545537
slot->col_type() != TYPE_MAP)) {
@@ -551,12 +543,14 @@ ColumnIdResult HiveParquetReader::_create_column_ids_by_top_level_col_index(
551543
continue;
552544
}
553545

554-
// collect and process all_access_paths -> column_ids
546+
// complex types:
547+
const auto& all_access_paths = slot->all_access_paths();
555548
process_access_paths(field_schema, all_access_paths, column_ids);
556549

557-
// collect and process predicate_access_paths -> filter_column_ids
558550
const auto& predicate_access_paths = slot->predicate_access_paths();
559-
process_access_paths(field_schema, predicate_access_paths, filter_column_ids);
551+
if (!predicate_access_paths.empty()) {
552+
process_access_paths(field_schema, predicate_access_paths, filter_column_ids);
553+
}
560554
}
561555

562556
return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));

be/src/vec/exec/format/table/iceberg/iceberg_orc_nested_column_utils.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,26 @@ void IcebergOrcNestedColumnUtils::extract_nested_column_ids(
9797
} else if (i == 1) {
9898
child_field_id = "VALUES";
9999
}
100+
// Special handling for Orc MAP structure:
101+
// When accessing only VALUES, we still need KEY structure for levels
102+
// Check if we're at key child (i==0) and only VALUES is requested (no KEYS)
103+
if (i == 0) {
104+
bool has_keys_access =
105+
child_paths_by_field_id.find("KEYS") != child_paths_by_field_id.end();
106+
bool has_values_access =
107+
child_paths_by_field_id.find("VALUES") != child_paths_by_field_id.end();
108+
109+
// If only VALUES is accessed (not KEYS), still include key structure for deduplicate_keys
110+
if (!has_keys_access && has_values_access) {
111+
uint64_t key_start_id = child->getColumnId();
112+
uint64_t key_max_id = child->getMaximumColumnId();
113+
for (uint64_t id = key_start_id; id <= key_max_id; ++id) {
114+
column_ids.insert(id);
115+
}
116+
has_child_columns = true;
117+
continue; // Skip further processing of key child
118+
}
119+
}
100120
break;
101121
default:
102122
child_field_id = "";

be/src/vec/exec/format/table/iceberg/iceberg_parquet_nested_column_utils.cpp

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,10 @@ void IcebergParquetNestedColumnUtils::extract_nested_column_ids(
114114
uint64_t key_start_id = child.get_column_id();
115115
uint64_t key_max_id = child.get_max_column_id();
116116
for (uint64_t id = key_start_id; id <= key_max_id; ++id) {
117-
column_ids.insert(id);
117+
auto inserted = column_ids.insert(id);
118+
if (inserted.second) {
119+
std::cout << "[IcebergNested] added column id: " << id << std::endl;
120+
}
118121
}
119122
has_child_columns = true;
120123
continue; // Skip further processing of key child
@@ -144,7 +147,10 @@ void IcebergParquetNestedColumnUtils::extract_nested_column_ids(
144147
uint64_t start_id = child.get_column_id();
145148
uint64_t max_column_id = child.get_max_column_id();
146149
for (uint64_t id = start_id; id <= max_column_id; ++id) {
147-
column_ids.insert(id);
150+
auto inserted = column_ids.insert(id);
151+
if (inserted.second) {
152+
std::cout << "[IcebergNested] added column id: " << id << std::endl;
153+
}
148154
}
149155
has_child_columns = true;
150156
} else {
@@ -166,7 +172,11 @@ void IcebergParquetNestedColumnUtils::extract_nested_column_ids(
166172
// This ensures parent struct/container nodes are included when their children are needed
167173
if (has_child_columns) {
168174
// Set automatically handles deduplication, so no need to check if it already exists
169-
column_ids.insert(field_schema.get_column_id());
175+
auto inserted = column_ids.insert(field_schema.get_column_id());
176+
if (inserted.second) {
177+
std::cout << "[IcebergNested] added parent column id: " << field_schema.get_column_id()
178+
<< std::endl;
179+
}
170180
}
171181
}
172182

be/src/vec/exec/format/table/iceberg_reader.cpp

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -544,8 +544,6 @@ ColumnIdResult IcebergParquetReader::_create_column_ids(const FieldDescriptor* f
544544
}
545545
auto field_schema = it->second;
546546

547-
const auto& all_access_paths = slot->all_access_paths();
548-
549547
// primitive (non-nested) types: direct mapping by name
550548
if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
551549
slot->col_type() != TYPE_MAP)) {
@@ -558,13 +556,13 @@ ColumnIdResult IcebergParquetReader::_create_column_ids(const FieldDescriptor* f
558556
}
559557

560558
// complex types:
561-
562-
// collect and process all_access_paths -> column_ids
559+
const auto& all_access_paths = slot->all_access_paths();
563560
process_access_paths(field_schema, all_access_paths, column_ids);
564561

565-
// collect and process predicate_access_paths -> filter_column_ids
566562
const auto& predicate_access_paths = slot->predicate_access_paths();
567-
process_access_paths(field_schema, predicate_access_paths, filter_column_ids);
563+
if (!predicate_access_paths.empty()) {
564+
process_access_paths(field_schema, predicate_access_paths, filter_column_ids);
565+
}
568566
}
569567
return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
570568
}
@@ -737,8 +735,6 @@ ColumnIdResult IcebergOrcReader::_create_column_ids(const orc::Type* orc_type,
737735
}
738736
const orc::Type* orc_field = it->second;
739737

740-
const auto& all_access_paths = slot->all_access_paths();
741-
742738
// primitive (non-nested) types: direct mapping by name
743739
if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
744740
slot->col_type() != TYPE_MAP)) {
@@ -749,14 +745,14 @@ ColumnIdResult IcebergOrcReader::_create_column_ids(const orc::Type* orc_type,
749745
continue;
750746
}
751747

752-
// nested types:
753-
754-
// collect and process all_access_paths -> column_ids
748+
// complex types:
749+
const auto& all_access_paths = slot->all_access_paths();
755750
process_access_paths(orc_field, all_access_paths, column_ids);
756751

757-
// collect and process predicate_access_paths -> filter_column_ids
758752
const auto& predicate_access_paths = slot->predicate_access_paths();
759-
process_access_paths(orc_field, predicate_access_paths, filter_column_ids);
753+
if (!predicate_access_paths.empty()) {
754+
process_access_paths(orc_field, predicate_access_paths, filter_column_ids);
755+
}
760756
}
761757

762758
return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));

0 commit comments

Comments
 (0)