Skip to content

Commit c0e3da5

Browse files
Backport ClickHouse#87136 to 25.8: Fix maps and arrays field ids reading parquet
1 parent 041b0fa commit c0e3da5

File tree

11 files changed

+129
-9
lines changed

11 files changed

+129
-9
lines changed

src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1121,7 +1121,7 @@ static ColumnWithTypeAndName readNonNullableColumnFromArrowColumn(
11211121

11221122
auto nested_column = readColumnFromArrowColumn(arrow_nested_column,
11231123
column_name,
1124-
Nested::concatenateName(full_column_name, "list.element"),
1124+
full_column_name,
11251125
dictionary_infos,
11261126
nested_type_hint,
11271127
is_nested_nullable_column,
@@ -1211,12 +1211,9 @@ static ColumnWithTypeAndName readNonNullableColumnFromArrowColumn(
12111211
{
12121212
chassert(clickhouse_columns_to_parquet);
12131213

1214-
auto column_to_search = full_column_name;
1215-
if (column_to_search.ends_with("list.element"))
1216-
column_to_search = column_to_search.substr(0, column_to_search.size() - 13);
12171214
/// Full name of the parquet column.
12181215
/// For example, if the column name is "a" and the field name in the structure is "b", the full name will be "a.b".
1219-
auto full_name = clickhouse_columns_to_parquet->at(column_to_search);
1216+
auto full_name = clickhouse_columns_to_parquet->at(full_column_name);
12201217
full_name += "." + field_name;
12211218
if (auto it = parquet_columns_to_clickhouse->find(full_name); it != parquet_columns_to_clickhouse->end())
12221219
{

src/Processors/Formats/Impl/ArrowFieldIndexUtil.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,7 @@ class ArrowFieldIndexUtil
179179
const auto * map_type = static_cast<arrow::MapType *>(field_type.get());
180180
auto index_snapshot = current_start_index;
181181
current_start_index += countIndicesForType(map_type->key_type());
182-
calculateFieldIndices(*map_type->item_field(), field_name, current_start_index, result, name_prefix);
182+
calculateFieldIndices(*map_type->item_field(), Nested::concatenateName(field_name, "value"), current_start_index, result, name_prefix);
183183
index_info.first = index_snapshot;
184184
}
185185
else
@@ -217,7 +217,9 @@ class ArrowFieldIndexUtil
217217
if (clickhouse_to_parquet_names)
218218
{
219219
if (auto it = clickhouse_to_parquet_names->find(full_name); it != clickhouse_to_parquet_names->end())
220+
{
220221
full_name = it->second;
222+
}
221223
}
222224

223225
findRequiredIndices(Nested::concatenateName(name, field_name), full_name, header_index, field_type, field_indices, added_indices, required_indices, file, clickhouse_to_parquet_names);
@@ -233,7 +235,7 @@ class ArrowFieldIndexUtil
233235
else if (const auto * type_map = typeid_cast<const DB::DataTypeMap *>(nested_type.get()))
234236
{
235237
findRequiredIndices(name, transformed_name, header_index, type_map->getKeyType(), field_indices, added_indices, required_indices, file, clickhouse_to_parquet_names);
236-
findRequiredIndices(name, transformed_name, header_index, type_map->getValueType(), field_indices, added_indices, required_indices, file, clickhouse_to_parquet_names);
238+
findRequiredIndices(Nested::concatenateName(name, "value"), Nested::concatenateName(transformed_name, "value"), header_index, type_map->getValueType(), field_indices, added_indices, required_indices, file, clickhouse_to_parquet_names);
237239
return;
238240
}
239241
auto it = field_indices.find(transformed_name);

src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,15 @@ namespace ErrorCodes
6969

7070
namespace
7171
{
72+
String removeListElement(const String & value)
73+
{
74+
const String pattern = ".list.element";
75+
String result = value;
76+
size_t pos;
77+
while ((pos = result.find(pattern)) != std::string::npos)
78+
result.erase(pos, pattern.size());
79+
return result;
80+
}
7281

7382

7483
void traverseAllFields(const parquet::schema::NodePtr & node, std::unordered_map<Int64, String> & fields_mapping, const String & current_path = "")
@@ -80,7 +89,7 @@ void traverseAllFields(const parquet::schema::NodePtr & node, std::unordered_map
8089
traverseAllFields(group->field(i), fields_mapping, Nested::concatenateName(current_path, group->name()));
8190
}
8291
int field_id = node->field_id();
83-
fields_mapping[field_id] = Nested::concatenateName(current_path, node->name());
92+
fields_mapping[field_id] = removeListElement(Nested::concatenateName(current_path, node->name()));
8493
}
8594

8695
}

src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ void traverseComplexType(Poco::JSON::Object::Ptr type, std::unordered_map<String
7272
{
7373
auto element_id = type->getValue<Int64>(Iceberg::f_element_id);
7474
if (type->isObject(Iceberg::f_element))
75-
traverseComplexType(type->getObject(Iceberg::f_element), result, Nested::concatenateName(current_path, "list.element"));
75+
traverseComplexType(type->getObject(Iceberg::f_element), result, current_path);
7676
result[current_path] = element_id;
7777
return;
7878
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
2025-06-04 ('100022','2025-06-04 18:40:56.000000','2025-06-09 21:19:00.364000') 100022
2+
[1,2,3] {1:'Pavel',2:'Ivanov'} [(1,'pudge'),(2,'1000-7')]

tests/queries/0_stateless/03581_iceberg_struct_fields_ids.sql

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,4 @@
44
-- This table was created by spark, but with changed column names in parquet
55
-- The purpose of this test is to verify that a column with a structured type with different names in Parquet and Iceberg metadata will be read correctly.
66
SELECT * FROM icebergS3(s3_conn, filename='field_ids_struct_test', SETTINGS iceberg_metadata_table_uuid = '149ecc15-7afc-4311-86b3-3a4c8d4ec08e') ORDER BY ALL;
7+
SELECT * FROM icebergS3(s3_conn, filename='field_ids_complex_test', SETTINGS iceberg_metadata_table_uuid = 'd4b695ca-ceeb-4537-8a2a-eee90dc6e313') ORDER BY ALL;
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
{
2+
"format-version" : 2,
3+
"table-uuid" : "d4b695ca-ceeb-4537-8a2a-eee90dc6e313",
4+
"location" : "s3a://test/field_ids_struct_test/metadata/field_ids_complex_test",
5+
"last-sequence-number" : 1,
6+
"last-updated-ms" : 1757661733693,
7+
"last-column-id" : 9,
8+
"current-schema-id" : 0,
9+
"schemas" : [ {
10+
"type" : "struct",
11+
"schema-id" : 0,
12+
"fields" : [ {
13+
"id" : 1,
14+
"name" : "col-1",
15+
"required" : false,
16+
"type" : {
17+
"type" : "list",
18+
"element-id" : 4,
19+
"element" : "long",
20+
"element-required" : false
21+
}
22+
}, {
23+
"id" : 2,
24+
"name" : "col-2",
25+
"required" : false,
26+
"type" : {
27+
"type" : "map",
28+
"key-id" : 5,
29+
"key" : "long",
30+
"value-id" : 6,
31+
"value" : "string",
32+
"value-required" : false
33+
}
34+
}, {
35+
"id" : 3,
36+
"name" : "col-3",
37+
"required" : false,
38+
"type" : {
39+
"type" : "list",
40+
"element-id" : 7,
41+
"element" : {
42+
"type" : "struct",
43+
"fields" : [ {
44+
"id" : 8,
45+
"name" : "id",
46+
"required" : false,
47+
"type" : "long"
48+
}, {
49+
"id" : 9,
50+
"name" : "name",
51+
"required" : false,
52+
"type" : "string"
53+
} ]
54+
},
55+
"element-required" : false
56+
}
57+
} ]
58+
} ],
59+
"default-spec-id" : 0,
60+
"partition-specs" : [ {
61+
"spec-id" : 0,
62+
"fields" : [ ]
63+
} ],
64+
"last-partition-id" : 999,
65+
"default-sort-order-id" : 0,
66+
"sort-orders" : [ {
67+
"order-id" : 0,
68+
"fields" : [ ]
69+
} ],
70+
"properties" : {
71+
"owner" : "scanhex12",
72+
"write.parquet.compression-codec" : "zstd"
73+
},
74+
"current-snapshot-id" : 607752583403487091,
75+
"refs" : {
76+
"main" : {
77+
"snapshot-id" : 607752583403487091,
78+
"type" : "branch"
79+
}
80+
},
81+
"snapshots" : [ {
82+
"sequence-number" : 1,
83+
"snapshot-id" : 607752583403487091,
84+
"timestamp-ms" : 1757661733693,
85+
"summary" : {
86+
"operation" : "append",
87+
"spark.app.id" : "local-1757661643417",
88+
"added-data-files" : "1",
89+
"added-records" : "1",
90+
"added-files-size" : "1957",
91+
"changed-partition-count" : "1",
92+
"total-records" : "1",
93+
"total-files-size" : "1957",
94+
"total-data-files" : "1",
95+
"total-delete-files" : "0",
96+
"total-position-deletes" : "0",
97+
"total-equality-deletes" : "0"
98+
},
99+
"manifest-list" : "s3a://test/field_ids_struct_test/metadata/field_ids_complex_test/metadata/snap-607752583403487091-1-140c8dff-1d83-4841-bc40-9aa85205b555.avro",
100+
"schema-id" : 0
101+
} ],
102+
"statistics" : [ ],
103+
"partition-statistics" : [ ],
104+
"snapshot-log" : [ {
105+
"timestamp-ms" : 1757661733693,
106+
"snapshot-id" : 607752583403487091
107+
} ],
108+
"metadata-log" : [ ]
109+
}

0 commit comments

Comments
 (0)