Skip to content

Commit 5a38bd5

Browse files
committed
feat(parquet): add HasFieldIds check
1 parent 4fb89f8 commit 5a38bd5

File tree

5 files changed

+89
-8
lines changed

5 files changed

+89
-8
lines changed

src/iceberg/parquet/parquet_reader.cc

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,9 +61,7 @@ Result<SchemaProjection> BuildProjection(::parquet::arrow::FileReader* reader,
6161
const Schema& read_schema) {
6262
auto metadata = reader->parquet_reader()->metadata();
6363

64-
ICEBERG_ASSIGN_OR_RAISE(auto has_field_ids,
65-
HasFieldIds(metadata->schema()->schema_root()));
66-
if (!has_field_ids) {
64+
if (!HasFieldIds(metadata->schema()->schema_root())) {
6765
// TODO(gangwu): apply name mapping to Parquet schema
6866
return NotImplemented("Applying name mapping to Parquet schema is not implemented");
6967
}

src/iceberg/parquet/parquet_schema_util.cc

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,10 @@
1717
* under the License.
1818
*/
1919

20+
#include <parquet/schema.h>
21+
2022
#include "iceberg/parquet/parquet_schema_util_internal.h"
23+
#include "iceberg/util/checked_cast.h"
2124

2225
namespace iceberg::parquet {
2326

@@ -30,8 +33,21 @@ Result<std::vector<int>> SelectedColumnIndices(const SchemaProjection& projectio
3033
return NotImplemented("NYI");
3134
}
3235

33-
Result<bool> HasFieldIds(const ::parquet::schema::NodePtr& root_node) {
34-
return NotImplemented("NYI");
36+
// Reports whether `node`, or any node nested beneath it, carries a field ID.
// The search stops at the first node that has one, so a schema in which only
// some nodes have IDs still yields true.
bool HasFieldIds(const ::parquet::schema::NodePtr& node) {
37+
// A non-negative field_id() is treated as "this node has a field ID".
if (node->field_id() >= 0) {
38+
return true;
39+
}
40+
41+
// Only group nodes are descended into; each child is checked recursively.
if (node->is_group()) {
42+
auto group_node = internal::checked_pointer_cast<::parquet::schema::GroupNode>(node);
43+
for (int i = 0; i < group_node->field_count(); i++) {
44+
if (HasFieldIds(group_node->field(i))) {
45+
return true;
46+
}
47+
}
48+
}
49+
50+
// Neither this node nor any of its descendants had a field ID.
return false;
3551
}
3652

3753
} // namespace iceberg::parquet

src/iceberg/parquet/parquet_schema_util_internal.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,7 @@ Result<std::vector<int>> SelectedColumnIndices(const SchemaProjection& projectio
4747
/// \brief Check whether the Parquet schema has field IDs.
4848
///
4949
/// \param root_node The root node of the Parquet schema.
50-
/// \return True if the Parquet schema has field IDs, false otherwise. Return error if
51-
/// the Parquet schema has partial field IDs.
52-
Result<bool> HasFieldIds(const ::parquet::schema::NodePtr& root_node);
50+
/// \return True if at least one node in the schema has a field ID, false otherwise.
51+
bool HasFieldIds(const ::parquet::schema::NodePtr& root_node);
5352

5453
} // namespace iceberg::parquet

test/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,4 +116,6 @@ if(ICEBERG_BUILD_BUNDLE)
116116
SOURCES
117117
test_common.cc
118118
in_memory_catalog_test.cc)
119+
120+
add_iceberg_test(parquet_test USE_BUNDLE SOURCES parquet_schema_test.cc)
119121
endif()

test/parquet_schema_test.cc

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include <gtest/gtest.h>
21+
#include <parquet/schema.h>
22+
#include <parquet/types.h>
23+
24+
#include "iceberg/parquet/parquet_schema_util_internal.h"
25+
26+
namespace iceberg::parquet {
27+
28+
namespace {
29+
30+
// Test helper: builds a REQUIRED INT32 primitive node called `name` with no
// logical type. `field_id` defaults to -1, the value the tests in this file
// use for a node without a field ID.
::parquet::schema::NodePtr MakeInt32Node(const std::string& name, int field_id = -1) {
31+
return ::parquet::schema::PrimitiveNode::Make(
32+
name, ::parquet::Repetition::REQUIRED, ::parquet::LogicalType::None(),
33+
::parquet::Type::INT32, /*primitive_length=*/-1, field_id);
34+
}
35+
36+
// Test helper: builds a REQUIRED group node called `name` whose children are
// `fields`, with no logical type. `field_id` defaults to -1, the value the
// tests in this file use for a node without a field ID.
::parquet::schema::NodePtr MakeGroupNode(const std::string& name,
37+
const ::parquet::schema::NodeVector& fields,
38+
int field_id = -1) {
39+
return ::parquet::schema::GroupNode::Make(name, ::parquet::Repetition::REQUIRED, fields,
40+
/*logical_type=*/nullptr, field_id);
41+
}
42+
43+
} // namespace
44+
45+
// A lone primitive node: HasFieldIds is false with the default field ID (-1)
// and true once an explicit field ID is supplied.
TEST(HasFieldIds, PrimitiveNode) {
46+
EXPECT_FALSE(HasFieldIds(MakeInt32Node("test_field")));
47+
EXPECT_TRUE(HasFieldIds(MakeInt32Node("test_field", /*field_id=*/1)));
48+
}
49+
50+
// Group nodes: covers no IDs anywhere, IDs on every node, and IDs on only
// some nodes.
TEST(HasFieldIds, GroupNode) {
51+
// Neither the group nor its children have a field ID.
auto group_node_without_field_id =
52+
MakeGroupNode("test_group", {MakeInt32Node("c1"), MakeInt32Node("c2")});
53+
EXPECT_FALSE(HasFieldIds(group_node_without_field_id));
54+
55+
// The group and both children have field IDs.
auto group_node_with_full_field_id = MakeGroupNode(
56+
"test_group",
57+
{MakeInt32Node("c1", /*field_id=*/2), MakeInt32Node("c2", /*field_id=*/3)},
58+
/*field_id=*/1);
59+
EXPECT_TRUE(HasFieldIds(group_node_with_full_field_id));
60+
61+
// Only one child has a field ID; this is still expected to yield true.
auto group_node_with_partial_field_id = MakeGroupNode(
62+
"test_group", {MakeInt32Node("c1", /*field_id=*/1), MakeInt32Node("c2")});
63+
EXPECT_TRUE(HasFieldIds(group_node_with_partial_field_id));
64+
}
65+
66+
} // namespace iceberg::parquet

0 commit comments

Comments
 (0)