Skip to content

Commit 9de800c

Browse files
author
nullccxsy
committed
fix: fix comments, add duplicate field ID detection, update lowercase conversion function
- Implemented detection for duplicate field IDs, Names in visitors, returning kInvalidSchema error with message. - Updated lowercase conversion in NameToIdVisitor
1 parent f0dbff4 commit 9de800c

File tree

3 files changed

+77
-54
lines changed

3 files changed

+77
-54
lines changed

src/iceberg/schema.cc

Lines changed: 30 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
#include <format>
2424
#include <functional>
2525

26+
#include <iceberg/util/string_utils.h>
27+
2628
#include "iceberg/type.h"
2729
#include "iceberg/util/formatter.h" // IWYU pragma: keep
2830
#include "iceberg/util/macros.h"
@@ -46,10 +48,7 @@ class NameToIdVisitor {
4648
public:
4749
explicit NameToIdVisitor(
4850
std::unordered_map<std::string, int32_t>& name_to_id, bool case_sensitive_ = true,
49-
std::function<std::string(std::string_view)> quoting_func_ =
50-
[](std::string_view s) { return std::string(s); });
51-
~NameToIdVisitor();
52-
51+
std::function<std::string(std::string_view)> quoting_func = {});
5352
Status Visit(const ListType& type, const std::string& path,
5453
const std::string& short_path);
5554
Status Visit(const MapType& type, const std::string& path,
@@ -58,16 +57,16 @@ class NameToIdVisitor {
5857
const std::string& short_path);
5958
Status Visit(const PrimitiveType& type, const std::string& path,
6059
const std::string& short_path);
60+
void Finish();
6161

6262
private:
6363
std::string BuildPath(std::string_view prefix, std::string_view field_name,
6464
bool case_sensitive);
65-
void Merge();
6665

6766
private:
6867
bool case_sensitive_;
6968
std::unordered_map<std::string, int32_t>& name_to_id_;
70-
std::unordered_map<std::string, int32_t> shortname_to_id_;
69+
std::unordered_map<std::string, int32_t> short_name_to_id_;
7170
std::function<std::string(std::string_view)> quoting_func_;
7271
};
7372

@@ -98,9 +97,7 @@ Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFie
9897
return FindFieldById(it->second);
9998
}
10099
ICEBERG_RETURN_UNEXPECTED(InitLowerCaseNameToIdMap());
101-
std::string lower_name(name);
102-
std::ranges::transform(lower_name, lower_name.begin(), ::tolower);
103-
auto it = lowercase_name_to_id_.find(lower_name);
100+
auto it = lowercase_name_to_id_.find(StringUtils::ToLower(name));
104101
if (it == lowercase_name_to_id_.end()) return std::nullopt;
105102
return FindFieldById(it->second);
106103
}
@@ -121,6 +118,7 @@ Status Schema::InitNameToIdMap() const {
121118
NameToIdVisitor visitor(name_to_id_, /*case_sensitive=*/true);
122119
ICEBERG_RETURN_UNEXPECTED(
123120
VisitTypeInline(*this, &visitor, /*path=*/"", /*short_path=*/""));
121+
visitor.Finish();
124122
return {};
125123
}
126124

@@ -131,6 +129,7 @@ Status Schema::InitLowerCaseNameToIdMap() const {
131129
NameToIdVisitor visitor(lowercase_name_to_id_, /*case_sensitive=*/false);
132130
ICEBERG_RETURN_UNEXPECTED(
133131
VisitTypeInline(*this, &visitor, /*path=*/"", /*short_path=*/""));
132+
visitor.Finish();
134133
return {};
135134
}
136135

@@ -159,7 +158,10 @@ Status IdToFieldVisitor::VisitNestedType(const Type& type) {
159158
const auto& nested = iceberg::internal::checked_cast<const NestedType&>(type);
160159
const auto& fields = nested.fields();
161160
for (const auto& field : fields) {
162-
id_to_field_.emplace(field.field_id(), std::cref(field));
161+
auto it = id_to_field_.try_emplace(field.field_id(), std::cref(field));
162+
if (!it.second) {
163+
return InvalidSchema("Duplicate field id found: {}", field.field_id());
164+
}
163165
ICEBERG_RETURN_UNEXPECTED(Visit(*field.type()));
164166
}
165167
return {};
@@ -172,8 +174,6 @@ NameToIdVisitor::NameToIdVisitor(
172174
case_sensitive_(case_sensitive),
173175
quoting_func_(std::move(quoting_func)) {}
174176

175-
NameToIdVisitor::~NameToIdVisitor() { Merge(); }
176-
177177
Status NameToIdVisitor::Visit(const ListType& type, const std::string& path,
178178
const std::string& short_path) {
179179
const auto& field = type.fields()[0];
@@ -184,11 +184,12 @@ Status NameToIdVisitor::Visit(const ListType& type, const std::string& path,
184184
} else {
185185
new_short_path = BuildPath(short_path, field.name(), case_sensitive_);
186186
}
187-
auto it = name_to_id_.emplace(new_path, field.field_id());
187+
auto it = name_to_id_.try_emplace(new_path, field.field_id());
188188
if (!it.second) {
189-
return NotSupported("Duplicate path in name_to_id_: {}", new_path);
189+
return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}",
190+
it.first->first, it.first->second, field.field_id());
190191
}
191-
shortname_to_id_.emplace(new_short_path, field.field_id());
192+
short_name_to_id_.try_emplace(new_short_path, field.field_id());
192193
ICEBERG_RETURN_UNEXPECTED(
193194
VisitTypeInline(*field.type(), this, new_path, new_short_path));
194195
return {};
@@ -206,11 +207,12 @@ Status NameToIdVisitor::Visit(const MapType& type, const std::string& path,
206207
} else {
207208
new_short_path = BuildPath(short_path, field.name(), case_sensitive_);
208209
}
209-
auto it = name_to_id_.emplace(new_path, field.field_id());
210+
auto it = name_to_id_.try_emplace(new_path, field.field_id());
210211
if (!it.second) {
211-
return NotSupported("Duplicate path in name_to_id_: {}", new_path);
212+
return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}",
213+
it.first->first, it.first->second, field.field_id());
212214
}
213-
shortname_to_id_.emplace(new_short_path, field.field_id());
215+
short_name_to_id_.try_emplace(new_short_path, field.field_id());
214216
ICEBERG_RETURN_UNEXPECTED(
215217
VisitTypeInline(*field.type(), this, new_path, new_short_path));
216218
}
@@ -224,11 +226,12 @@ Status NameToIdVisitor::Visit(const StructType& type, const std::string& path,
224226
for (const auto& field : fields) {
225227
new_path = BuildPath(path, field.name(), case_sensitive_);
226228
new_short_path = BuildPath(short_path, field.name(), case_sensitive_);
227-
auto it = name_to_id_.emplace(new_path, field.field_id());
229+
auto it = name_to_id_.try_emplace(new_path, field.field_id());
228230
if (!it.second) {
229-
return NotSupported("Duplicate path in name_to_id_: {}", new_path);
231+
return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}",
232+
it.first->first, it.first->second, field.field_id());
230233
}
231-
shortname_to_id_.emplace(new_short_path, field.field_id());
234+
short_name_to_id_.try_emplace(new_short_path, field.field_id());
232235
ICEBERG_RETURN_UNEXPECTED(
233236
VisitTypeInline(*field.type(), this, new_path, new_short_path));
234237
}
@@ -251,16 +254,14 @@ std::string NameToIdVisitor::BuildPath(std::string_view prefix,
251254
if (case_sensitive) {
252255
return prefix.empty() ? quoted_name : std::string(prefix) + "." + quoted_name;
253256
}
254-
std::string lower_name = quoted_name;
255-
std::ranges::transform(lower_name, lower_name.begin(), ::tolower);
256-
return prefix.empty() ? lower_name : std::string(prefix) + "." + lower_name;
257+
return prefix.empty() ? StringUtils::ToLower(quoted_name)
258+
: std::string(prefix) + "." + StringUtils::ToLower(quoted_name);
259+
;
257260
}
258261

259-
void NameToIdVisitor::Merge() {
260-
for (const auto& it : shortname_to_id_) {
261-
if (name_to_id_.find(it.first) == name_to_id_.end()) {
262-
name_to_id_[it.first] = it.second;
263-
}
262+
void NameToIdVisitor::Finish() {
263+
for (auto&& [path, id] : short_name_to_id_) {
264+
name_to_id_.try_emplace(path, id);
264265
}
265266
}
266267

src/iceberg/schema.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ class ICEBERG_EXPORT Schema : public StructType {
5555

5656
[[nodiscard]] std::string ToString() const override;
5757

58-
///\brief Find thd SchemaField By field name
58+
/// \brief Find thd SchemaField By field name
5959
///
6060
/// Short names for maps and lists are included for any name that does not conflict with
6161
/// a canonical name. For example, a list, 'l', of structs with field 'x' will produce

test/schema_test.cc

Lines changed: 46 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424

2525
#include <gmock/gmock.h>
2626
#include <gtest/gtest.h>
27-
#include <iceberg/result.h>
2827

2928
#include "iceberg/schema_field.h"
3029
#include "iceberg/util/formatter.h" // IWYU pragma: keep
@@ -83,14 +82,14 @@ TEST(SchemaTest, Equality) {
8382
ASSERT_EQ(schema5, schema1);
8483
}
8584

86-
class NestedTypeTest : public ::testing::Test {
85+
class BasicShortNameTest : public ::testing::Test {
8786
protected:
8887
void SetUp() override {
8988
field1_ = std::make_unique<iceberg::SchemaField>(1, "Foo", iceberg::int32(), true);
9089
field2_ = std::make_unique<iceberg::SchemaField>(2, "Bar", iceberg::string(), true);
9190
field3_ = std::make_unique<iceberg::SchemaField>(3, "Foobar", iceberg::int32(), true);
9291

93-
iceberg::StructType structtype = iceberg::StructType(
92+
auto structtype = iceberg::StructType(
9493
std::vector<iceberg::SchemaField>{*field1_, *field2_, *field3_});
9594

9695
auto listype = iceberg::ListType(iceberg::SchemaField::MakeRequired(
@@ -123,7 +122,7 @@ class NestedTypeTest : public ::testing::Test {
123122
std::unique_ptr<iceberg::SchemaField> field7_;
124123
};
125124

126-
TEST_F(NestedTypeTest, TestFindById) {
125+
TEST_F(BasicShortNameTest, TestFindById) {
127126
ASSERT_THAT(schema_->FindFieldById(7), ::testing::Optional(*field7_));
128127
ASSERT_THAT(schema_->FindFieldById(6), ::testing::Optional(*field6_));
129128
ASSERT_THAT(schema_->FindFieldById(5), ::testing::Optional(*field5_));
@@ -135,7 +134,7 @@ TEST_F(NestedTypeTest, TestFindById) {
135134
ASSERT_THAT(schema_->FindFieldById(10), ::testing::Optional(std::nullopt));
136135
}
137136

138-
TEST_F(NestedTypeTest, TestFindByName) {
137+
TEST_F(BasicShortNameTest, TestFindByName) {
139138
ASSERT_THAT(schema_->FindFieldByName("Value"), ::testing::Optional(*field7_));
140139
ASSERT_THAT(schema_->FindFieldByName("Value.value"), ::testing::Optional(*field6_));
141140
ASSERT_THAT(schema_->FindFieldByName("Value.key"), ::testing::Optional(*field5_));
@@ -152,7 +151,7 @@ TEST_F(NestedTypeTest, TestFindByName) {
152151
::testing::Optional(std::nullopt));
153152
}
154153

155-
TEST_F(NestedTypeTest, TestFindByNameCaseInsensitive) {
154+
TEST_F(BasicShortNameTest, TestFindByNameCaseInsensitive) {
156155
ASSERT_THAT(schema_->FindFieldByName("vALue", false), ::testing::Optional(*field7_));
157156
ASSERT_THAT(schema_->FindFieldByName("vALue.VALUE", false),
158157
::testing::Optional(*field6_));
@@ -170,7 +169,7 @@ TEST_F(NestedTypeTest, TestFindByNameCaseInsensitive) {
170169
::testing::Optional(std::nullopt));
171170
}
172171

173-
TEST_F(NestedTypeTest, TestFindByShortNameCaseInsensitive) {
172+
TEST_F(BasicShortNameTest, TestFindByShortNameCaseInsensitive) {
174173
ASSERT_THAT(schema_->FindFieldByName("vaLue.value.FOO", false),
175174
::testing::Optional(*field1_));
176175
ASSERT_THAT(schema_->FindFieldByName("Value.value.Bar", false),
@@ -181,21 +180,21 @@ TEST_F(NestedTypeTest, TestFindByShortNameCaseInsensitive) {
181180
::testing::Optional(std::nullopt));
182181
}
183182

184-
class NestType2Test : public ::testing::Test {
183+
class ComplexShortNameTest : public ::testing::Test {
185184
protected:
186185
void SetUp() override {
187186
field1_ = std::make_unique<iceberg::SchemaField>(1, "Foo", iceberg::int32(), true);
188187
field2_ = std::make_unique<iceberg::SchemaField>(2, "Bar", iceberg::string(), true);
189188
field3_ = std::make_unique<iceberg::SchemaField>(3, "Foobar", iceberg::int32(), true);
190189

191-
iceberg::StructType structtype = iceberg::StructType({*field1_, *field2_, *field3_});
190+
auto structtype = iceberg::StructType({*field1_, *field2_, *field3_});
192191

193192
field4_ = std::make_unique<iceberg::SchemaField>(
194193
4, "element", std::make_shared<iceberg::StructType>(structtype), false);
195194

196195
auto listype = iceberg::ListType(*field4_);
197196

198-
iceberg::StructType structtype2 = iceberg::StructType(
197+
auto structtype2 = iceberg::StructType(
199198
{iceberg::SchemaField::MakeRequired(5, "First_child", iceberg::int32()),
200199
iceberg::SchemaField::MakeRequired(
201200
6, "Second_child", std::make_shared<iceberg::ListType>(listype))});
@@ -231,7 +230,7 @@ class NestType2Test : public ::testing::Test {
231230
std::unique_ptr<iceberg::SchemaField> field9_;
232231
};
233232

234-
TEST_F(NestType2Test, TestFindById) {
233+
TEST_F(ComplexShortNameTest, TestFindById) {
235234
ASSERT_THAT(schema_->FindFieldById(9), ::testing::Optional(*field9_));
236235
ASSERT_THAT(schema_->FindFieldById(8), ::testing::Optional(*field8_));
237236
ASSERT_THAT(schema_->FindFieldById(7), ::testing::Optional(*field7_));
@@ -245,7 +244,7 @@ TEST_F(NestType2Test, TestFindById) {
245244
ASSERT_THAT(schema_->FindFieldById(0), ::testing::Optional(std::nullopt));
246245
}
247246

248-
TEST_F(NestType2Test, TestFindByName) {
247+
TEST_F(ComplexShortNameTest, TestFindByName) {
249248
ASSERT_THAT(schema_->FindFieldByName("Map"), ::testing::Optional(*field9_));
250249
ASSERT_THAT(schema_->FindFieldByName("Map.value"), ::testing::Optional(*field8_));
251250
ASSERT_THAT(schema_->FindFieldByName("Map.key"), ::testing::Optional(*field7_));
@@ -265,7 +264,7 @@ TEST_F(NestType2Test, TestFindByName) {
265264
::testing::Optional(std::nullopt));
266265
}
267266

268-
TEST_F(NestType2Test, TestFindByNameCaseInsensitive) {
267+
TEST_F(ComplexShortNameTest, TestFindByNameCaseInsensitive) {
269268
ASSERT_THAT(schema_->FindFieldByName("map", false), ::testing::Optional(*field9_));
270269
ASSERT_THAT(schema_->FindFieldByName("map.vALUE", false),
271270
::testing::Optional(*field8_));
@@ -286,7 +285,7 @@ TEST_F(NestType2Test, TestFindByNameCaseInsensitive) {
286285
::testing::Optional(std::nullopt));
287286
}
288287

289-
TEST_F(NestType2Test, TestFindByShortName) {
288+
TEST_F(ComplexShortNameTest, TestFindByShortName) {
290289
ASSERT_THAT(schema_->FindFieldByName("Map.Second_child"),
291290
::testing::Optional(*field6_));
292291
ASSERT_THAT(schema_->FindFieldByName("Map.First_child"), ::testing::Optional(*field5_));
@@ -300,7 +299,7 @@ TEST_F(NestType2Test, TestFindByShortName) {
300299
::testing::Optional(std::nullopt));
301300
}
302301

303-
TEST_F(NestType2Test, TestFindByShortNameCaseInsensitive) {
302+
TEST_F(ComplexShortNameTest, TestFindByShortNameCaseInsensitive) {
304303
ASSERT_THAT(schema_->FindFieldByName("map.second_child", false),
305304
::testing::Optional(*field6_));
306305
ASSERT_THAT(schema_->FindFieldByName("map.first_child", false),
@@ -315,7 +314,7 @@ TEST_F(NestType2Test, TestFindByShortNameCaseInsensitive) {
315314
::testing::Optional(std::nullopt));
316315
}
317316

318-
class ComplexMapStructTest : public ::testing::Test {
317+
class ComplexMapStructShortNameTest : public ::testing::Test {
319318
protected:
320319
void SetUp() override {
321320
// Separate inner struct for key: {inner_key: int, inner_value: int}
@@ -400,7 +399,7 @@ class ComplexMapStructTest : public ::testing::Test {
400399
std::unique_ptr<iceberg::SchemaField> exp_field_a_;
401400
};
402401

403-
TEST_F(ComplexMapStructTest, TestFindById) {
402+
TEST_F(ComplexMapStructShortNameTest, TestFindById) {
404403
ASSERT_THAT(schema_->FindFieldById(20), ::testing::Optional(*exp_field_a_));
405404
ASSERT_THAT(schema_->FindFieldById(19), ::testing::Optional(*exp_map_value_));
406405
ASSERT_THAT(schema_->FindFieldById(18), ::testing::Optional(*exp_map_key_));
@@ -414,7 +413,7 @@ TEST_F(ComplexMapStructTest, TestFindById) {
414413
ASSERT_THAT(schema_->FindFieldById(10), ::testing::Optional(*exp_inner_key_key_));
415414
}
416415

417-
TEST_F(ComplexMapStructTest, TestFindByName) {
416+
TEST_F(ComplexMapStructShortNameTest, TestFindByName) {
418417
ASSERT_THAT(schema_->FindFieldByName("a"), ::testing::Optional(*exp_field_a_));
419418
ASSERT_THAT(schema_->FindFieldByName("a.key"), ::testing::Optional(*exp_map_key_));
420419
ASSERT_THAT(schema_->FindFieldByName("a.value"), ::testing::Optional(*exp_map_value_));
@@ -436,7 +435,7 @@ TEST_F(ComplexMapStructTest, TestFindByName) {
436435
::testing::Optional(*exp_inner_value_v_));
437436
}
438437

439-
TEST_F(ComplexMapStructTest, TestFindByNameCaseInsensitive) {
438+
TEST_F(ComplexMapStructShortNameTest, TestFindByNameCaseInsensitive) {
440439
ASSERT_THAT(schema_->FindFieldByName("A", false), ::testing::Optional(*exp_field_a_));
441440
ASSERT_THAT(schema_->FindFieldByName("A.KEY", false),
442441
::testing::Optional(*exp_map_key_));
@@ -460,7 +459,7 @@ TEST_F(ComplexMapStructTest, TestFindByNameCaseInsensitive) {
460459
::testing::Optional(*exp_inner_value_v_));
461460
}
462461

463-
TEST_F(ComplexMapStructTest, TestInvalidPaths) {
462+
TEST_F(ComplexMapStructShortNameTest, TestInvalidPaths) {
464463
ASSERT_THAT(schema_->FindFieldByName("a.invalid"), ::testing::Optional(std::nullopt));
465464
ASSERT_THAT(schema_->FindFieldByName("a.key.invalid"),
466465
::testing::Optional(std::nullopt));
@@ -480,9 +479,9 @@ TEST(SchemaTest, DuplicatePathErrorCaseSensitive) {
480479

481480
auto result = schema.FindFieldByName("a.b", /*case_sensitive=*/true);
482481
ASSERT_FALSE(result.has_value());
483-
EXPECT_EQ(result.error().kind, iceberg::ErrorKind::kNotSupported);
482+
EXPECT_EQ(result.error().kind, iceberg::ErrorKind::kInvalidSchema);
484483
EXPECT_THAT(result.error().message,
485-
::testing::HasSubstr("Duplicate path in name_to_id_: a.b"));
484+
::testing::HasSubstr("Duplicate path found: a.b, prev id: 2, curr id: 3"));
486485
}
487486

488487
TEST(SchemaTest, DuplicatePathErrorCaseInsensitive) {
@@ -495,7 +494,30 @@ TEST(SchemaTest, DuplicatePathErrorCaseInsensitive) {
495494

496495
auto result = schema.FindFieldByName("A.B", /*case_sensitive=*/false);
497496
ASSERT_FALSE(result.has_value());
498-
EXPECT_EQ(result.error().kind, iceberg::ErrorKind::kNotSupported);
497+
EXPECT_EQ(result.error().kind, iceberg::ErrorKind::kInvalidSchema);
499498
EXPECT_THAT(result.error().message,
500-
::testing::HasSubstr("Duplicate path in name_to_id_: a.b"));
499+
::testing::HasSubstr("Duplicate path found: a.b, prev id: 2, curr id: 3"));
500+
}
501+
502+
TEST(SchemaTest, NestedDuplicateFieldIdError) {
503+
// Outer struct with field ID 1
504+
iceberg::SchemaField outer_field(1, "outer", iceberg::int32(), true);
505+
506+
// Inner struct with duplicate field ID 1
507+
iceberg::SchemaField inner_field(1, "inner", iceberg::string(), true);
508+
auto inner_struct = iceberg::StructType({inner_field});
509+
510+
// Nested field with inner struct
511+
iceberg::SchemaField nested_field(
512+
2, "nested", std::make_shared<iceberg::StructType>(inner_struct), true);
513+
514+
// Schema with outer and nested fields
515+
iceberg::Schema schema({outer_field, nested_field}, 1);
516+
517+
// Attempt to find a field, which should trigger duplicate ID detection
518+
auto result = schema.FindFieldById(1);
519+
ASSERT_FALSE(result.has_value());
520+
EXPECT_EQ(result.error().kind, iceberg::ErrorKind::kInvalidSchema);
521+
EXPECT_THAT(result.error().message,
522+
::testing::HasSubstr("Duplicate field id found: 1"));
501523
}

0 commit comments

Comments
 (0)