diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index 675756681..c9fe7836c 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -23,6 +23,7 @@ set(ICEBERG_SOURCES expression/expression.cc file_reader.cc json_internal.cc + name_mapping.cc partition_field.cc partition_spec.cc schema.cc @@ -36,7 +37,6 @@ set(ICEBERG_SOURCES transform.cc transform_function.cc type.cc - snapshot.cc util/murmurhash3_internal.cc util/timepoint.cc) diff --git a/src/iceberg/name_mapping.cc b/src/iceberg/name_mapping.cc new file mode 100644 index 000000000..75a048260 --- /dev/null +++ b/src/iceberg/name_mapping.cc @@ -0,0 +1,267 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+#include "iceberg/name_mapping.h"
+
+#include <format>
+#include <sstream>
+
+#include "iceberg/util/formatter_internal.h"
+
+namespace iceberg {
+
+namespace {
+
+// Joins name parts into one dot-separated path, e.g. {"a", "b"} -> "a.b"; empty -> "".
+std::string JoinByDot(std::span<const std::string> parts) {
+  std::stringstream ss;
+  for (size_t i = 0; i < parts.size(); ++i) {
+    if (i > 0) {
+      ss << ".";
+    }
+    ss << parts[i];
+  }
+  return ss.str();
+}
+
+// Visitor that recursively collects every MappedField (nested ones too), keyed by id.
+struct IndexByIdVisitor {
+  std::unordered_map<int32_t, MappedFieldConstRef> field_by_id;
+
+  void Visit(const MappedField& field) {
+    field_by_id.emplace(field.field_id, std::cref(field));  // first entry per id wins
+    if (field.nested_mapping != nullptr) {
+      Visit(*field.nested_mapping);
+    }
+  }
+
+  void Visit(const MappedFields& fields) {
+    for (const auto& field : fields.fields()) {
+      Visit(field);
+    }
+  }
+
+  void Visit(const NameMapping& name_mapping) { Visit(name_mapping.AsMappedFields()); }
+};
+
+// Visitor that recursively collects every MappedField, keyed by its dotted full name.
+struct IndexByNameVisitor {
+  std::unordered_map<std::string, MappedFieldConstRef> field_by_name;
+
+  void Visit(const MappedField& field) {
+    for (const auto& name : field.names) {
+      field_by_name.emplace(name, std::cref(field));
+    }
+
+    if (field.nested_mapping != nullptr) {
+      IndexByNameVisitor nested_visitor;  // index the children relative to this field
+      nested_visitor.Visit(*field.nested_mapping);
+
+      for (const auto& [name, mapped_field] : nested_visitor.field_by_name) {
+        for (const auto& prefix : field.names) {  // one "<alias>.<child>" key per alias
+          std::vector<std::string> parts = {prefix, name};
+          field_by_name.emplace(JoinByDot(parts), std::cref(mapped_field));
+        }
+      }
+    }
+  }
+
+  void Visit(const MappedFields& fields) {
+    for (const auto& field : fields.fields()) {
+      Visit(field);
+    }
+  }
+
+  void Visit(const NameMapping& name_mapping) { Visit(name_mapping.AsMappedFields()); }
+};
+
+}  // namespace
+
+MappedFields::MappedFields(std::vector<MappedField> fields)
+    : fields_(std::move(fields)) {}
+
+std::unique_ptr<MappedFields> MappedFields::Make(std::vector<MappedField> fields) {
+  return std::unique_ptr<MappedFields>(new MappedFields(std::move(fields)));
+}
+
+std::optional<MappedFieldConstRef> MappedFields::Field(int32_t id) const {
+  const auto& id_to_field = LazyIdToField();
+  if (auto it = id_to_field.find(id); it != id_to_field.cend()) {
+    return it->second;
+  }
+  return std::nullopt;
+}
+
+std::optional<int32_t> MappedFields::Id(std::string_view name) const {
+  const auto& name_to_id = LazyNameToId();
+  if (auto it = name_to_id.find(name); it != name_to_id.cend()) {
+    return it->second;
+  }
+  return std::nullopt;
+}
+
+size_t MappedFields::Size() const { return fields_.size(); }
+
+std::span<const MappedField> MappedFields::fields() const { return fields_; }
+
+const std::unordered_map<std::string_view, int32_t>& MappedFields::LazyNameToId() const {
+  if (name_to_id_.empty() && !fields_.empty()) {  // build the cache on first use
+    for (const auto& field : fields_) {
+      for (const auto& name : field.names) {
+        name_to_id_.emplace(name, field.field_id);  // key views a string in fields_
+      }
+    }
+  }
+  return name_to_id_;
+}
+
+const std::unordered_map<int32_t, MappedFieldConstRef>& MappedFields::LazyIdToField()
+    const {
+  if (id_to_field_.empty() && !fields_.empty()) {  // build the cache on first use
+    for (const auto& field : fields_) {
+      id_to_field_.emplace(field.field_id, std::cref(field));
+    }
+  }
+  return id_to_field_;
+}
+
+NameMapping::NameMapping(std::unique_ptr<MappedFields> mapping)
+    : mapping_(std::move(mapping)) {}
+
+std::optional<MappedFieldConstRef> NameMapping::Find(int32_t id) {
+  const auto& fields_by_id = LazyFieldsById();
+  if (auto iter = fields_by_id.find(id); iter != fields_by_id.cend()) {
+    return iter->second;
+  }
+  return std::nullopt;
+}
+
+std::optional<MappedFieldConstRef> NameMapping::Find(std::span<const std::string> names) {
+  if (names.empty()) {  // no path components, nothing to look up
+    return std::nullopt;
+  }
+  return Find(JoinByDot(names));
+}
+
+std::optional<MappedFieldConstRef> NameMapping::Find(const std::string& name) {
+  const auto& fields_by_name = LazyFieldsByName();
+  if (auto iter = fields_by_name.find(name); iter != fields_by_name.cend()) {
+    return iter->second;
+  }
+  return std::nullopt;
+}
+
+const MappedFields& NameMapping::AsMappedFields() const {
+  if (mapping_ == nullptr) {  // fall back to a shared, empty MappedFields
+    const static std::unique_ptr<MappedFields> kEmptyFields = MappedFields::Make({});
+    return *kEmptyFields;
+  }
+  return *mapping_;
+}
+
+const std::unordered_map<int32_t, MappedFieldConstRef>& NameMapping::LazyFieldsById()
+    const {
+  if (fields_by_id_.empty()) {  // build the index on first use
+    IndexByIdVisitor visitor;
+    visitor.Visit(AsMappedFields());
+    fields_by_id_ = std::move(visitor.field_by_id);
+  }
+  return fields_by_id_;
+}
+
+const std::unordered_map<std::string, MappedFieldConstRef>&
+NameMapping::LazyFieldsByName() const {
+  if (fields_by_name_.empty()) {  // build the index on first use
+    IndexByNameVisitor visitor;
+    visitor.Visit(AsMappedFields());
+    fields_by_name_ = std::move(visitor.field_by_name);
+  }
+  return fields_by_name_;
+}
+
+std::unique_ptr<NameMapping> NameMapping::MakeEmpty() {
+  return std::unique_ptr<NameMapping>(new NameMapping(MappedFields::Make({})));
+}
+
+std::unique_ptr<NameMapping> NameMapping::Make(std::unique_ptr<MappedFields> fields) {
+  return std::unique_ptr<NameMapping>(new NameMapping(std::move(fields)));
+}
+
+std::unique_ptr<NameMapping> NameMapping::Make(std::vector<MappedField> fields) {
+  return Make(MappedFields::Make(std::move(fields)));
+}
+
+bool operator==(const MappedField& lhs, const MappedField& rhs) {
+  if (lhs.field_id != rhs.field_id) {
+    return false;
+  }
+  if (lhs.names != rhs.names) {
+    return false;
+  }
+  if (lhs.nested_mapping == nullptr && rhs.nested_mapping == nullptr) {
+    return true;
+  }
+  if (lhs.nested_mapping == nullptr || rhs.nested_mapping == nullptr) {
+    return false;
+  }
+  return *lhs.nested_mapping == *rhs.nested_mapping;  // deep comparison of children
+}
+
+bool operator==(const MappedFields& lhs, const MappedFields& rhs) {
+  if (lhs.Size() != rhs.Size()) {
+    return false;
+  }
+  auto lhs_fields = lhs.fields();
+  auto rhs_fields = rhs.fields();
+  for (size_t i = 0; i < lhs.Size(); ++i) {  // order-sensitive, element by element
+    if (lhs_fields[i] != rhs_fields[i]) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool operator==(const NameMapping& lhs, const NameMapping& rhs) {
+  return lhs.AsMappedFields() == rhs.AsMappedFields();
+}
+
+std::string ToString(const MappedField& field) {
+  return std::format(
+      "({} -> {}{})", field.names, field.field_id,
+      field.nested_mapping ? std::format(", {}", ToString(*field.nested_mapping)) : "");
+}
+
+std::string ToString(const MappedFields& fields) {
+  return std::format("{}", fields.fields());
+}
+
+std::string ToString(const NameMapping& name_mapping) {
+  const auto& fields = name_mapping.AsMappedFields();
+  if (fields.Size() == 0) {
+    return "[]";
+  }
+  std::string repr = "[\n";
+  for (const auto& field : fields.fields()) {  // one top-level field per output line
+    std::format_to(std::back_inserter(repr), " {}\n", ToString(field));
+  }
+  repr += "]";
+  return repr;
+}
+
+}  // namespace iceberg
diff --git a/src/iceberg/name_mapping.h b/src/iceberg/name_mapping.h
new file mode 100644
index 000000000..c6f389cd6
--- /dev/null
+++ b/src/iceberg/name_mapping.h
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <optional>
+#include <span>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "iceberg/iceberg_export.h"
+
+namespace iceberg {
+
+/// \brief An immutable mapping between a field ID and a set of names.
+///
+/// This class is trivial enough that we don't need any function.
+struct ICEBERG_EXPORT MappedField {
+  /// \brief A required list of 0 or more names for a field.
+  std::unordered_set<std::string> names;
+  /// \brief An optional Iceberg field ID used when a field's name is present in `names`.
+  /// TODO(gangwu): check if we need to make it optional
+  int32_t field_id;
+  /// \brief An optional list of field mappings for child field of structs, maps, and
+  /// lists.
+  std::unique_ptr<class MappedFields> nested_mapping;
+
+  friend bool operator==(const MappedField& lhs, const MappedField& rhs);
+};
+
+using MappedFieldConstRef = std::reference_wrapper<const MappedField>;
+
+/// \brief A list of field mappings for child field of structs, maps, and lists.
+class ICEBERG_EXPORT MappedFields {
+ public:
+  /// \brief Create a new MappedFields instance.
+  /// \param[in] fields The list of field mappings.
+  /// \return A new MappedFields instance.
+  static std::unique_ptr<MappedFields> Make(std::vector<MappedField> fields);
+
+  /// \brief Get the field for a given field ID (direct children only, not nested).
+  /// \param[in] id The ID of the field.
+  /// \return The field for the given field ID, or std::nullopt if there is none.
+  std::optional<MappedFieldConstRef> Field(int32_t id) const;
+
+  /// \brief Get the field ID for a given field name (direct children only).
+  /// \param[in] name The name of the field.
+  /// \return The field ID of the field, or std::nullopt if the name is unknown.
+  std::optional<int32_t> Id(std::string_view name) const;
+
+  /// \brief Get the number of field mappings.
+  size_t Size() const;
+
+  /// \brief Get the list of field mappings.
+  std::span<const MappedField> fields() const;
+
+  friend bool operator==(const MappedFields& lhs, const MappedFields& rhs);
+
+  MappedFields(const MappedFields& other) = delete;
+  MappedFields& operator=(const MappedFields& other) = delete;
+
+  MappedFields(MappedFields&& other) noexcept = default;
+  MappedFields& operator=(MappedFields&& other) noexcept = default;
+
+ private:
+  explicit MappedFields(std::vector<MappedField> fields);
+
+  const std::unordered_map<std::string_view, int32_t>& LazyNameToId() const;
+  const std::unordered_map<int32_t, MappedFieldConstRef>& LazyIdToField() const;
+
+ private:
+  std::vector<MappedField> fields_;
+
+  // Lazy-initialized mappings; keys and values refer to elements owned by fields_
+  mutable std::unordered_map<std::string_view, int32_t> name_to_id_;
+  mutable std::unordered_map<int32_t, MappedFieldConstRef> id_to_field_;
+};
+
+/// \brief Represents a mapping from external schema names to Iceberg type IDs.
+class ICEBERG_EXPORT NameMapping {
+ public:
+  /// \brief Create a new NameMapping instance from an existing MappedFields.
+  static std::unique_ptr<NameMapping> Make(std::unique_ptr<MappedFields> fields);
+
+  /// \brief Create a new NameMapping instance from a list of field mappings.
+  static std::unique_ptr<NameMapping> Make(std::vector<MappedField> fields);
+
+  /// \brief Create an empty NameMapping instance.
+  static std::unique_ptr<NameMapping> MakeEmpty();
+
+  /// \brief Find a field (at any nesting level) by its ID.
+  std::optional<MappedFieldConstRef> Find(int32_t id);
+
+  /// \brief Find a field by its unconcatenated names.
+  std::optional<MappedFieldConstRef> Find(std::span<const std::string> names);
+
+  /// \brief Find a field by its (concatenated) name.
+  std::optional<MappedFieldConstRef> Find(const std::string& name);
+
+  /// \brief Get the underlying MappedFields instance.
+  const MappedFields& AsMappedFields() const;
+
+  friend bool operator==(const NameMapping& lhs, const NameMapping& rhs);
+
+ private:
+  explicit NameMapping(std::unique_ptr<MappedFields> mapping);
+
+  const std::unordered_map<int32_t, MappedFieldConstRef>& LazyFieldsById() const;
+  const std::unordered_map<std::string, MappedFieldConstRef>& LazyFieldsByName() const;
+
+ private:
+  std::unique_ptr<MappedFields> mapping_;
+
+  // Lazy-initialized mappings; filled without synchronization on first lookup
+  mutable std::unordered_map<int32_t, MappedFieldConstRef> fields_by_id_;
+  mutable std::unordered_map<std::string, MappedFieldConstRef> fields_by_name_;
+};
+
+ICEBERG_EXPORT std::string ToString(const MappedField& field);
+ICEBERG_EXPORT std::string ToString(const MappedFields& fields);
+ICEBERG_EXPORT std::string ToString(const NameMapping& mapping);
+
+}  // namespace iceberg
diff --git a/src/iceberg/util/formatter_internal.h b/src/iceberg/util/formatter_internal.h
index f802bbecd..6601e1c99 100644
--- a/src/iceberg/util/formatter_internal.h
+++ b/src/iceberg/util/formatter_internal.h
@@ -26,6 +26,7 @@
 #include <string>
 #include <string_view>
 #include <unordered_map>
+#include <unordered_set>
 #include <vector>
 
 #include "iceberg/util/formatter.h"
@@ -70,7 +71,7 @@ std::string FormatRange(const Range& range, std::string_view separator,
     if (!first) {
       ss << separator;
     }
-    ss << element;
+    ss << std::format("{}", element);
     first = false;
   }
 
@@ -119,3 +120,29 @@ struct std::formatter<std::vector<T>> : std::formatter<std::string_view> {
         FormatRange(formatted_range, ", ", "[", "]"), ctx);
   }
 };
+
+/// \brief std::formatter specialization for std::span
+template <typename T, size_t Extent>
+struct std::formatter<std::span<T, Extent>> : std::formatter<std::string_view> {
+  template <typename FormatContext>
+  auto format(const std::span<T, Extent>& span, FormatContext& ctx) const {
+    auto formatted_range =
+        span | std::views::transform([](const auto& item) { return FormatItem(item); });
+    return std::formatter<std::string_view>::format(
+        FormatRange(formatted_range, ", ", "[", "]"), ctx);
+  }
+};
+
+/// \brief std::formatter specialization for std::unordered_set
+template <typename T, typename Hash, typename KeyEqual, typename Allocator>
+struct std::formatter<std::unordered_set<T, Hash, KeyEqual, Allocator>>
+    : std::formatter<std::string_view> {
+  template <typename FormatContext>
+  auto format(const std::unordered_set<T, Hash, KeyEqual, Allocator>& set,
+              FormatContext& ctx) const {
+    auto formatted_range =
+        set | std::views::transform([](const auto& item) { return FormatItem(item); });
+    return std::formatter<std::string_view>::format(
+        FormatRange(formatted_range, ", ", "[", "]"), ctx);
+  }
+};
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index bf8caa0b0..6454d389d 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -30,7 +30,8 @@ configure_file("${CMAKE_SOURCE_DIR}/test/test_config.h.in"
 
 add_executable(schema_test)
 target_sources(schema_test
-               PRIVATE schema_test.cc
+               PRIVATE name_mapping_test.cc
+                       schema_test.cc
                        schema_field_test.cc
                        type_test.cc
                        transform_test.cc
diff --git a/test/name_mapping_test.cc b/test/name_mapping_test.cc
new file mode 100644
index 000000000..d9d054823
--- /dev/null
+++ b/test/name_mapping_test.cc
@@ -0,0 +1,205 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */ + +#include "iceberg/name_mapping.h" + +#include +#include +#include +#include + +#include +#include +namespace iceberg { + +class NameMappingTest : public ::testing::Test { + protected: + std::unique_ptr MakeNameMapping() { + std::vector fields; + fields.emplace_back(MappedField{.names = {"foo", "bar"}, .field_id = 1}); + fields.emplace_back(MappedField{.names = {"baz"}, .field_id = 2}); + + std::vector nested_fields; + nested_fields.emplace_back(MappedField{.names = {"hello"}, .field_id = 4}); + nested_fields.emplace_back(MappedField{.names = {"world"}, .field_id = 5}); + auto nested_mapping = MappedFields::Make(std::move(nested_fields)); + fields.emplace_back(MappedField{ + .names = {"qux"}, .field_id = 3, .nested_mapping = std::move(nested_mapping)}); + + return NameMapping::Make(std::move(fields)); + } +}; + +TEST_F(NameMappingTest, FindById) { + auto mapping = MakeNameMapping(); + + struct Param { + int32_t field_id; + bool should_have_value; + std::unordered_set names; + }; + + const std::vector params = { + {.field_id = 1, .should_have_value = true, .names = {"foo", "bar"}}, + {.field_id = 2, .should_have_value = true, .names = {"baz"}}, + {.field_id = 3, .should_have_value = true, .names = {"qux"}}, + {.field_id = 4, .should_have_value = true, .names = {"hello"}}, + {.field_id = 5, .should_have_value = true, .names = {"world"}}, + {.field_id = 999, .should_have_value = false, .names = {}}, + }; + + for (const auto& param : params) { + auto field = mapping->Find(param.field_id); + if (param.should_have_value) { + ASSERT_TRUE(field.has_value()); + EXPECT_EQ(field->get().field_id, param.field_id); + EXPECT_THAT(field->get().names, testing::UnorderedElementsAreArray(param.names)); + } else { + EXPECT_FALSE(field.has_value()); + } + } +} + +TEST_F(NameMappingTest, FindByName) { + auto mapping = MakeNameMapping(); + + struct Param { + std::string name; + bool should_have_value; + int32_t field_id; + }; + + const std::vector params = { + {.name = "foo", 
.should_have_value = true, .field_id = 1}, + {.name = "bar", .should_have_value = true, .field_id = 1}, + {.name = "baz", .should_have_value = true, .field_id = 2}, + {.name = "qux", .should_have_value = true, .field_id = 3}, + {.name = "qux.hello", .should_have_value = true, .field_id = 4}, + {.name = "qux.world", .should_have_value = true, .field_id = 5}, + {.name = "non_existent", .should_have_value = false, .field_id = -1}, + }; + + for (const auto& param : params) { + auto field = mapping->Find(param.name); + if (param.should_have_value) { + ASSERT_TRUE(field.has_value()); + EXPECT_EQ(field->get().field_id, param.field_id); + } else { + EXPECT_FALSE(field.has_value()); + } + } +} + +TEST_F(NameMappingTest, FindByNameParts) { + auto mapping = MakeNameMapping(); + + struct Param { + std::vector names; + bool should_have_value; + int32_t field_id; + }; + + std::vector params = { + {.names = {"foo"}, .should_have_value = true, .field_id = 1}, + {.names = {"bar"}, .should_have_value = true, .field_id = 1}, + {.names = {"baz"}, .should_have_value = true, .field_id = 2}, + {.names = {"qux"}, .should_have_value = true, .field_id = 3}, + {.names = {"qux", "hello"}, .should_have_value = true, .field_id = 4}, + {.names = {"qux", "world"}, .should_have_value = true, .field_id = 5}, + {.names = {"non_existent"}, .should_have_value = false, .field_id = -1}, + }; + + for (const auto& param : params) { + auto field = mapping->Find(param.names); + if (param.should_have_value) { + ASSERT_TRUE(field.has_value()); + EXPECT_EQ(field->get().field_id, param.field_id); + } else { + EXPECT_FALSE(field.has_value()); + } + } +} + +TEST_F(NameMappingTest, Equality) { + auto mapping1 = MakeNameMapping(); + auto mapping2 = MakeNameMapping(); + auto empty_mapping = NameMapping::MakeEmpty(); + + EXPECT_EQ(*mapping1, *mapping2); + EXPECT_NE(*mapping1, *empty_mapping); + + std::vector fields; + fields.emplace_back( + MappedField{.names = {"different"}, .field_id = 99, .nested_mapping = 
nullptr}); + auto different_mapping = NameMapping::Make(MappedFields::Make(std::move(fields))); + + EXPECT_NE(*mapping1, *different_mapping); +} + +TEST_F(NameMappingTest, MappedFieldsAccess) { + auto mapping = MakeNameMapping(); + const auto& fields = mapping->AsMappedFields(); + EXPECT_EQ(fields.Size(), 3); + + struct Param { + int32_t field_id; + std::unordered_set names; + }; + + const std::vector params = { + {.field_id = 1, .names = {"foo", "bar"}}, + {.field_id = 2, .names = {"baz"}}, + {.field_id = 3, .names = {"qux"}}, + }; + + for (const auto& param : params) { + auto field = fields.Field(param.field_id); + ASSERT_TRUE(field.has_value()); + EXPECT_THAT(field->get().names, testing::UnorderedElementsAreArray(param.names)); + } +} + +TEST_F(NameMappingTest, ToString) { + { + std::vector fields; + fields.emplace_back(MappedField{.names = {"foo"}, .field_id = 1}); + + std::vector nested_fields; + nested_fields.emplace_back(MappedField{.names = {"hello"}, .field_id = 3}); + nested_fields.emplace_back(MappedField{.names = {"world"}, .field_id = 4}); + auto nested_mapping = MappedFields::Make(std::move(nested_fields)); + fields.emplace_back(MappedField{ + .names = {"bar"}, .field_id = 2, .nested_mapping = std::move(nested_mapping)}); + + auto mapping = NameMapping::Make(std::move(fields)); + + auto expected = R"([ + ([foo] -> 1) + ([bar] -> 2, [([hello] -> 3), ([world] -> 4)]) +])"; + EXPECT_EQ(ToString(*mapping), expected); + } + + { + auto empty_mapping = NameMapping::MakeEmpty(); + EXPECT_EQ(ToString(*empty_mapping), "[]"); + } +} + +} // namespace iceberg