diff --git a/src/iceberg/json_internal.cc b/src/iceberg/json_internal.cc index 2e254e5d3..e19f5b4e4 100644 --- a/src/iceberg/json_internal.cc +++ b/src/iceberg/json_internal.cc @@ -1209,4 +1209,20 @@ Result> TableMetadataFromJson(const nlohmann::jso return table_metadata; } +Result FromJsonString(const std::string& json_string) { + try { + return nlohmann::json::parse(json_string); + } catch (const std::exception& e) { + return JsonParseError("Failed to parse JSON string: {}", e.what()); + } +} + +Result ToJsonString(const nlohmann::json& json) { + try { + return json.dump(); + } catch (const std::exception& e) { + return JsonParseError("Failed to serialize to JSON string: {}", e.what()); + } +} + } // namespace iceberg diff --git a/src/iceberg/json_internal.h b/src/iceberg/json_internal.h index 33f47b3da..ce7afd010 100644 --- a/src/iceberg/json_internal.h +++ b/src/iceberg/json_internal.h @@ -244,4 +244,16 @@ nlohmann::json ToJson(const TableMetadata& table_metadata); /// \return A `TableMetadata` object or an error if the conversion fails. Result> TableMetadataFromJson(const nlohmann::json& json); +/// \brief Deserialize a JSON string into a `nlohmann::json` object. +/// +/// \param json_string The JSON string to deserialize. +/// \return A `nlohmann::json` object or an error if the deserialization fails. +Result FromJsonString(const std::string& json_string); + +/// \brief Serialize a `nlohmann::json` object into a JSON string. +/// +/// \param json The `nlohmann::json` object to serialize. +/// \return A JSON string or an error if the serialization fails. +Result ToJsonString(const nlohmann::json& json); + } // namespace iceberg diff --git a/src/iceberg/table_metadata.cc b/src/iceberg/table_metadata.cc index 5640939c2..c1ada81d9 100644 --- a/src/iceberg/table_metadata.cc +++ b/src/iceberg/table_metadata.cc @@ -23,10 +23,17 @@ #include #include +#include + +#include "iceberg/file_io.h" +#include "iceberg/json_internal.h" #include "iceberg/partition_spec.h" #include "iceberg/result.h" #include "iceberg/schema.h" +#include "iceberg/snapshot.h" #include "iceberg/sort_order.h" +#include "iceberg/util/macros.h" + namespace iceberg { std::string ToString(const SnapshotLogEntry& entry) { @@ -69,4 +76,100 @@ Result> TableMetadata::SortOrder() const { return *iter; } +namespace { + +template +bool SharedPtrVectorEquals(const std::vector>& lhs, + const std::vector>& rhs) { + if (lhs.size() != rhs.size()) { + return false; + } + for (size_t i = 0; i < lhs.size(); ++i) { + if (*lhs[i] != *rhs[i]) { + return false; + } + } + return true; +} + +bool SnapshotRefEquals( + const std::unordered_map>& lhs, + const std::unordered_map>& rhs) { + if (lhs.size() != rhs.size()) { + return false; + } + for (const auto& [key, value] : lhs) { + auto iter = rhs.find(key); + if (iter == rhs.end()) { + return false; + } + if (*iter->second != *value) { + return false; + } + } + return true; +} + +} // namespace + +bool operator==(const TableMetadata& lhs, const TableMetadata& rhs) { + return lhs.format_version == rhs.format_version && lhs.table_uuid == rhs.table_uuid && + lhs.location == rhs.location && + lhs.last_sequence_number == rhs.last_sequence_number && + lhs.last_updated_ms == rhs.last_updated_ms && + lhs.last_column_id == rhs.last_column_id && + lhs.current_schema_id == rhs.current_schema_id && + SharedPtrVectorEquals(lhs.schemas, rhs.schemas) && + lhs.default_spec_id == rhs.default_spec_id && + lhs.last_partition_id == rhs.last_partition_id && + lhs.properties == rhs.properties && + lhs.current_snapshot_id == rhs.current_snapshot_id && + SharedPtrVectorEquals(lhs.snapshots, rhs.snapshots) && + lhs.snapshot_log == rhs.snapshot_log && lhs.metadata_log == rhs.metadata_log && + SharedPtrVectorEquals(lhs.sort_orders, rhs.sort_orders) && + lhs.default_sort_order_id == rhs.default_sort_order_id && + SnapshotRefEquals(lhs.refs, rhs.refs) && + SharedPtrVectorEquals(lhs.statistics, rhs.statistics) && + SharedPtrVectorEquals(lhs.partition_statistics, rhs.partition_statistics) && + lhs.next_row_id == rhs.next_row_id; +} + +Result TableMetadataUtil::CodecFromFileName( + std::string_view file_name) { + if (file_name.find(".metadata.json") == std::string::npos) { + return InvalidArgument("{} is not a valid metadata file", file_name); + } + + // We have to be backward-compatible with .metadata.json.gz files + if (file_name.ends_with(".metadata.json.gz")) { + return MetadataFileCodecType::kGzip; + } + + std::string_view file_name_without_suffix = + file_name.substr(0, file_name.find_last_of(".metadata.json")); + if (file_name_without_suffix.ends_with(".gz")) { + return MetadataFileCodecType::kGzip; + } + return MetadataFileCodecType::kNone; +} + +Result> TableMetadataUtil::Read( + FileIO& io, const std::string& location, std::optional length) { + ICEBERG_ASSIGN_OR_RAISE(auto codec_type, CodecFromFileName(location)); + if (codec_type == MetadataFileCodecType::kGzip) { + return NotImplemented("Reading gzip-compressed metadata files is not supported yet"); + } + + ICEBERG_ASSIGN_OR_RAISE(auto content, io.ReadFile(location, length)); + ICEBERG_ASSIGN_OR_RAISE(auto json, FromJsonString(content)); + return TableMetadataFromJson(json); +} + +Status TableMetadataUtil::Write(FileIO& io, const std::string& location, + const TableMetadata& metadata) { + auto json = ToJson(metadata); + ICEBERG_ASSIGN_OR_RAISE(auto json_string, ToJsonString(json)); + return io.WriteFile(location, json_string); +} + } // namespace iceberg diff --git a/src/iceberg/table_metadata.h b/src/iceberg/table_metadata.h index e993e275d..12483140d 100644 --- a/src/iceberg/table_metadata.h +++ b/src/iceberg/table_metadata.h @@ -24,6 +24,7 @@ #include #include +#include #include #include @@ -132,6 +133,12 @@ struct ICEBERG_EXPORT TableMetadata { Result> PartitionSpec() const; /// \brief Get the current sort order, return NotFoundError if not found Result> SortOrder() const; + + friend bool operator==(const TableMetadata& lhs, const TableMetadata& rhs); + + friend bool operator!=(const TableMetadata& lhs, const TableMetadata& rhs) { + return !(lhs == rhs); + } }; /// \brief Returns a string representation of a SnapshotLogEntry @@ -140,4 +147,37 @@ ICEBERG_EXPORT std::string ToString(const SnapshotLogEntry& entry); /// \brief Returns a string representation of a MetadataLogEntry ICEBERG_EXPORT std::string ToString(const MetadataLogEntry& entry); +/// \brief The codec type of the table metadata file. +ICEBERG_EXPORT enum class MetadataFileCodecType { + kNone, + kGzip, +}; + +/// \brief Utility class for table metadata +struct ICEBERG_EXPORT TableMetadataUtil { + /// \brief Get the codec type from the table metadata file name. + /// + /// \param file_name The name of the table metadata file. + /// \return The codec type of the table metadata file. + static Result CodecFromFileName(std::string_view file_name); + + /// \brief Read the table metadata file. + /// + /// \param io The file IO to use to read the table metadata. + /// \param location The location of the table metadata file. + /// \param length The optional length of the table metadata file. + /// \return The table metadata. + static Result> Read( + class FileIO& io, const std::string& location, + std::optional length = std::nullopt); + + /// \brief Write the table metadata to a file. + /// + /// \param io The file IO to use to write the table metadata. + /// \param location The location of the table metadata file. + /// \param metadata The table metadata to write. + static Status Write(FileIO& io, const std::string& location, + const TableMetadata& metadata); +}; + } // namespace iceberg diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c31c6e0e8..7a1293745 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -69,7 +69,8 @@ if(ICEBERG_BUILD_BUNDLE) add_test(NAME avro_test COMMAND avro_test) add_executable(arrow_test) - target_sources(arrow_test PRIVATE arrow_test.cc arrow_fs_file_io_test.cc) + target_sources(arrow_test PRIVATE arrow_test.cc arrow_fs_file_io_test.cc + metadata_io_test.cc) target_link_libraries(arrow_test PRIVATE iceberg_bundle_static GTest::gtest_main GTest::gmock) add_test(NAME arrow_test COMMAND arrow_test) diff --git a/test/arrow_fs_file_io_test.cc b/test/arrow_fs_file_io_test.cc index 193c13df3..ab382b39e 100644 --- a/test/arrow_fs_file_io_test.cc +++ b/test/arrow_fs_file_io_test.cc @@ -19,45 +19,48 @@ #include "iceberg/arrow/arrow_fs_file_io.h" -#include - #include #include #include "matchers.h" +#include "temp_file_test_base.h" namespace iceberg { -class LocalFileIOTest : public testing::Test { +class LocalFileIOTest : public TempFileTestBase { protected: void SetUp() override { - local_fs_ = std::make_shared<::arrow::fs::LocalFileSystem>(); - file_io_ = std::make_shared(local_fs_); + TempFileTestBase::SetUp(); + file_io_ = std::make_shared( + std::make_shared<::arrow::fs::LocalFileSystem>()); + temp_filepath_ = CreateNewTempFilePath(); } - std::shared_ptr<::arrow::fs::LocalFileSystem> local_fs_; std::shared_ptr file_io_; - std::filesystem::path tmpfile = std::filesystem::temp_directory_path() / "123.txt"; + std::string temp_filepath_; }; TEST_F(LocalFileIOTest, ReadWriteFile) { - auto read_res = file_io_->ReadFile(tmpfile.string(), std::nullopt); + auto read_res = file_io_->ReadFile(temp_filepath_, std::nullopt); EXPECT_THAT(read_res, IsError(ErrorKind::kIOError)); EXPECT_THAT(read_res, HasErrorMessage("Failed to open local file")); - auto write_res = file_io_->WriteFile(tmpfile.string(), "hello world"); + auto write_res = file_io_->WriteFile(temp_filepath_, "hello world"); EXPECT_THAT(write_res, IsOk()); - read_res = file_io_->ReadFile(tmpfile.string(), std::nullopt); + read_res = file_io_->ReadFile(temp_filepath_, std::nullopt); EXPECT_THAT(read_res, IsOk()); EXPECT_THAT(read_res, HasValue(::testing::Eq("hello world"))); } TEST_F(LocalFileIOTest, DeleteFile) { - auto del_res = file_io_->DeleteFile(tmpfile.string()); + auto write_res = file_io_->WriteFile(temp_filepath_, "hello world"); + EXPECT_THAT(write_res, IsOk()); + + auto del_res = file_io_->DeleteFile(temp_filepath_); EXPECT_THAT(del_res, IsOk()); - del_res = file_io_->DeleteFile(tmpfile.string()); + del_res = file_io_->DeleteFile(temp_filepath_); EXPECT_THAT(del_res, IsError(ErrorKind::kIOError)); EXPECT_THAT(del_res, HasErrorMessage("Cannot delete file")); } diff --git a/test/metadata_io_test.cc b/test/metadata_io_test.cc new file mode 100644 index 000000000..cc98278b0 --- /dev/null +++ b/test/metadata_io_test.cc @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include + +#include "iceberg/arrow/arrow_fs_file_io.h" +#include "iceberg/file_io.h" +#include "iceberg/schema.h" +#include "iceberg/snapshot.h" +#include "iceberg/table_metadata.h" +#include "matchers.h" +#include "temp_file_test_base.h" + +namespace iceberg { + +class MetadataIOTest : public TempFileTestBase { + protected: + void SetUp() override { + TempFileTestBase::SetUp(); + io_ = std::make_shared( + std::make_shared<::arrow::fs::LocalFileSystem>()); + temp_filepath_ = CreateNewTempFilePathWithSuffix(".metadata.json"); + } + + std::shared_ptr io_; + std::string temp_filepath_; +}; + +TEST_F(MetadataIOTest, ReadWriteMetadata) { + std::vector schema_fields; + schema_fields.emplace_back(/*field_id=*/1, "x", std::make_shared(), + /*optional=*/false); + auto schema = std::make_shared(std::move(schema_fields), /*schema_id=*/1); + + TableMetadata metadata{.format_version = 1, + .table_uuid = "1234567890", + .location = "s3://bucket/path", + .last_sequence_number = 0, + .schemas = {schema}, + .current_schema_id = 1, + .default_spec_id = 0, + .last_partition_id = 0, + .properties = {{"key", "value"}}, + .current_snapshot_id = 3051729675574597004, + .snapshots = {std::make_shared(Snapshot{ + .snapshot_id = 3051729675574597004, + .sequence_number = 0, + .timestamp_ms = TimePointMsFromUnixMs(1515100955770).value(), + .manifest_list = "s3://a/b/1.avro", + .summary = {{"operation", "append"}}, + })}, + .default_sort_order_id = 0, + .next_row_id = 0}; + + EXPECT_THAT(TableMetadataUtil::Write(*io_, temp_filepath_, metadata), IsOk()); + + auto result = TableMetadataUtil::Read(*io_, temp_filepath_); + EXPECT_THAT(result, IsOk()); + + auto metadata_read = std::move(result.value()); + EXPECT_EQ(*metadata_read, metadata); +} + +} // namespace iceberg diff --git a/test/temp_file_test_base.h b/test/temp_file_test_base.h new file mode 100644 index 000000000..8e20e2ca5 --- /dev/null +++ b/test/temp_file_test_base.h @@ -0,0 +1,181 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace iceberg { + +/// A base class for tests that need to create and manage temporary files. +/// Provides utilities for creating platform-independent temporary files +/// and ensures proper cleanup after tests run. +/// +/// Usage Example: +/// ``` +/// class MyTest : public test::TempFileTestBase { +/// protected: +/// void SetUp() override { +/// // Always call base class SetUp first +/// TempFileTestBase::SetUp(); +/// +/// // Create test resources +/// my_temp_file_ = CreateNewTempFilePath(); +/// +/// // Additional setup... +/// } +/// +/// // Your test-specific members +/// std::string my_temp_file_; +/// }; +/// +/// TEST_F(MyTest, ExampleTest) { +/// // Use temporary files in your test +/// WriteContentToFile(my_temp_file_, "test content"); +/// +/// // Files will be automatically cleaned up in TearDown +/// } +/// ``` +/// +/// Notes: +/// - Always call TempFileTestBase::SetUp() in your derived class's SetUp() +/// - All files created using the provided methods will be automatically cleaned up +/// - You don't need to implement TearDown() unless you need additional cleanup +class TempFileTestBase : public ::testing::Test { + protected: + void SetUp() override { + // Base class setup is empty, but provided for derived classes to call + } + + void TearDown() override { + // Clean up all temporary files and directories created during the test + for (const auto& path : created_temp_files_) { + std::error_code ec; + if (std::filesystem::is_directory(path, ec)) { + std::filesystem::remove_all(path, ec); + } else { + std::filesystem::remove(path, ec); + } + } + } + + /// \brief Generates a unique temporary filepath that works across platforms + std::string GenerateUniqueTempFilePath() const { + std::filesystem::path temp_dir = std::filesystem::temp_directory_path(); + std::string file_name = + std::format("iceberg_test_{}_{}.tmp", TestInfo(), GenerateRandomString(8)); + return (temp_dir / file_name).string(); + } + + /// \brief Create a temporary filepath with the specified suffix/extension + std::string GenerateUniqueTempFilePathWithSuffix(const std::string& suffix) { + std::filesystem::path temp_dir = std::filesystem::temp_directory_path(); + std::string file_name = + std::format("iceberg_test_{}_{}{}", TestInfo(), GenerateRandomString(8), suffix); + return (temp_dir / file_name).string(); + } + + /// \brief Helper to generate a random alphanumeric string for unique filenames + std::string GenerateRandomString(size_t length) const { + const std::string_view chars = + "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution<> dist(0, static_cast(chars.size() - 1)); + + std::string result; + result.reserve(length); + for (size_t i = 0; i < length; ++i) { + result += chars[dist(gen)]; + } + return result; + } + + /// \brief Get the test name for inclusion in the filename + std::string TestInfo() const { + if (const auto info = ::testing::UnitTest::GetInstance()->current_test_info(); info) { + return std::format("{}_{}", info->test_suite_name(), info->name()); + } + return "unknown_test"; + } + + /// \brief Creates a new temporary filepath and registers it for cleanup + std::string CreateNewTempFilePath() { + std::string filepath = GenerateUniqueTempFilePath(); + created_temp_files_.push_back(filepath); + return filepath; + } + + /// \brief Create a temporary filepath with the specified suffix and registers it for + /// cleanup + std::string CreateNewTempFilePathWithSuffix(const std::string& suffix) { + std::string filepath = GenerateUniqueTempFilePathWithSuffix(suffix); + created_temp_files_.push_back(filepath); + return filepath; + } + + /// \brief Creates a temporary directory and registers it for cleanup + std::string CreateTempDirectory() { + std::filesystem::path temp_dir = std::filesystem::temp_directory_path(); + std::string dir_name = + std::format("iceberg_test_dir_{}_{}", TestInfo(), GenerateRandomString(8)); + std::filesystem::path dir_path = temp_dir / dir_name; + + std::error_code ec; + std::filesystem::create_directory(dir_path, ec); + if (ec) { + throw std::runtime_error( + std::format("Failed to create temporary directory: {}", ec.message())); + } + + created_temp_files_.push_back(dir_path.string()); + return dir_path.string(); + } + + /// \brief Creates a file with the given content at the specified path + void WriteContentToFile(const std::string& path, const std::string& content) { + std::ofstream file(path, std::ios::binary); + if (!file) { + throw std::runtime_error(std::format("Failed to open file for writing: {}", path)); + } + file.write(content.data(), content.size()); + if (!file) { + throw std::runtime_error(std::format("Failed to write to file: {}", path)); + } + } + + /// \brief Creates a new temporary file with the given content + std::string CreateTempFileWithContent(const std::string& content) { + std::string path = CreateNewTempFilePath(); + WriteContentToFile(path, content); + return path; + } + + std::vector created_temp_files_; +}; + +} // namespace iceberg