diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index a20621cf1..b8a1c098e 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -23,7 +23,5 @@ dist/** .git/** requirements.txt test/resources/** -*.avro *.json -*.parquet src/iceberg/util/murmurhash3_internal.* diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt index 21ccd4d66..af3dfa0fb 100644 --- a/src/iceberg/test/CMakeLists.txt +++ b/src/iceberg/test/CMakeLists.txt @@ -126,9 +126,7 @@ if(ICEBERG_BUILD_BUNDLE) avro_test.cc avro_schema_test.cc avro_stream_test.cc - manifest_list_reader_writer_test.cc manifest_list_versions_test.cc - manifest_reader_writer_test.cc manifest_writer_versions_test.cc test_common.cc) diff --git a/src/iceberg/test/manifest_list_reader_writer_test.cc b/src/iceberg/test/manifest_list_reader_writer_test.cc deleted file mode 100644 index ee6c7d9fe..000000000 --- a/src/iceberg/test/manifest_list_reader_writer_test.cc +++ /dev/null @@ -1,391 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include -#include - -#include "iceberg/arrow/arrow_fs_file_io_internal.h" -#include "iceberg/avro/avro_register.h" -#include "iceberg/expression/literal.h" -#include "iceberg/manifest/manifest_list.h" -#include "iceberg/manifest/manifest_reader.h" -#include "iceberg/manifest/manifest_writer.h" -#include "iceberg/test/matchers.h" -#include "iceberg/test/temp_file_test_base.h" -#include "iceberg/test/test_common.h" - -namespace iceberg { - -class ManifestListReaderWriterTestBase : public TempFileTestBase { - protected: - static void SetUpTestSuite() { avro::RegisterAll(); } - - void SetUp() override { - TempFileTestBase::SetUp(); - local_fs_ = std::make_shared<::arrow::fs::LocalFileSystem>(); - file_io_ = std::make_shared(local_fs_); - } - - void TestManifestListReading(const std::string& resource_name, - const std::vector& expected_manifest_list) { - std::string path = GetResourcePath(resource_name); - TestManifestListReadingByPath(path, expected_manifest_list); - } - - void TestManifestListReadingByPath( - const std::string& path, const std::vector& expected_manifest_list) { - auto manifest_reader_result = ManifestListReader::Make(path, file_io_); - ASSERT_EQ(manifest_reader_result.has_value(), true); - - auto manifest_reader = std::move(manifest_reader_result.value()); - auto read_result = manifest_reader->Files(); - ASSERT_EQ(read_result.has_value(), true); - ASSERT_EQ(read_result.value().size(), expected_manifest_list.size()); - ASSERT_EQ(read_result.value(), expected_manifest_list); - } - - void TestNonPartitionedManifests(const std::vector& manifest_files) { - for (const auto& manifest : manifest_files) { - ASSERT_EQ(manifest.partition_spec_id, 0); - ASSERT_TRUE(manifest.partitions.empty()); - ASSERT_EQ(manifest.content, ManifestContent::kData); - } - } - - std::shared_ptr<::arrow::fs::LocalFileSystem> local_fs_; - std::shared_ptr file_io_; -}; - -class ManifestListReaderWriterV1Test : public ManifestListReaderWriterTestBase { - protected: - std::vector PreparePartitionedTestData() { - std::vector paths = { - "iceberg-warehouse/db/v1_partition_test/metadata/" - "eafd2972-f58e-4185-9237-6378f564787e-m1.avro", - "iceberg-warehouse/db/v1_partition_test/metadata/" - "eafd2972-f58e-4185-9237-6378f564787e-m0.avro"}; - std::vector file_size = {6185, 6113}; - std::vector snapshot_id = {7532614258660258098, 7532614258660258098}; - - return { - {.manifest_path = paths[0], - .manifest_length = file_size[0], - .partition_spec_id = 0, - .added_snapshot_id = snapshot_id[0], - .added_files_count = 4, - .existing_files_count = 0, - .deleted_files_count = 0, - .added_rows_count = 6, - .existing_rows_count = 0, - .deleted_rows_count = 0, - .partitions = {{.contains_null = false, - .contains_nan = false, - .lower_bound = Literal::String("2022-02-22").Serialize().value(), - .upper_bound = - Literal::String("2022-2-23").Serialize().value()}}}, - - {.manifest_path = paths[1], - .manifest_length = file_size[1], - .partition_spec_id = 0, - .added_snapshot_id = snapshot_id[1], - .added_files_count = 0, - .existing_files_count = 0, - .deleted_files_count = 2, - .added_rows_count = 0, - .existing_rows_count = 0, - .deleted_rows_count = 6, - .partitions = { - {.contains_null = false, - .contains_nan = false, - .lower_bound = Literal::String("2022-2-22").Serialize().value(), - .upper_bound = Literal::String("2022-2-23").Serialize().value()}}}}; - } - - std::vector PrepareComplexTypeTestData() { - std::vector paths = { - "iceberg-warehouse/db/v1_type_test/metadata/" - "aeffe099-3bac-4011-bc17-5875210d8dc0-m1.avro", - "iceberg-warehouse/db/v1_type_test/metadata/" - "aeffe099-3bac-4011-bc17-5875210d8dc0-m0.avro"}; - std::vector file_size = {6498, 6513}; - std::vector snapshot_id = {4134160420377642835, 4134160420377642835}; - - return {{.manifest_path = paths[0], - .manifest_length = file_size[0], - .partition_spec_id = 0, - .added_snapshot_id = snapshot_id[0], - .added_files_count = 1, - .existing_files_count = 0, - .deleted_files_count = 0, - .added_rows_count = 2, - .existing_rows_count = 0, - .deleted_rows_count = 0}, - - {.manifest_path = paths[1], - .manifest_length = file_size[1], - .partition_spec_id = 0, - .added_snapshot_id = snapshot_id[1], - .added_files_count = 0, - .existing_files_count = 0, - .deleted_files_count = 1, - .added_rows_count = 0, - .existing_rows_count = 0, - .deleted_rows_count = 3}}; - } - - std::vector PrepareComplexPartitionedTestData() { - std::vector paths = { - "iceberg-warehouse/db2/v1_complex_partition_test/metadata/" - "5d690750-8fb4-4cd1-8ae7-85c7b39abe14-m0.avro", - "iceberg-warehouse/db2/v1_complex_partition_test/metadata/" - "5d690750-8fb4-4cd1-8ae7-85c7b39abe14-m1.avro"}; - std::vector file_size = {6402, 6318}; - std::vector snapshot_id = {7522296285847100621, 7522296285847100621}; - - std::vector> lower_bounds = { - {0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x32}, - {0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x32}, - {0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}}; - - std::vector> upper_bounds = { - {0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x34}, - {0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x33}, - {0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}}; - - return {{.manifest_path = paths[0], - .manifest_length = file_size[0], - .partition_spec_id = 0, - .added_snapshot_id = snapshot_id[0], - .added_files_count = 0, - .existing_files_count = 3, - .deleted_files_count = 1, - .added_rows_count = 0, - .existing_rows_count = 4, - .deleted_rows_count = 2, - .partitions = {{.contains_null = false, - .contains_nan = false, - .lower_bound = lower_bounds[0], - .upper_bound = upper_bounds[0]}, - {.contains_null = false, - .contains_nan = false, - .lower_bound = lower_bounds[1], - .upper_bound = upper_bounds[1]}}}, - - {.manifest_path = paths[1], - .manifest_length = file_size[1], - .partition_spec_id = 0, - .added_snapshot_id = snapshot_id[1], - .added_files_count = 0, - .existing_files_count = 1, - .deleted_files_count = 1, - .added_rows_count = 0, - .existing_rows_count = 1, - .deleted_rows_count = 1, - .partitions = {{.contains_null = false, - .contains_nan = false, - .lower_bound = lower_bounds[2], - .upper_bound = upper_bounds[2]}, - {.contains_null = false, - .contains_nan = false, - .lower_bound = lower_bounds[3], - .upper_bound = upper_bounds[3]}}}}; - } - - void TestWriteManifestList(const std::string& manifest_list_path, - const std::vector& manifest_files) { - auto result = ManifestListWriter::MakeV1Writer(1, 0, manifest_list_path, file_io_); - ASSERT_TRUE(result.has_value()) << result.error().message; - auto writer = std::move(result.value()); - auto status = writer->AddAll(manifest_files); - EXPECT_THAT(status, IsOk()); - status = writer->Close(); - EXPECT_THAT(status, IsOk()); - } -}; - -class ManifestListReaderWriterV2Test : public ManifestListReaderWriterTestBase { - protected: - std::vector PreparePartitionedTestData() { - std::vector manifest_files; - std::string test_dir_prefix = "/tmp/db/db/iceberg_test/metadata/"; - std::vector paths = {"2bccd69e-d642-4816-bba0-261cd9bd0d93-m0.avro", - "9b6ffacd-ef10-4abf-a89c-01c733696796-m0.avro", - "2541e6b5-4923-4bd5-886d-72c6f7228400-m0.avro", - "3118c801-d2e0-4df6-8c7a-7d4eaade32f8-m0.avro"}; - std::vector file_size = {7433, 7431, 7433, 7431}; - std::vector snapshot_id = {7412193043800610213, 5485972788975780755, - 1679468743751242972, 1579605567338877265}; - std::vector> bounds = {{'x', ';', 0x07, 0x00}, - {'(', 0x19, 0x07, 0x00}, - {0xd0, 0xd4, 0x06, 0x00}, - {0xb8, 0xd4, 0x06, 0x00}}; - for (int i = 0; i < 4; ++i) { - ManifestFile manifest_file; - manifest_file.manifest_path = test_dir_prefix + paths[i]; - manifest_file.manifest_length = file_size[i]; - manifest_file.partition_spec_id = 0; - manifest_file.content = ManifestContent::kData; - manifest_file.sequence_number = 4 - i; - manifest_file.min_sequence_number = 4 - i; - manifest_file.added_snapshot_id = snapshot_id[i]; - manifest_file.added_files_count = 1; - manifest_file.existing_files_count = 0; - manifest_file.deleted_files_count = 0; - manifest_file.added_rows_count = 1; - manifest_file.existing_rows_count = 0; - manifest_file.deleted_rows_count = 0; - PartitionFieldSummary partition; - partition.contains_null = false; - partition.contains_nan = false; - partition.lower_bound = bounds[i]; - partition.upper_bound = bounds[i]; - manifest_file.partitions.emplace_back(partition); - manifest_files.emplace_back(manifest_file); - } - return manifest_files; - } - - std::vector PrepareNonPartitionedTestData() { - std::vector manifest_files; - std::string test_dir_prefix = "/tmp/db/db/v2_non_partitioned_test/metadata/"; - - std::vector paths = {"ccb6dbcb-0611-48da-be68-bd506ea63188-m0.avro", - "b89a10c9-a7a8-4526-99c5-5587a4ea7527-m0.avro", - "a74d20fa-c800-4706-9ddb-66be15a5ecb0-m0.avro", - "ae7d5fce-7245-4335-9b57-bc598c595c84-m0.avro"}; - - std::vector file_size = {7169, 7170, 7169, 7170}; - - std::vector snapshot_id = {251167482216575399, 4248697313956014690, - 281757490425433194, 5521202581490753283}; - - for (int i = 0; i < 4; ++i) { - ManifestFile manifest_file; - manifest_file.manifest_path = test_dir_prefix + paths[i]; - manifest_file.manifest_length = file_size[i]; - manifest_file.partition_spec_id = 0; - manifest_file.content = ManifestContent::kData; - manifest_file.sequence_number = 4 - i; - manifest_file.min_sequence_number = 4 - i; - manifest_file.added_snapshot_id = snapshot_id[i]; - manifest_file.added_files_count = 1; - manifest_file.existing_files_count = 0; - manifest_file.deleted_files_count = 0; - manifest_file.added_rows_count = 1; - manifest_file.existing_rows_count = 0; - manifest_file.deleted_rows_count = 0; - // Note: no partitions for non-partitioned test - manifest_files.emplace_back(manifest_file); - } - return manifest_files; - } - - void TestWriteManifestList(const std::string& manifest_list_path, - const std::vector& manifest_files) { - auto result = ManifestListWriter::MakeV2Writer(1, 0, 4, manifest_list_path, file_io_); - ASSERT_TRUE(result.has_value()) << result.error().message; - auto writer = std::move(result.value()); - auto status = writer->AddAll(manifest_files); - EXPECT_THAT(status, IsOk()); - status = writer->Close(); - EXPECT_THAT(status, IsOk()); - } -}; - -// V1 Tests -TEST_F(ManifestListReaderWriterV1Test, PartitionedTest) { - auto expected_manifest_list = PreparePartitionedTestData(); - TestManifestListReading( - "snap-7532614258660258098-1-eafd2972-f58e-4185-9237-6378f564787e.avro", - expected_manifest_list); -} - -TEST_F(ManifestListReaderWriterV1Test, ComplexTypeTest) { - auto expected_manifest_list = PrepareComplexTypeTestData(); - TestManifestListReading( - "snap-4134160420377642835-1-aeffe099-3bac-4011-bc17-5875210d8dc0.avro", - expected_manifest_list); -} - -TEST_F(ManifestListReaderWriterV1Test, ComplexPartitionedTest) { - auto expected_manifest_list = PrepareComplexPartitionedTestData(); - TestManifestListReading( - "snap-7522296285847100621-1-5d690750-8fb4-4cd1-8ae7-85c7b39abe14.avro", - expected_manifest_list); -} - -TEST_F(ManifestListReaderWriterV1Test, WritePartitionedTest) { - auto expected_manifest_list = PreparePartitionedTestData(); - auto write_manifest_list_path = CreateNewTempFilePath(); - TestWriteManifestList(write_manifest_list_path, expected_manifest_list); - TestManifestListReadingByPath(write_manifest_list_path, expected_manifest_list); -} - -TEST_F(ManifestListReaderWriterV1Test, WriteComplexTypeTest) { - auto expected_manifest_list = PrepareComplexTypeTestData(); - auto write_manifest_list_path = CreateNewTempFilePath(); - TestWriteManifestList(write_manifest_list_path, expected_manifest_list); - TestManifestListReadingByPath(write_manifest_list_path, expected_manifest_list); -} - -TEST_F(ManifestListReaderWriterV1Test, WriteComplexPartitionedTest) { - auto expected_manifest_list = PrepareComplexPartitionedTestData(); - auto write_manifest_list_path = CreateNewTempFilePath(); - TestWriteManifestList(write_manifest_list_path, expected_manifest_list); - TestManifestListReadingByPath(write_manifest_list_path, expected_manifest_list); -} - -// V2 Tests -TEST_F(ManifestListReaderWriterV2Test, PartitionedTest) { - auto expected_manifest_list = PreparePartitionedTestData(); - TestManifestListReading( - "snap-7412193043800610213-1-2bccd69e-d642-4816-bba0-261cd9bd0d93.avro", - expected_manifest_list); -} - -TEST_F(ManifestListReaderWriterV2Test, NonPartitionedTest) { - auto expected_manifest_list = PrepareNonPartitionedTestData(); - TestManifestListReading( - "snap-251167482216575399-1-ccb6dbcb-0611-48da-be68-bd506ea63188.avro", - expected_manifest_list); - - // Additional verification: ensure all manifests are truly non-partitioned - TestNonPartitionedManifests(expected_manifest_list); -} - -TEST_F(ManifestListReaderWriterV2Test, WritePartitionedTest) { - auto expected_manifest_list = PreparePartitionedTestData(); - auto write_manifest_list_path = CreateNewTempFilePath(); - TestWriteManifestList(write_manifest_list_path, expected_manifest_list); - TestManifestListReadingByPath(write_manifest_list_path, expected_manifest_list); -} - -TEST_F(ManifestListReaderWriterV2Test, WriteNonPartitionedTest) { - auto expected_manifest_list = PrepareNonPartitionedTestData(); - auto write_manifest_list_path = CreateNewTempFilePath(); - TestWriteManifestList(write_manifest_list_path, expected_manifest_list); - TestManifestListReadingByPath(write_manifest_list_path, expected_manifest_list); - - // Additional verification: ensure all manifests are truly non-partitioned - TestNonPartitionedManifests(expected_manifest_list); -} - -} // namespace iceberg diff --git a/src/iceberg/test/manifest_reader_writer_test.cc b/src/iceberg/test/manifest_reader_writer_test.cc deleted file mode 100644 index b3daaf94e..000000000 --- a/src/iceberg/test/manifest_reader_writer_test.cc +++ /dev/null @@ -1,333 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include - -#include -#include - -#include "iceberg/arrow/arrow_fs_file_io_internal.h" -#include "iceberg/avro/avro_register.h" -#include "iceberg/manifest/manifest_entry.h" -#include "iceberg/manifest/manifest_list.h" -#include "iceberg/manifest/manifest_reader.h" -#include "iceberg/manifest/manifest_writer.h" -#include "iceberg/partition_spec.h" -#include "iceberg/schema.h" -#include "iceberg/test/matchers.h" -#include "iceberg/test/temp_file_test_base.h" -#include "iceberg/test/test_common.h" -#include "iceberg/transform.h" -#include "iceberg/type.h" - -namespace iceberg { - -class ManifestReaderWriterTestBase : public TempFileTestBase { - protected: - static void SetUpTestSuite() { avro::RegisterAll(); } - - void SetUp() override { - TempFileTestBase::SetUp(); - local_fs_ = std::make_shared<::arrow::fs::LocalFileSystem>(); - file_io_ = std::make_shared(local_fs_); - } - - void TestManifestReading(const std::string& resource_name, - const std::vector& expected_entries, - std::shared_ptr partition_schema = nullptr, - std::optional snapshot_id = std::nullopt) { - std::string path = GetResourcePath(resource_name); - TestManifestReadingByPath(path, expected_entries, partition_schema, snapshot_id); - } - - void TestManifestReadingByPath(const std::string& path, - const std::vector& expected_entries, - std::shared_ptr partition_schema = nullptr, - std::optional snapshot_id = std::nullopt) { - auto manifest_reader_result = ManifestReader::Make(path, file_io_, partition_schema); - ASSERT_TRUE(manifest_reader_result.has_value()) - << manifest_reader_result.error().message; - - auto manifest_reader = std::move(manifest_reader_result.value()); - auto read_result = manifest_reader->Entries(); - ASSERT_TRUE(read_result.has_value()) << read_result.error().message; - ASSERT_EQ(read_result.value().size(), expected_entries.size()); - ASSERT_EQ(read_result.value(), expected_entries); - } - - void TestManifestReadingWithManifestFile( - const ManifestFile& manifest_file, - const std::vector& expected_entries, - std::shared_ptr partition_schema = nullptr) { - auto manifest_reader_result = - ManifestReader::Make(manifest_file, file_io_, partition_schema); - ASSERT_TRUE(manifest_reader_result.has_value()) - << manifest_reader_result.error().message; - - auto manifest_reader = std::move(manifest_reader_result.value()); - auto read_result = manifest_reader->Entries(); - ASSERT_TRUE(read_result.has_value()) << read_result.error().message; - ASSERT_EQ(read_result.value().size(), expected_entries.size()); - ASSERT_EQ(read_result.value(), expected_entries); - } - - std::shared_ptr<::arrow::fs::LocalFileSystem> local_fs_; - std::shared_ptr file_io_; -}; - -class ManifestV1Test : public ManifestReaderWriterTestBase { - protected: - std::vector PreparePartitionedTestData() { - std::vector manifest_entries; - std::string test_dir_prefix = "/tmp/db/db/iceberg_test/data/"; - std::vector paths = { - "order_ts_hour=2021-01-27-00/" - "00000-2-d5ae78b7-4449-45ec-adb7-c0e9c0bdb714-0-00001.parquet", - "order_ts_hour=2024-01-27-00/" - "00000-2-d5ae78b7-4449-45ec-adb7-c0e9c0bdb714-0-00002.parquet", - "order_ts_hour=2023-01-26-00/" - "00000-2-d5ae78b7-4449-45ec-adb7-c0e9c0bdb714-0-00003.parquet", - "order_ts_hour=2021-01-26-00/" - "00000-2-d5ae78b7-4449-45ec-adb7-c0e9c0bdb714-0-00004.parquet"}; - std::vector partitions = {447696, 473976, 465192, 447672}; - - // Note: The precision and scale for decimal literals are chosen arbitrarily here, - // since the lower and upper bounds for decimal values are stored as unscaled int128_t - // values in manifest files. - std::vector>> bounds = { - {{1, Literal::Long(1234).Serialize().value()}, - {2, Literal::Long(5678).Serialize().value()}, - {3, Literal::Decimal(4834, 10, 2).Serialize().value()}, - {4, Literal::Timestamp(1611706223000000LL).Serialize().value()}}, - - {{1, Literal::Long(1234).Serialize().value()}, - {2, Literal::Long(5678).Serialize().value()}, - {3, Literal::Decimal(4835, 10, 2).Serialize().value()}, - {4, Literal::Timestamp(1706314223000000LL).Serialize().value()}}, - - {{1, Literal::Long(123).Serialize().value()}, - {2, Literal::Long(456).Serialize().value()}, - {3, Literal::Decimal(3618, 10, 2).Serialize().value()}, - {4, Literal::Timestamp(1674691823000000LL).Serialize().value()}}, - - {{1, Literal::Long(123).Serialize().value()}, - {2, Literal::Long(456).Serialize().value()}, - {3, Literal::Decimal(3617, 10, 2).Serialize().value()}, - {4, Literal::Timestamp(1611619823000000LL).Serialize().value()}}, - }; - - for (int i = 0; i < 4; ++i) { - ManifestEntry entry; - entry.status = ManifestStatus::kAdded; - entry.snapshot_id = 6387266376565973956; - entry.data_file = std::make_shared(); - entry.data_file->file_path = test_dir_prefix + paths[i]; - entry.data_file->file_format = FileFormatType::kParquet; - entry.data_file->partition.AddValue(Literal::Int(partitions[i])); - entry.data_file->record_count = 1; - entry.data_file->file_size_in_bytes = 1375; - entry.data_file->column_sizes = {{1, 49}, {2, 49}, {3, 49}, {4, 49}}; - entry.data_file->value_counts = {{1, 1}, {2, 1}, {3, 1}, {4, 1}}; - entry.data_file->null_value_counts = {{1, 0}, {2, 0}, {3, 0}, {4, 0}}; - entry.data_file->split_offsets = {4}; - entry.data_file->sort_order_id = 0; - entry.data_file->upper_bounds = bounds[i]; - entry.data_file->lower_bounds = bounds[i]; - manifest_entries.emplace_back(entry); - } - return manifest_entries; - } - - void TestWriteManifest(int64_t snapshot_id, const std::string& manifest_list_path, - std::shared_ptr partition_spec, - const std::vector& manifest_entries, - std::shared_ptr table_schema) { - auto result = - ManifestWriter::MakeV1Writer(snapshot_id, manifest_list_path, file_io_, - std::move(partition_spec), std::move(table_schema)); - ASSERT_TRUE(result.has_value()) << result.error().message; - auto writer = std::move(result.value()); - auto status = writer->AddAll(manifest_entries); - EXPECT_THAT(status, IsOk()); - status = writer->Close(); - EXPECT_THAT(status, IsOk()); - } -}; - -TEST_F(ManifestV1Test, ReadPartitionedTest) { - // TODO(xiao.dong) we need to add more cases for different partition types - SchemaField partition_field(1000, "order_ts_hour", int32(), true); - auto partition_schema = - std::make_shared(std::vector({partition_field})); - auto expected_entries = PreparePartitionedTestData(); - TestManifestReading("56357cd7-391f-4df8-aa24-e7e667da8870-m4.avro", expected_entries, - partition_schema); -} - -TEST_F(ManifestV1Test, WritePartitionedTest) { - SchemaField table_field(1, "order_ts_hour_source", int32(), true); - SchemaField partition_field(1000, "order_ts_hour", int32(), true); - auto table_schema = std::make_shared(std::vector({table_field})); - auto partition_schema = - std::make_shared(std::vector({partition_field})); - auto identity_transform = Transform::Identity(); - std::vector fields{ - PartitionField(1, 1000, "order_ts_hour", identity_transform)}; - ICEBERG_UNWRAP_OR_FAIL(std::shared_ptr partition_spec, - PartitionSpec::Make(*table_schema, 1, fields, false)); - - auto expected_entries = PreparePartitionedTestData(); - auto write_manifest_path = CreateNewTempFilePath(); - TestWriteManifest(1, write_manifest_path, partition_spec, expected_entries, - table_schema); - TestManifestReadingByPath(write_manifest_path, expected_entries, partition_schema, 1); -} - -class ManifestV2Test : public ManifestReaderWriterTestBase { - protected: - std::vector CreateV2TestData( - std::optional sequence_number = std::nullopt, - std::optional partition_spec_id = std::nullopt) { - std::vector manifest_entries; - std::string test_dir_prefix = "/tmp/db/db/v2_manifest_non_partitioned/data/"; - - std::vector paths = { - "00000-0-b0f98903-6d21-45fd-9e0b-afbd4963e365-0-00001.parquet"}; - - std::vector file_sizes = {1344}; - std::vector record_counts = {4}; - - std::vector>> lower_bounds = { - {{1, Literal::Long(1).Serialize().value()}, - {2, Literal::String("record_four").Serialize().value()}, - {3, Literal::String("data_content_1").Serialize().value()}, - {4, Literal::Double(123.45).Serialize().value()}}}; - - std::vector>> upper_bounds = { - {{1, Literal::Long(4).Serialize().value()}, - {2, Literal::String("record_two").Serialize().value()}, - {3, Literal::String("data_content_4").Serialize().value()}, - {4, Literal::Double(456.78).Serialize().value()}}}; - - DataFile data_file{.file_path = test_dir_prefix + paths[0], - .file_format = FileFormatType::kParquet, - .record_count = record_counts[0], - .file_size_in_bytes = file_sizes[0], - .column_sizes = {{1, 56}, {2, 73}, {3, 66}, {4, 67}}, - .value_counts = {{1, 4}, {2, 4}, {3, 4}, {4, 4}}, - .null_value_counts = {{1, 0}, {2, 0}, {3, 0}, {4, 0}}, - .nan_value_counts = {{4, 0}}, - .lower_bounds = lower_bounds[0], - .upper_bounds = upper_bounds[0], - .key_metadata = {}, - .split_offsets = {4}, - .equality_ids = {}, - .sort_order_id = 0, - .first_row_id = std::nullopt, - .referenced_data_file = std::nullopt, - .content_offset = std::nullopt, - .content_size_in_bytes = std::nullopt}; - - if (partition_spec_id.has_value()) { - data_file.partition_spec_id = partition_spec_id.value(); - } - - manifest_entries.emplace_back( - ManifestEntry{.status = ManifestStatus::kAdded, - .snapshot_id = 679879563479918846LL, - .sequence_number = sequence_number, - .file_sequence_number = sequence_number, - .data_file = std::make_shared(data_file)}); - return manifest_entries; - } - - std::vector PrepareNonPartitionedTestData() { - return CreateV2TestData(); - } - - std::vector PrepareMetadataInheritanceTestData() { - return CreateV2TestData(/*sequence_number=*/15, /*partition_spec_id*/ 12); - } - - void TestWriteManifest(int64_t snapshot_id, const std::string& manifest_list_path, - std::shared_ptr partition_spec, - const std::vector& manifest_entries, - std::shared_ptr table_schema) { - auto result = ManifestWriter::MakeV2Writer( - snapshot_id, manifest_list_path, file_io_, std::move(partition_spec), - std::move(table_schema), ManifestContent::kData); - ASSERT_TRUE(result.has_value()) << result.error().message; - auto writer = std::move(result.value()); - auto status = writer->AddAll(manifest_entries); - EXPECT_THAT(status, IsOk()); - status = writer->Close(); - EXPECT_THAT(status, IsOk()); - } -}; - -TEST_F(ManifestV2Test, ReadNonPartitionedTest) { - auto expected_entries = PrepareNonPartitionedTestData(); - TestManifestReading("2ddf1bc9-830b-4015-aced-c060df36f150-m0.avro", expected_entries); -} - -TEST_F(ManifestV2Test, ReadMetadataInheritanceTest) { - std::string path = GetResourcePath("2ddf1bc9-830b-4015-aced-c060df36f150-m0.avro"); - ManifestFile manifest_file{ - .manifest_path = path, - .manifest_length = 100, - .partition_spec_id = 12, - .content = ManifestContent::kData, - .sequence_number = 15, - .added_snapshot_id = 679879563479918846LL, - }; - auto expected_entries = PrepareMetadataInheritanceTestData(); - TestManifestReadingWithManifestFile(manifest_file, expected_entries); -} - -TEST_F(ManifestV2Test, WriteNonPartitionedTest) { - SchemaField table_field(1, "order_ts_hour_source", int32(), true); - SchemaField partition_field(1000, "order_ts_hour", int32(), true); - auto table_schema = std::make_shared(std::vector({table_field})); - auto expected_entries = PrepareNonPartitionedTestData(); - auto write_manifest_path = CreateNewTempFilePath(); - TestWriteManifest(679879563479918846LL, write_manifest_path, - PartitionSpec::Unpartitioned(), expected_entries, table_schema); - TestManifestReadingByPath(write_manifest_path, expected_entries); -} - -TEST_F(ManifestV2Test, WriteInheritancePartitionedTest) { - SchemaField table_field(1, "order_ts_hour_source", int32(), true); - SchemaField partition_field(1000, "order_ts_hour", int32(), true); - auto table_schema = std::make_shared(std::vector({table_field})); - auto expected_entries = PrepareMetadataInheritanceTestData(); - auto write_manifest_path = CreateNewTempFilePath(); - TestWriteManifest(679879563479918846LL, write_manifest_path, - PartitionSpec::Unpartitioned(), expected_entries, table_schema); - ManifestFile manifest_file{ - .manifest_path = write_manifest_path, - .manifest_length = 100, - .partition_spec_id = 12, - .content = ManifestContent::kData, - .sequence_number = 15, - .added_snapshot_id = 679879563479918846LL, - }; - TestManifestReadingWithManifestFile(manifest_file, expected_entries); -} - -} // namespace iceberg diff --git a/src/iceberg/test/resources/2ddf1bc9-830b-4015-aced-c060df36f150-m0.avro b/src/iceberg/test/resources/2ddf1bc9-830b-4015-aced-c060df36f150-m0.avro deleted file mode 100644 index f8e6c1c41..000000000 Binary files a/src/iceberg/test/resources/2ddf1bc9-830b-4015-aced-c060df36f150-m0.avro and /dev/null differ diff --git a/src/iceberg/test/resources/56357cd7-391f-4df8-aa24-e7e667da8870-m4.avro b/src/iceberg/test/resources/56357cd7-391f-4df8-aa24-e7e667da8870-m4.avro deleted file mode 100644 index c671dfdf0..000000000 Binary files a/src/iceberg/test/resources/56357cd7-391f-4df8-aa24-e7e667da8870-m4.avro and /dev/null differ diff --git a/src/iceberg/test/resources/snap-251167482216575399-1-ccb6dbcb-0611-48da-be68-bd506ea63188.avro b/src/iceberg/test/resources/snap-251167482216575399-1-ccb6dbcb-0611-48da-be68-bd506ea63188.avro deleted file mode 100644 index d8621c6b3..000000000 Binary files a/src/iceberg/test/resources/snap-251167482216575399-1-ccb6dbcb-0611-48da-be68-bd506ea63188.avro and /dev/null differ diff --git a/src/iceberg/test/resources/snap-4134160420377642835-1-aeffe099-3bac-4011-bc17-5875210d8dc0.avro b/src/iceberg/test/resources/snap-4134160420377642835-1-aeffe099-3bac-4011-bc17-5875210d8dc0.avro deleted file mode 100644 index 29584b8ce..000000000 Binary files a/src/iceberg/test/resources/snap-4134160420377642835-1-aeffe099-3bac-4011-bc17-5875210d8dc0.avro and /dev/null differ diff --git a/src/iceberg/test/resources/snap-7412193043800610213-1-2bccd69e-d642-4816-bba0-261cd9bd0d93.avro b/src/iceberg/test/resources/snap-7412193043800610213-1-2bccd69e-d642-4816-bba0-261cd9bd0d93.avro deleted file mode 100644 index c22993917..000000000 Binary files a/src/iceberg/test/resources/snap-7412193043800610213-1-2bccd69e-d642-4816-bba0-261cd9bd0d93.avro and /dev/null differ diff --git a/src/iceberg/test/resources/snap-7522296285847100621-1-5d690750-8fb4-4cd1-8ae7-85c7b39abe14.avro b/src/iceberg/test/resources/snap-7522296285847100621-1-5d690750-8fb4-4cd1-8ae7-85c7b39abe14.avro deleted file mode 100644 index 590edc1f8..000000000 Binary files a/src/iceberg/test/resources/snap-7522296285847100621-1-5d690750-8fb4-4cd1-8ae7-85c7b39abe14.avro and /dev/null differ diff --git a/src/iceberg/test/resources/snap-7532614258660258098-1-eafd2972-f58e-4185-9237-6378f564787e.avro b/src/iceberg/test/resources/snap-7532614258660258098-1-eafd2972-f58e-4185-9237-6378f564787e.avro deleted file mode 100644 index 4fba684a2..000000000 Binary files a/src/iceberg/test/resources/snap-7532614258660258098-1-eafd2972-f58e-4185-9237-6378f564787e.avro and /dev/null differ