diff --git a/src/iceberg/manifest_entry.cc b/src/iceberg/manifest_entry.cc index 4a9aaa2df..d387aad68 100644 --- a/src/iceberg/manifest_entry.cc +++ b/src/iceberg/manifest_entry.cc @@ -22,6 +22,7 @@ #include #include +#include "iceberg/schema.h" #include "iceberg/schema_field.h" #include "iceberg/type.h" @@ -29,13 +30,16 @@ namespace iceberg { bool ManifestEntry::operator==(const ManifestEntry& other) const { return status == other.status && snapshot_id == other.snapshot_id && - sequence_number == other.sequence_number && - file_sequence_number == other.file_sequence_number && - (data_file && other.data_file && *data_file == *other.data_file) || - (!data_file && !other.data_file); + sequence_number == other.sequence_number && + file_sequence_number == other.file_sequence_number && + ((data_file && other.data_file && *data_file == *other.data_file) || + (!data_file && !other.data_file)); } std::shared_ptr DataFile::Type(std::shared_ptr partition_type) { + if (!partition_type) { + partition_type = PartitionSpec::Unpartitioned()->schema(); + } return std::make_shared(std::vector{ kContent, kFilePath, diff --git a/src/iceberg/manifest_reader_internal.cc b/src/iceberg/manifest_reader_internal.cc index 251e0d71d..e3b9becc7 100644 --- a/src/iceberg/manifest_reader_internal.cc +++ b/src/iceberg/manifest_reader_internal.cc @@ -368,15 +368,17 @@ Status ParseDataFile(const std::shared_ptr& data_file_schema, break; case 3: { if (view_of_file_field->storage_type != ArrowType::NANOARROW_TYPE_STRUCT) { - return InvalidManifest("Field:{} should be a list.", field_name); + return InvalidManifest("Field:{} should be a struct.", field_name); } - auto view_of_partition = view_of_file_field->children[0]; - for (size_t row_idx = 0; row_idx < view_of_partition->length; row_idx++) { - if (ArrowArrayViewIsNull(view_of_partition, row_idx)) { - break; + if (view_of_file_field->n_children > 0) { + auto view_of_partition = view_of_file_field->children[0]; + for (int64_t row_idx = 0; row_idx < view_of_partition->length; row_idx++) { + if (ArrowArrayViewIsNull(view_of_partition, row_idx)) { + break; + } + ICEBERG_RETURN_UNEXPECTED( + ParseLiteral(view_of_partition, row_idx, manifest_entries)); } - ICEBERG_RETURN_UNEXPECTED( - ParseLiteral(view_of_partition, row_idx, manifest_entries)); } } break; case 4: diff --git a/src/iceberg/partition_spec.cc b/src/iceberg/partition_spec.cc index 47794e7c6..2b41950b4 100644 --- a/src/iceberg/partition_spec.cc +++ b/src/iceberg/partition_spec.cc @@ -46,8 +46,8 @@ PartitionSpec::PartitionSpec(std::shared_ptr schema, int32_t spec_id, const std::shared_ptr& PartitionSpec::Unpartitioned() { static const std::shared_ptr unpartitioned = std::make_shared( - /*schema=*/nullptr, kInitialSpecId, std::vector{}, - kLegacyPartitionDataIdStart - 1); + /*schema=*/std::make_shared(std::vector{}), kInitialSpecId, + std::vector{}, kLegacyPartitionDataIdStart - 1); return unpartitioned; } diff --git a/test/manifest_reader_test.cc b/test/manifest_reader_test.cc index 2fb20b73b..05224dbf8 100644 --- a/test/manifest_reader_test.cc +++ b/test/manifest_reader_test.cc @@ -19,13 +19,14 @@ #include "iceberg/manifest_reader.h" +#include + #include #include #include "iceberg/arrow/arrow_fs_file_io_internal.h" #include "iceberg/avro/avro_reader.h" #include "iceberg/avro/avro_register.h" -#include "iceberg/avro/avro_schema_util_internal.h" #include "iceberg/manifest_entry.h" #include "iceberg/schema.h" #include "temp_file_test_base.h" @@ -33,7 +34,7 @@ namespace iceberg { -class ManifestReaderTest : public TempFileTestBase { +class ManifestReaderV1Test : public TempFileTestBase { protected: static void SetUpTestSuite() { avro::AvroReader::Register(); } @@ -45,7 +46,7 @@ class ManifestReaderTest : public TempFileTestBase { avro::RegisterLogicalTypes(); } - std::vector prepare_manifest_entries() { + std::vector PrepareV1ManifestEntries() { std::vector manifest_entries; std::string test_dir_prefix = "/tmp/db/db/iceberg_test/data/"; std::vector paths = { @@ -102,7 +103,7 @@ class ManifestReaderTest : public TempFileTestBase { std::shared_ptr file_io_; }; -TEST_F(ManifestReaderTest, BasicTest) { +TEST_F(ManifestReaderV1Test, V1PartitionedBasicTest) { iceberg::SchemaField partition_field(1000, "order_ts_hour", iceberg::int32(), true); auto partition_schema = std::make_shared(std::vector({partition_field})); @@ -115,7 +116,88 @@ TEST_F(ManifestReaderTest, BasicTest) { auto read_result = manifest_reader->Entries(); ASSERT_EQ(read_result.has_value(), true) << read_result.error().message; - auto expected_entries = prepare_manifest_entries(); + auto expected_entries = PrepareV1ManifestEntries(); + ASSERT_EQ(read_result.value(), expected_entries); +} + +class ManifestReaderV2Test : public TempFileTestBase { + protected: + static void SetUpTestSuite() { avro::AvroReader::Register(); } + + void SetUp() override { + TempFileTestBase::SetUp(); + local_fs_ = std::make_shared<::arrow::fs::LocalFileSystem>(); + file_io_ = std::make_shared(local_fs_); + + avro::RegisterLogicalTypes(); + } + + std::vector PrepareV2NonPartitionedManifestEntries() { + std::vector manifest_entries; + std::string test_dir_prefix = "/tmp/db/db/v2_manifest_non_partitioned/data/"; + + std::vector paths = { + "00000-0-b0f98903-6d21-45fd-9e0b-afbd4963e365-0-00001.parquet"}; + + std::vector file_sizes = {1344}; + std::vector record_counts = {4}; + + std::vector>> lower_bounds = { + {{1, {0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}}, + {2, {'r', 'e', 'c', 'o', 'r', 'd', '_', 'f', 'o', 'u', 'r'}}, + {3, {'d', 'a', 't', 'a', '_', 'c', 'o', 'n', 't', 'e', 'n', 't', '_', '1'}}, + {4, {0xcd, 0xcc, 0xcc, 0xcc, 0xcc, 0xdc, 0x5e, 0x40}}}}; + + std::vector>> upper_bounds = { + {{1, {0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}}, + {2, {'r', 'e', 'c', 'o', 'r', 'd', '_', 't', 'w', 'o'}}, + {3, {'d', 'a', 't', 'a', '_', 'c', 'o', 'n', 't', 'e', 'n', 't', '_', '4'}}, + {4, {0x14, 0xae, 0x47, 0xe1, 0x7a, 0x8c, 0x7c, 0x40}}}}; + + manifest_entries.emplace_back( + ManifestEntry{.status = ManifestStatus::kAdded, + .snapshot_id = 679879563479918846LL, + .sequence_number = std::nullopt, + .file_sequence_number = std::nullopt, + .data_file = std::make_shared( + DataFile{.file_path = test_dir_prefix + paths[0], + .file_format = FileFormatType::kParquet, + .record_count = record_counts[0], + .file_size_in_bytes = file_sizes[0], + .column_sizes = {{1, 56}, {2, 73}, {3, 66}, {4, 67}}, + .value_counts = {{1, 4}, {2, 4}, {3, 4}, {4, 4}}, + .null_value_counts = {{1, 0}, {2, 0}, {3, 0}, {4, 0}}, + .nan_value_counts = {{4, 0}}, + .lower_bounds = lower_bounds[0], + .upper_bounds = upper_bounds[0], + .key_metadata = {}, + .split_offsets = {4}, + .equality_ids = {}, + .sort_order_id = 0, + .first_row_id = std::nullopt, + .referenced_data_file = std::nullopt, + .content_offset = std::nullopt, + .content_size_in_bytes = std::nullopt})}); + return manifest_entries; + } + + std::shared_ptr<::arrow::fs::LocalFileSystem> local_fs_; + std::shared_ptr file_io_; +}; + +TEST_F(ManifestReaderV2Test, V2NonPartitionedBasicTest) { + std::string path = GetResourcePath("2ddf1bc9-830b-4015-aced-c060df36f150-m0.avro"); + + auto manifest_reader_result = ManifestReader::MakeReader(path, file_io_, nullptr); + ASSERT_EQ(manifest_reader_result.has_value(), true) + << manifest_reader_result.error().message; + + auto manifest_reader = std::move(manifest_reader_result.value()); + auto read_result = manifest_reader->Entries(); + ASSERT_EQ(read_result.has_value(), true) << read_result.error().message; + ASSERT_EQ(read_result.value().size(), 1); + + auto expected_entries = PrepareV2NonPartitionedManifestEntries(); ASSERT_EQ(read_result.value(), expected_entries); } diff --git a/test/resources/2ddf1bc9-830b-4015-aced-c060df36f150-m0.avro b/test/resources/2ddf1bc9-830b-4015-aced-c060df36f150-m0.avro new file mode 100644 index 000000000..f8e6c1c41 Binary files /dev/null and b/test/resources/2ddf1bc9-830b-4015-aced-c060df36f150-m0.avro differ