Skip to content

Commit 5bffdf6

Browse files
dongxiao1198 and xiao.dong authored
feat: support operator== for Literal/Manifest/ManifestList (#147)
add operator== for manifest and manifest list --------- Co-authored-by: xiao.dong <[email protected]>
1 parent 0279fdc commit 5bffdf6

File tree

6 files changed

+79
-80
lines changed

6 files changed

+79
-80
lines changed

src/iceberg/expression/literal.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,8 @@ std::strong_ordering CompareFloat(T lhs, T rhs) {
179179
return lhs_is_negative <=> rhs_is_negative;
180180
}
181181

182+
/// \brief Equality derived from the three-way comparison.
///
/// Returns true only when `*this <=> other` yields an equivalent ordering; any
/// other result, including an unordered one, yields false.
bool Literal::operator==(const Literal& other) const {
  const std::partial_ordering ordering = *this <=> other;
  return ordering == std::partial_ordering::equivalent;
}
183+
182184
// Three-way comparison operator
183185
std::partial_ordering Literal::operator<=>(const Literal& other) const {
184186
// If types are different, comparison is unordered

src/iceberg/expression/literal.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,8 @@ class ICEBERG_EXPORT Literal {
105105
/// was not valid
106106
Result<Literal> CastTo(const std::shared_ptr<PrimitiveType>& target_type) const;
107107

108+
bool operator==(const Literal& other) const;
109+
108110
/// \brief Compare two PrimitiveLiterals. Both literals must have the same type
109111
/// and should not be AboveMax or BelowMin.
110112
std::partial_ordering operator<=>(const Literal& other) const;

src/iceberg/manifest_entry.cc

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,14 @@
2727

2828
namespace iceberg {
2929

30+
/// \brief Field-wise equality for manifest entries.
///
/// Two entries are equal when status, snapshot_id, sequence_number and
/// file_sequence_number all match, and their data_file members are either both
/// null or both non-null and refer to equal DataFile values.
bool ManifestEntry::operator==(const ManifestEntry& other) const {
  const bool same_scalar_fields = status == other.status &&
                                  snapshot_id == other.snapshot_id &&
                                  sequence_number == other.sequence_number &&
                                  file_sequence_number == other.file_sequence_number;
  if (!same_scalar_fields) {
    return false;
  }

  // The data_file check is evaluated as a single unit. Because `&&` binds tighter
  // than `||`, chaining "both null" with a bare `||` after the scalar checks would
  // make two entries with null data_file compare equal regardless of the other
  // fields.
  if (data_file && other.data_file) {
    return *data_file == *other.data_file;
  }
  return !data_file && !other.data_file;
}
37+
3038
std::shared_ptr<StructType> DataFile::Type(std::shared_ptr<StructType> partition_type) {
3139
return std::make_shared<StructType>(std::vector<SchemaField>{
3240
kContent,

src/iceberg/manifest_entry.h

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "iceberg/expression/literal.h"
3030
#include "iceberg/file_format.h"
3131
#include "iceberg/iceberg_export.h"
32+
#include "iceberg/partition_spec.h"
3233
#include "iceberg/result.h"
3334
#include "iceberg/schema_field.h"
3435
#include "iceberg/type.h"
@@ -68,13 +69,13 @@ struct ICEBERG_EXPORT DataFile {
6869
/// Field id: 134
6970
/// Type of content stored by the data file: data, equality deletes, or position
7071
/// deletes (all v1 files are data files)
71-
Content content;
72+
Content content = Content::kData;
7273
/// Field id: 100
7374
/// Full URI for the file with FS scheme
7475
std::string file_path;
7576
/// Field id: 101
7677
/// File format type, avro, orc, parquet, or puffin
77-
FileFormatType file_format;
78+
FileFormatType file_format = FileFormatType::kParquet;
7879
/// Field id: 102
7980
/// Partition data tuple, schema based on the partition spec output using partition
8081
/// field ids
@@ -146,7 +147,7 @@ struct ICEBERG_EXPORT DataFile {
146147
std::optional<int32_t> sort_order_id;
147148
/// This field is not included in spec, so it is not serialized into the manifest file.
148149
/// It is only stored in the in-memory representation used during processing.
149-
int32_t partition_spec_id;
150+
int32_t partition_spec_id = PartitionSpec::kInitialSpecId;
150151
/// Field id: 142
151152
/// The _row_id for the first row in the data file.
152153
///
@@ -261,6 +262,8 @@ struct ICEBERG_EXPORT DataFile {
261262
SchemaField::MakeOptional(145, "content_size_in_bytes", iceberg::int64(),
262263
"The length of referenced content stored in the file");
263264

265+
bool operator==(const DataFile& other) const = default;
266+
264267
static std::shared_ptr<StructType> Type(std::shared_ptr<StructType> partition_type);
265268
};
266269

@@ -272,7 +275,7 @@ struct ICEBERG_EXPORT ManifestEntry {
272275
/// Field id: 0
273276
/// Used to track additions and deletions. Deletes are informational only and not used
274277
/// in scans.
275-
ManifestStatus status;
278+
ManifestStatus status = ManifestStatus::kAdded;
276279
/// Field id: 1
277280
/// Snapshot id where the file was added, or deleted if status is 2. Inherited when
278281
/// null.
@@ -297,6 +300,8 @@ struct ICEBERG_EXPORT ManifestEntry {
297300
inline static const SchemaField kFileSequenceNumber =
298301
SchemaField::MakeOptional(4, "file_sequence_number", iceberg::int64());
299302

303+
bool operator==(const ManifestEntry& other) const;
304+
300305
static std::shared_ptr<StructType> TypeFromPartitionType(
301306
std::shared_ptr<StructType> partition_type);
302307
static std::shared_ptr<StructType> TypeFromDataFileType(

src/iceberg/manifest_list.h

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,11 @@
2828
#include <utility>
2929

3030
#include "iceberg/iceberg_export.h"
31+
#include "iceberg/partition_spec.h"
3132
#include "iceberg/result.h"
3233
#include "iceberg/schema_field.h"
34+
#include "iceberg/snapshot.h"
35+
#include "iceberg/table_metadata.h"
3336
#include "iceberg/type.h"
3437

3538
namespace iceberg {
@@ -40,7 +43,7 @@ namespace iceberg {
4043
struct ICEBERG_EXPORT PartitionFieldSummary {
4144
/// Field id: 509
4245
/// Whether the manifest contains at least one partition with a null value for the field
43-
bool contains_null;
46+
bool contains_null = true;
4447
/// Field id: 518
4548
/// Whether the manifest contains at least one partition with a NaN value for the field
4649
std::optional<bool> contains_nan;
@@ -64,6 +67,8 @@ struct ICEBERG_EXPORT PartitionFieldSummary {
6467
inline static const SchemaField kUpperBound = SchemaField::MakeOptional(
6568
511, "upper_bound", iceberg::binary(), "Partition upper bound for all files");
6669

70+
bool operator==(const PartitionFieldSummary& other) const = default;
71+
6772
static const StructType& Type();
6873
};
6974

@@ -83,26 +88,26 @@ struct ICEBERG_EXPORT ManifestFile {
8388
std::string manifest_path;
8489
/// Field id: 501
8590
/// Length of the manifest file in bytes
86-
int64_t manifest_length;
91+
int64_t manifest_length = 0;
8792
/// Field id: 502
8893
/// ID of a partition spec used to write the manifest; must be listed in table metadata
8994
/// partition-specs
90-
int32_t partition_spec_id;
95+
int32_t partition_spec_id = PartitionSpec::kInitialSpecId;
9196
/// Field id: 517
9297
/// The type of files tracked by the manifest, either data or delete files; 0 for all v1
9398
/// manifests
94-
Content content;
99+
Content content = Content::kData;
95100
/// Field id: 515
96101
/// The sequence number when the manifest was added to the table; use 0 when reading v1
97102
/// manifest lists
98-
int64_t sequence_number;
103+
int64_t sequence_number = TableMetadata::kInitialSequenceNumber;
99104
/// Field id: 516
100105
/// The minimum data sequence number of all live data or delete files in the manifest;
101106
/// use 0 when reading v1 manifest lists
102-
int64_t min_sequence_number;
107+
int64_t min_sequence_number = TableMetadata::kInitialSequenceNumber;
103108
/// Field id: 503
104109
/// ID of the snapshot where the manifest file was added
105-
int64_t added_snapshot_id;
110+
int64_t added_snapshot_id = Snapshot::kInvalidSnapshotId;
106111
/// Field id: 504
107112
/// Number of entries in the manifest that have status ADDED (1), when null this is
108113
/// assumed to be non-zero
@@ -137,7 +142,7 @@ struct ICEBERG_EXPORT ManifestFile {
137142
std::vector<uint8_t> key_metadata;
138143
/// Field id: 520
139144
/// The starting _row_id to assign to rows added by ADDED data files
140-
int64_t first_row_id;
145+
std::optional<int64_t> first_row_id;
141146

142147
/// \brief Checks if this manifest file contains entries with ADDED status.
143148
bool has_added_files() const { return added_files_count.value_or(1) > 0; }
@@ -188,6 +193,8 @@ struct ICEBERG_EXPORT ManifestFile {
188193
520, "first_row_id", iceberg::int64(),
189194
"Starting row ID to assign to new rows in ADDED data files");
190195

196+
bool operator==(const ManifestFile& other) const = default;
197+
191198
static const StructType& Type();
192199
};
193200

test/manifest_list_reader_test.cc

Lines changed: 43 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,46 @@ class ManifestListReaderTest : public TempFileTestBase {
4242
file_io_ = std::make_shared<iceberg::arrow::ArrowFileSystemFileIO>(local_fs_);
4343
}
4444

45+
std::vector<ManifestFile> PrepareTestManifestList() {
46+
std::vector<ManifestFile> manifest_files;
47+
std::string test_dir_prefix = "/tmp/db/db/iceberg_test/metadata/";
48+
std::vector<std::string> paths = {"2bccd69e-d642-4816-bba0-261cd9bd0d93-m0.avro",
49+
"9b6ffacd-ef10-4abf-a89c-01c733696796-m0.avro",
50+
"2541e6b5-4923-4bd5-886d-72c6f7228400-m0.avro",
51+
"3118c801-d2e0-4df6-8c7a-7d4eaade32f8-m0.avro"};
52+
std::vector<int64_t> file_size = {7433, 7431, 7433, 7431};
53+
std::vector<int64_t> snapshot_id = {7412193043800610213, 5485972788975780755,
54+
1679468743751242972, 1579605567338877265};
55+
std::vector<std::vector<uint8_t>> bounds = {{'x', ';', 0x07, 0x00},
56+
{'(', 0x19, 0x07, 0x00},
57+
{0xd0, 0xd4, 0x06, 0x00},
58+
{0xb8, 0xd4, 0x06, 0x00}};
59+
for (int i = 0; i < 4; ++i) {
60+
ManifestFile manifest_file;
61+
manifest_file.manifest_path = test_dir_prefix + paths[i];
62+
manifest_file.manifest_length = file_size[i];
63+
manifest_file.partition_spec_id = 0;
64+
manifest_file.content = ManifestFile::Content::kData;
65+
manifest_file.sequence_number = 4 - i;
66+
manifest_file.min_sequence_number = 4 - i;
67+
manifest_file.added_snapshot_id = snapshot_id[i];
68+
manifest_file.added_files_count = 1;
69+
manifest_file.existing_files_count = 0;
70+
manifest_file.deleted_files_count = 0;
71+
manifest_file.added_rows_count = 1;
72+
manifest_file.existing_rows_count = 0;
73+
manifest_file.deleted_rows_count = 0;
74+
PartitionFieldSummary partition;
75+
partition.contains_null = false;
76+
partition.contains_nan = false;
77+
partition.lower_bound = bounds[i];
78+
partition.upper_bound = bounds[i];
79+
manifest_file.partitions.emplace_back(partition);
80+
manifest_files.emplace_back(manifest_file);
81+
}
82+
return manifest_files;
83+
}
84+
4585
std::shared_ptr<::arrow::fs::LocalFileSystem> local_fs_;
4686
std::shared_ptr<FileIO> file_io_;
4787
};
@@ -55,74 +95,9 @@ TEST_F(ManifestListReaderTest, BasicTest) {
5595
auto read_result = manifest_reader->Files();
5696
ASSERT_EQ(read_result.has_value(), true);
5797
ASSERT_EQ(read_result.value().size(), 4);
58-
std::string test_dir_prefix = "/tmp/db/db/iceberg_test/metadata/";
59-
for (const auto& file : read_result.value()) {
60-
auto manifest_path = file.manifest_path.substr(test_dir_prefix.size());
61-
if (manifest_path == "2bccd69e-d642-4816-bba0-261cd9bd0d93-m0.avro") {
62-
ASSERT_EQ(file.added_snapshot_id, 7412193043800610213);
63-
ASSERT_EQ(file.manifest_length, 7433);
64-
ASSERT_EQ(file.sequence_number, 4);
65-
ASSERT_EQ(file.min_sequence_number, 4);
66-
ASSERT_EQ(file.partitions.size(), 1);
67-
const auto& partition = file.partitions[0];
68-
ASSERT_EQ(partition.contains_null, false);
69-
ASSERT_EQ(partition.contains_nan.value(), false);
70-
ASSERT_EQ(partition.lower_bound.value(),
71-
std::vector<uint8_t>({'x', ';', 0x07, 0x00}));
72-
ASSERT_EQ(partition.upper_bound.value(),
73-
std::vector<uint8_t>({'x', ';', 0x07, 0x00}));
74-
} else if (manifest_path == "9b6ffacd-ef10-4abf-a89c-01c733696796-m0.avro") {
75-
ASSERT_EQ(file.added_snapshot_id, 5485972788975780755);
76-
ASSERT_EQ(file.manifest_length, 7431);
77-
ASSERT_EQ(file.sequence_number, 3);
78-
ASSERT_EQ(file.min_sequence_number, 3);
79-
ASSERT_EQ(file.partitions.size(), 1);
80-
const auto& partition = file.partitions[0];
81-
ASSERT_EQ(partition.contains_null, false);
82-
ASSERT_EQ(partition.contains_nan.value(), false);
83-
ASSERT_EQ(partition.lower_bound.value(),
84-
std::vector<uint8_t>({'(', 0x19, 0x07, 0x00}));
85-
ASSERT_EQ(partition.upper_bound.value(),
86-
std::vector<uint8_t>({'(', 0x19, 0x07, 0x00}));
87-
} else if (manifest_path == "2541e6b5-4923-4bd5-886d-72c6f7228400-m0.avro") {
88-
ASSERT_EQ(file.added_snapshot_id, 1679468743751242972);
89-
ASSERT_EQ(file.manifest_length, 7433);
90-
ASSERT_EQ(file.sequence_number, 2);
91-
ASSERT_EQ(file.min_sequence_number, 2);
92-
ASSERT_EQ(file.partitions.size(), 1);
93-
const auto& partition = file.partitions[0];
94-
ASSERT_EQ(partition.contains_null, false);
95-
ASSERT_EQ(partition.contains_nan.value(), false);
96-
ASSERT_EQ(partition.lower_bound.value(),
97-
std::vector<uint8_t>({0xd0, 0xd4, 0x06, 0x00}));
98-
ASSERT_EQ(partition.upper_bound.value(),
99-
std::vector<uint8_t>({0xd0, 0xd4, 0x06, 0x00}));
100-
} else if (manifest_path == "3118c801-d2e0-4df6-8c7a-7d4eaade32f8-m0.avro") {
101-
ASSERT_EQ(file.added_snapshot_id, 1579605567338877265);
102-
ASSERT_EQ(file.manifest_length, 7431);
103-
ASSERT_EQ(file.sequence_number, 1);
104-
ASSERT_EQ(file.min_sequence_number, 1);
105-
ASSERT_EQ(file.partitions.size(), 1);
106-
const auto& partition = file.partitions[0];
107-
ASSERT_EQ(partition.contains_null, false);
108-
ASSERT_EQ(partition.contains_nan.value(), false);
109-
ASSERT_EQ(partition.lower_bound.value(),
110-
std::vector<uint8_t>({0xb8, 0xd4, 0x06, 0x00}));
111-
ASSERT_EQ(partition.upper_bound.value(),
112-
std::vector<uint8_t>({0xb8, 0xd4, 0x06, 0x00}));
113-
} else {
114-
ASSERT_TRUE(false) << "Unexpected manifest file: " << manifest_path;
115-
}
116-
ASSERT_EQ(file.partition_spec_id, 0);
117-
ASSERT_EQ(file.content, ManifestFile::Content::kData);
118-
ASSERT_EQ(file.added_files_count, 1);
119-
ASSERT_EQ(file.existing_files_count, 0);
120-
ASSERT_EQ(file.deleted_files_count, 0);
121-
ASSERT_EQ(file.added_rows_count, 1);
122-
ASSERT_EQ(file.existing_rows_count, 0);
123-
ASSERT_EQ(file.deleted_rows_count, 0);
124-
ASSERT_EQ(file.key_metadata.empty(), true);
125-
}
98+
99+
auto expected_manifest_list = PrepareTestManifestList();
100+
ASSERT_EQ(read_result.value(), expected_manifest_list);
126101
}
127102

128103
} // namespace iceberg

0 commit comments

Comments
 (0)