Skip to content

Commit d7e2685

Browse files
authored
fix: TableMetadata max sequence number validation (#1167)
## What changes are included in this PR? Ensures that the sequence number of any snapshot in TableMetadata is lower or equal to the `last_sequence_number`. In Java this happens during indexing: https://github.com/apache/iceberg/blob/0152075268851d397b60bcb66e5a7c346271dfd9/core/src/main/java/org/apache/iceberg/TableMetadata.java#L842
1 parent e3ef617 commit d7e2685

File tree

1 file changed

+103
-2
lines changed

1 file changed

+103
-2
lines changed

crates/iceberg/src/spec/table_metadata.rs

Lines changed: 103 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -434,7 +434,7 @@ impl TableMetadata {
434434
self.validate_chronological_metadata_logs()?;
435435
// Normalize location (remove trailing slash)
436436
self.location = self.location.trim_end_matches('/').to_string();
437-
self.validate_format_version_specifics()?;
437+
self.validate_snapshot_sequence_number()?;
438438
self.try_normalize_partition_spec()?;
439439
self.try_normalize_sort_order()?;
440440
Ok(self)
@@ -547,7 +547,7 @@ impl TableMetadata {
547547
}
548548

549549
/// Validate that for V1 Metadata the last_sequence_number is 0
550-
fn validate_format_version_specifics(&self) -> Result<()> {
550+
fn validate_snapshot_sequence_number(&self) -> Result<()> {
551551
if self.format_version < FormatVersion::V2 && self.last_sequence_number != 0 {
552552
return Err(Error::new(
553553
ErrorKind::DataInvalid,
@@ -558,6 +558,24 @@ impl TableMetadata {
558558
));
559559
}
560560

561+
if self.format_version >= FormatVersion::V2 {
562+
if let Some(snapshot) = self
563+
.snapshots
564+
.values()
565+
.find(|snapshot| snapshot.sequence_number() > self.last_sequence_number)
566+
{
567+
return Err(Error::new(
568+
ErrorKind::DataInvalid,
569+
format!(
570+
"Invalid snapshot with id {} and sequence number {} greater than last sequence number {}",
571+
snapshot.snapshot_id(),
572+
snapshot.sequence_number(),
573+
self.last_sequence_number
574+
),
575+
));
576+
}
577+
}
578+
561579
Ok(())
562580
}
563581

@@ -2015,6 +2033,89 @@ mod tests {
20152033
.contains("Snapshot for reference foo does not exist in the existing snapshots list"));
20162034
}
20172035

2036+
#[test]
2037+
fn test_v2_wrong_max_snapshot_sequence_number() {
2038+
let data = r#"
2039+
{
2040+
"format-version": 2,
2041+
"table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1",
2042+
"location": "s3://bucket/test/location",
2043+
"last-sequence-number": 1,
2044+
"last-updated-ms": 1602638573590,
2045+
"last-column-id": 3,
2046+
"current-schema-id": 0,
2047+
"schemas": [
2048+
{
2049+
"type": "struct",
2050+
"schema-id": 0,
2051+
"fields": [
2052+
{
2053+
"id": 1,
2054+
"name": "x",
2055+
"required": true,
2056+
"type": "long"
2057+
}
2058+
]
2059+
}
2060+
],
2061+
"default-spec-id": 0,
2062+
"partition-specs": [
2063+
{
2064+
"spec-id": 0,
2065+
"fields": []
2066+
}
2067+
],
2068+
"last-partition-id": 1000,
2069+
"default-sort-order-id": 0,
2070+
"sort-orders": [
2071+
{
2072+
"order-id": 0,
2073+
"fields": []
2074+
}
2075+
],
2076+
"properties": {},
2077+
"current-snapshot-id": 3055729675574597004,
2078+
"snapshots": [
2079+
{
2080+
"snapshot-id": 3055729675574597004,
2081+
"timestamp-ms": 1555100955770,
2082+
"sequence-number": 4,
2083+
"summary": {
2084+
"operation": "append"
2085+
},
2086+
"manifest-list": "s3://a/b/2.avro",
2087+
"schema-id": 0
2088+
}
2089+
],
2090+
"statistics": [],
2091+
"snapshot-log": [],
2092+
"metadata-log": []
2093+
}
2094+
"#;
2095+
2096+
let err = serde_json::from_str::<TableMetadata>(data).unwrap_err();
2097+
println!("{}", err);
2098+
assert!(err.to_string().contains(
2099+
"Invalid snapshot with id 3055729675574597004 and sequence number 4 greater than last sequence number 1"
2100+
));
2101+
2102+
// Change max sequence number to 4 - should work
2103+
let data = data.replace(
2104+
r#""last-sequence-number": 1,"#,
2105+
r#""last-sequence-number": 4,"#,
2106+
);
2107+
let metadata = serde_json::from_str::<TableMetadata>(data.as_str()).unwrap();
2108+
assert_eq!(metadata.last_sequence_number, 4);
2109+
2110+
// Change max sequence number to 5 - should work
2111+
let data = data.replace(
2112+
r#""last-sequence-number": 4,"#,
2113+
r#""last-sequence-number": 5,"#,
2114+
);
2115+
let metadata = serde_json::from_str::<TableMetadata>(data.as_str()).unwrap();
2116+
assert_eq!(metadata.last_sequence_number, 5);
2117+
}
2118+
20182119
#[test]
20192120
fn test_statistic_files() {
20202121
let data = r#"

0 commit comments

Comments
 (0)