Skip to content

Commit 921f389

Browse files
authored
refactor: Move equality-ids closer to the spec (#1705)
## Which issue does this PR close? Right now the equality-deletes can't be not-null, while the spec states that it should be null in the case of a manifest-entry that's not an equality delete: <img width="835" height="341" alt="image" src="https://github.com/user-attachments/assets/60a88f37-7c50-48b7-8878-ecfe4bd70509" /> ## What changes are included in this PR? <!-- Provide a summary of the modifications in this PR. List the main changes such as new features, bug fixes, refactoring, or any other updates. --> ## Are these changes tested? <!-- Specify what test covers (unit test, integration test, etc.). If tests are not included in your PR, please explain why (for example, are they covered by existing tests)? -->
1 parent cd7bfc0 commit 921f389

File tree

14 files changed

+51
-54
lines changed

14 files changed

+51
-54
lines changed

bindings/python/src/data_file.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ impl PyDataFile {
148148
}
149149

150150
#[getter]
151-
fn equality_ids(&self) -> &[i32] {
151+
fn equality_ids(&self) -> Option<Vec<i32>> {
152152
self.inner.equality_ids()
153153
}
154154

bindings/python/tests/test_manifest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,5 +138,5 @@ def test_read_manifest_entry(generated_manifest_entry_file: str) -> None:
138138
}
139139
assert data_file.key_metadata is None
140140
assert data_file.split_offsets == [4]
141-
assert data_file.equality_ids == []
141+
assert data_file.equality_ids is None
142142
assert data_file.sort_order_id == 0

crates/iceberg/src/arrow/caching_delete_file_loader.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,7 @@ impl CachingDeleteFileLoader {
233233
)
234234
.await?,
235235
sender,
236-
equality_ids: HashSet::from_iter(task.equality_ids.clone()),
236+
equality_ids: HashSet::from_iter(task.equality_ids.clone().unwrap()),
237237
})
238238
}
239239

crates/iceberg/src/arrow/delete_filter.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -311,21 +311,21 @@ pub(crate) mod tests {
311311
file_path: format!("{}/pos-del-1.parquet", table_location.to_str().unwrap()),
312312
file_type: DataContentType::PositionDeletes,
313313
partition_spec_id: 0,
314-
equality_ids: vec![],
314+
equality_ids: None,
315315
};
316316

317317
let pos_del_2 = FileScanTaskDeleteFile {
318318
file_path: format!("{}/pos-del-2.parquet", table_location.to_str().unwrap()),
319319
file_type: DataContentType::PositionDeletes,
320320
partition_spec_id: 0,
321-
equality_ids: vec![],
321+
equality_ids: None,
322322
};
323323

324324
let pos_del_3 = FileScanTaskDeleteFile {
325325
file_path: format!("{}/pos-del-3.parquet", table_location.to_str().unwrap()),
326326
file_type: DataContentType::PositionDeletes,
327327
partition_spec_id: 0,
328-
equality_ids: vec![],
328+
equality_ids: None,
329329
};
330330

331331
let file_scan_tasks = vec![

crates/iceberg/src/expr/visitors/expression_evaluator.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,7 @@ mod tests {
347347
upper_bounds: HashMap::new(),
348348
key_metadata: None,
349349
split_offsets: vec![],
350-
equality_ids: vec![],
350+
equality_ids: None,
351351
sort_order_id: None,
352352
partition_spec_id: 0,
353353
first_row_id: None,
@@ -375,7 +375,7 @@ mod tests {
375375
upper_bounds: HashMap::new(),
376376
key_metadata: None,
377377
split_offsets: vec![],
378-
equality_ids: vec![],
378+
equality_ids: None,
379379
sort_order_id: None,
380380
partition_spec_id: 0,
381381
first_row_id: None,

crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1996,7 +1996,7 @@ mod test {
19961996
upper_bounds: Default::default(),
19971997
key_metadata: None,
19981998
split_offsets: vec![],
1999-
equality_ids: vec![],
1999+
equality_ids: None,
20002000
sort_order_id: None,
20012001
partition_spec_id: 0,
20022002
first_row_id: None,
@@ -2022,7 +2022,7 @@ mod test {
20222022
upper_bounds: Default::default(),
20232023
key_metadata: None,
20242024
split_offsets: vec![],
2025-
equality_ids: vec![],
2025+
equality_ids: None,
20262026
sort_order_id: None,
20272027
partition_spec_id: 0,
20282028
first_row_id: None,
@@ -2084,7 +2084,7 @@ mod test {
20842084
column_sizes: Default::default(),
20852085
key_metadata: None,
20862086
split_offsets: vec![],
2087-
equality_ids: vec![],
2087+
equality_ids: None,
20882088
sort_order_id: None,
20892089
partition_spec_id: 0,
20902090
first_row_id: None,
@@ -2115,7 +2115,7 @@ mod test {
21152115
column_sizes: Default::default(),
21162116
key_metadata: None,
21172117
split_offsets: vec![],
2118-
equality_ids: vec![],
2118+
equality_ids: None,
21192119
sort_order_id: None,
21202120
partition_spec_id: 0,
21212121
first_row_id: None,
@@ -2147,7 +2147,7 @@ mod test {
21472147
column_sizes: Default::default(),
21482148
key_metadata: None,
21492149
split_offsets: vec![],
2150-
equality_ids: vec![],
2150+
equality_ids: None,
21512151
sort_order_id: None,
21522152
partition_spec_id: 0,
21532153
first_row_id: None,
@@ -2179,7 +2179,7 @@ mod test {
21792179
column_sizes: Default::default(),
21802180
key_metadata: None,
21812181
split_offsets: vec![],
2182-
equality_ids: vec![],
2182+
equality_ids: None,
21832183
sort_order_id: None,
21842184
partition_spec_id: 0,
21852185
first_row_id: None,

crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -579,7 +579,7 @@ mod test {
579579
column_sizes: Default::default(),
580580
key_metadata: None,
581581
split_offsets: vec![],
582-
equality_ids: vec![],
582+
equality_ids: None,
583583
sort_order_id: None,
584584
partition_spec_id: 0,
585585
first_row_id: None,
@@ -605,7 +605,7 @@ mod test {
605605
upper_bounds: Default::default(),
606606
key_metadata: None,
607607
split_offsets: vec![],
608-
equality_ids: vec![],
608+
equality_ids: None,
609609
sort_order_id: None,
610610
partition_spec_id: 0,
611611
first_row_id: None,
@@ -631,7 +631,7 @@ mod test {
631631
column_sizes: Default::default(),
632632
key_metadata: None,
633633
split_offsets: vec![],
634-
equality_ids: vec![],
634+
equality_ids: None,
635635
sort_order_id: None,
636636
partition_spec_id: 0,
637637
first_row_id: None,
@@ -658,7 +658,7 @@ mod test {
658658
column_sizes: Default::default(),
659659
key_metadata: None,
660660
split_offsets: vec![],
661-
equality_ids: vec![],
661+
equality_ids: None,
662662
sort_order_id: None,
663663
partition_spec_id: 0,
664664
first_row_id: None,

crates/iceberg/src/scan/task.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,6 @@ pub struct FileScanTaskDeleteFile {
112112
/// partition id
113113
pub partition_spec_id: i32,
114114

115-
/// equality ids for equality deletes (empty for positional deletes)
116-
pub equality_ids: Vec<i32>,
115+
/// equality ids for equality deletes (null for anything other than equality-deletes)
116+
pub equality_ids: Option<Vec<i32>>,
117117
}

crates/iceberg/src/spec/manifest/_serde.rs

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,6 @@ pub(super) struct DataFileSerde {
116116
upper_bounds: Option<Vec<BytesEntry>>,
117117
key_metadata: Option<serde_bytes::ByteBuf>,
118118
split_offsets: Option<Vec<i64>>,
119-
#[serde(default)]
120119
equality_ids: Option<Vec<i32>>,
121120
sort_order_id: Option<i32>,
122121
first_row_id: Option<i64>,
@@ -155,7 +154,7 @@ impl DataFileSerde {
155154
upper_bounds: Some(to_bytes_entry(value.upper_bounds)?),
156155
key_metadata: value.key_metadata.map(serde_bytes::ByteBuf::from),
157156
split_offsets: Some(value.split_offsets),
158-
equality_ids: Some(value.equality_ids),
157+
equality_ids: value.equality_ids,
159158
sort_order_id: value.sort_order_id,
160159
first_row_id: value.first_row_id,
161160
referenced_data_file: value.referenced_data_file,
@@ -224,7 +223,7 @@ impl DataFileSerde {
224223
.unwrap_or_default(),
225224
key_metadata: self.key_metadata.map(|v| v.to_vec()),
226225
split_offsets: self.split_offsets.unwrap_or_default(),
227-
equality_ids: self.equality_ids.unwrap_or_default(),
226+
equality_ids: self.equality_ids,
228227
sort_order_id: self.sort_order_id,
229228
partition_spec_id,
230229
first_row_id: self.first_row_id,
@@ -382,7 +381,7 @@ mod tests {
382381
upper_bounds: HashMap::from([(1,Datum::int(1)),(2,Datum::string("a")),(3,Datum::string("AC/DC"))]),
383382
key_metadata: None,
384383
split_offsets: vec![4],
385-
equality_ids: vec![],
384+
equality_ids: None,
386385
sort_order_id: Some(0),
387386
partition_spec_id: 0,
388387
first_row_id: None,
@@ -517,9 +516,8 @@ mod tests {
517516
"DataFileSerde should convert content 0 to DataContentType::Data"
518517
);
519518
assert_eq!(
520-
v2_entry.data_file.equality_ids,
521-
Vec::<i32>::new(),
522-
"DataFileSerde should convert None equality_ids to empty vec"
519+
v2_entry.data_file.equality_ids, None,
520+
"DataFileSerde should preserve None equality_ids as None"
523521
);
524522

525523
// Verify other fields are preserved during conversion
@@ -581,9 +579,8 @@ mod tests {
581579
"content 0 should convert to DataContentType::Data"
582580
);
583581
assert_eq!(
584-
data_file.equality_ids,
585-
Vec::<i32>::new(),
586-
"None equality_ids should convert to empty vec via unwrap_or_default()"
582+
data_file.equality_ids, None,
583+
"None equality_ids should remain as None"
587584
);
588585

589586
// Verify other fields are handled correctly during conversion

crates/iceberg/src/spec/manifest/data_file.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ pub struct DataFile {
135135
/// otherwise. Fields with ids listed in this column must be present
136136
/// in the delete file
137137
#[builder(default)]
138-
pub(crate) equality_ids: Vec<i32>,
138+
pub(crate) equality_ids: Option<Vec<i32>>,
139139
/// field id: 140
140140
///
141141
/// ID representing sort order for this file.
@@ -249,8 +249,8 @@ impl DataFile {
249249
/// Get the equality ids of the data file.
250250
/// Field ids used to determine row equality in equality delete files.
251251
/// null when content is not EqualityDeletes.
252-
pub fn equality_ids(&self) -> &[i32] {
253-
&self.equality_ids
252+
pub fn equality_ids(&self) -> Option<Vec<i32>> {
253+
self.equality_ids.clone()
254254
}
255255
/// Get the first row id in the data file.
256256
pub fn first_row_id(&self) -> Option<i64> {

0 commit comments

Comments
 (0)