Skip to content

Commit 45a2bd2

Browse files
committed
fix: workaround for Add actions being read slightly differently out of parquet files
This workaround fixes #3030 but I'm not quite happy about how. I believe there is likely an upstream issue in arrow that I have not been able to reproduce where the nullable struct (deletionVector) which has non-nullable members (e.g. storageType) that are being read as empty strings rather than `null`. This issue started to appear with the v0.22 release which included an upgrade to datafusion 43 and arrow 53. I believe the latter is causing the issue here since it affects non-datafusion code paths. This workaround at least allows us to move forward and release a v0.22.1 while continuing to hunt down the issue. Signed-off-by: R. Tyler Croy <[email protected]>
1 parent d063b48 commit 45a2bd2

File tree

4 files changed

+89
-9
lines changed

4 files changed

+89
-9
lines changed

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ debug = "line-tables-only"
2727

2828
[workspace.dependencies]
2929
delta_kernel = { version = "0.4.1", features = ["sync-engine"] }
30-
# delta_kernel = { path = "../delta-kernel-rs/kernel", version = "0.3.0" }
30+
#delta_kernel = { path = "../delta-kernel-rs/kernel", features = ["sync-engine"] }
3131

3232
# arrow
3333
arrow = { version = "53" }

crates/core/src/kernel/snapshot/log_data.rs

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -245,14 +245,23 @@ impl LogicalFile<'_> {
245245

246246
/// Defines a deletion vector
247247
pub fn deletion_vector(&self) -> Option<DeletionVectorView<'_>> {
248-
self.deletion_vector.as_ref().and_then(|arr| {
249-
arr.storage_type
250-
.is_valid(self.index)
251-
.then_some(DeletionVectorView {
248+
if let Some(arr) = self.deletion_vector.as_ref() {
249+
// With v0.22 and the upgrade to a more recent arrow. Reading nullable structs with
250+
// non-nullable entries back out of parquet is resulting in the DeletionVector having
251+
// an empty string rather than a null. The addition check on the value ensures that a
252+
// [DeletionVectorView] is not created in this scenario
253+
//
254+
// <https://github.com/delta-io/delta-rs/issues/3030>
255+
if arr.storage_type.is_valid(self.index)
256+
&& !arr.storage_type.value(self.index).is_empty()
257+
{
258+
return Some(DeletionVectorView {
252259
data: arr,
253260
index: self.index,
254-
})
255-
})
261+
});
262+
}
263+
}
264+
None
256265
}
257266

258267
/// The number of records stored in the data file.
@@ -509,7 +518,7 @@ mod datafusion {
509518
fn collect_count(&self, name: &str) -> Precision<usize> {
510519
let num_records = extract_and_cast_opt::<Int64Array>(self.stats, name);
511520
if let Some(num_records) = num_records {
512-
if num_records.len() == 0 {
521+
if num_records.is_empty() {
513522
Precision::Exact(0)
514523
} else if let Some(null_count_mulls) = num_records.nulls() {
515524
if null_count_mulls.null_count() > 0 {

crates/core/src/protocol/checkpoints.rs

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -567,6 +567,7 @@ mod tests {
567567
use crate::operations::DeltaOps;
568568
use crate::protocol::Metadata;
569569
use crate::writer::test_utils::get_delta_schema;
570+
use crate::DeltaResult;
570571

571572
#[tokio::test]
572573
async fn test_create_checkpoint_for() {
@@ -1102,7 +1103,7 @@ mod tests {
11021103

11031104
#[ignore = "This test is only useful if the batch size has been made small"]
11041105
#[tokio::test]
1105-
async fn test_checkpoint_large_table() -> crate::DeltaResult<()> {
1106+
async fn test_checkpoint_large_table() -> DeltaResult<()> {
11061107
use crate::writer::test_utils::get_arrow_schema;
11071108

11081109
let table_schema = get_delta_schema();
@@ -1160,4 +1161,44 @@ mod tests {
11601161
);
11611162
Ok(())
11621163
}
1164+
1165+
/// <https://github.com/delta-io/delta-rs/issues/3030>
1166+
#[tokio::test]
1167+
async fn test_create_checkpoint_overwrite() -> DeltaResult<()> {
1168+
use crate::protocol::SaveMode;
1169+
use crate::writer::test_utils::get_arrow_schema;
1170+
1171+
let batch = RecordBatch::try_new(
1172+
Arc::clone(&get_arrow_schema(&None)),
1173+
vec![
1174+
Arc::new(arrow::array::StringArray::from(vec!["C"])),
1175+
Arc::new(arrow::array::Int32Array::from(vec![30])),
1176+
Arc::new(arrow::array::StringArray::from(vec!["2021-02-03"])),
1177+
],
1178+
)
1179+
.unwrap();
1180+
let table = DeltaOps::try_from_uri_with_storage_options("memory://", HashMap::default())
1181+
.await?
1182+
.write(vec![batch])
1183+
.await?;
1184+
assert_eq!(table.version(), 0);
1185+
1186+
create_checkpoint_for(0, table.snapshot().unwrap(), table.log_store.as_ref()).await?;
1187+
1188+
let batch = RecordBatch::try_new(
1189+
Arc::clone(&get_arrow_schema(&None)),
1190+
vec![
1191+
Arc::new(arrow::array::StringArray::from(vec!["A"])),
1192+
Arc::new(arrow::array::Int32Array::from(vec![0])),
1193+
Arc::new(arrow::array::StringArray::from(vec!["2021-02-02"])),
1194+
],
1195+
)
1196+
.unwrap();
1197+
let table = DeltaOps(table)
1198+
.write(vec![batch])
1199+
.with_save_mode(SaveMode::Overwrite)
1200+
.await?;
1201+
assert_eq!(table.version(), 1);
1202+
Ok(())
1203+
}
11631204
}

python/tests/test_checkpoint.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -468,3 +468,33 @@ def test_checkpoint_with_nullable_false(tmp_path: pathlib.Path):
468468
assert checkpoint_path.exists()
469469

470470
assert DeltaTable(str(tmp_table_path)).to_pyarrow_table() == data
471+
472+
473+
@pytest.mark.pandas
474+
def test_checkpoint_with_multiple_writes(tmp_path: pathlib.Path):
475+
import pandas as pd
476+
477+
write_deltalake(
478+
tmp_path,
479+
pd.DataFrame(
480+
{
481+
"a": ["a"],
482+
"b": [3],
483+
}
484+
),
485+
)
486+
DeltaTable(tmp_path).create_checkpoint()
487+
488+
dt = DeltaTable(tmp_path)
489+
print(dt.to_pandas())
490+
491+
write_deltalake(
492+
tmp_path,
493+
pd.DataFrame(
494+
{
495+
"a": ["a"],
496+
"b": [100],
497+
}
498+
),
499+
mode="overwrite",
500+
)

0 commit comments

Comments
 (0)