
Commit 6c1a1b2

Handle stat collection for empty decimal columns (#2306)
# Rationale for this change

Empty Decimal columns fail to rescale (#2263): when every value in a decimal column is null, Parquet records no min/max statistics, so `statistics.min_raw` and `statistics.max_raw` are `None` and cannot be rescaled.

# Are these changes tested?

Yes. Linting and tests were run.

# Are there any user-facing changes?

No.

Closes #2263
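For context, the failure is straightforward to reproduce with pyarrow alone; the file and column names below are illustrative. Parquet stores no min/max statistics for an all-null column, so `min_raw`/`max_raw` come back as `None`, and unconditionally passing them to `unscaled_to_decimal` fails:

```python
import pyarrow as pa
import pyarrow.parquet as pq

# An all-null decimal column: Parquet records a null count but no min/max.
table = pa.table({"d": pa.array([None, None], pa.decimal128(8, 2))})
pq.write_table(table, "empty_decimal.parquet")

stats = pq.read_metadata("empty_decimal.parquet").row_group(0).column(0).statistics
print(stats.null_count)               # 2
print(stats.min_raw, stats.max_raw)   # None None -> rescaling these raises
```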
1 parent bd5b8ba commit 6c1a1b2

File tree

2 files changed: +25 −4 lines changed

pyiceberg/io/pyarrow.py (6 additions & 2 deletions)

```diff
@@ -2445,8 +2445,12 @@ def data_file_statistics_from_parquet_metadata(

         if isinstance(stats_col.iceberg_type, DecimalType) and statistics.physical_type != "FIXED_LEN_BYTE_ARRAY":
             scale = stats_col.iceberg_type.scale
-            col_aggs[field_id].update_min(unscaled_to_decimal(statistics.min_raw, scale))
-            col_aggs[field_id].update_max(unscaled_to_decimal(statistics.max_raw, scale))
+            col_aggs[field_id].update_min(
+                unscaled_to_decimal(statistics.min_raw, scale)
+            ) if statistics.min_raw is not None else None
+            col_aggs[field_id].update_max(
+                unscaled_to_decimal(statistics.max_raw, scale)
+            ) if statistics.max_raw is not None else None
         else:
             col_aggs[field_id].update_min(statistics.min)
             col_aggs[field_id].update_max(statistics.max)
```

tests/io/test_pyarrow_stats.py (19 additions & 2 deletions)

```diff
@@ -450,6 +450,9 @@ def construct_test_table_primitive_types() -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:
                 {"id": 13, "name": "decimal8", "required": False, "type": "decimal(5, 2)"},
                 {"id": 14, "name": "decimal16", "required": False, "type": "decimal(16, 6)"},
                 {"id": 15, "name": "decimal32", "required": False, "type": "decimal(19, 6)"},
+                {"id": 16, "name": "empty_decimal8", "required": False, "type": "decimal(5, 2)"},
+                {"id": 17, "name": "empty_decimal16", "required": False, "type": "decimal(16, 6)"},
+                {"id": 18, "name": "empty_decimal32", "required": False, "type": "decimal(19, 6)"},
             ],
         },
     ],
@@ -477,6 +480,9 @@ def construct_test_table_primitive_types() -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:
     decimal8 = pa.array([Decimal("123.45"), Decimal("678.91")], pa.decimal128(8, 2))
     decimal16 = pa.array([Decimal("12345679.123456"), Decimal("67891234.678912")], pa.decimal128(16, 6))
     decimal32 = pa.array([Decimal("1234567890123.123456"), Decimal("9876543210703.654321")], pa.decimal128(19, 6))
+    empty_decimal8 = pa.array([None, None], pa.decimal128(8, 2))
+    empty_decimal16 = pa.array([None, None], pa.decimal128(16, 6))
+    empty_decimal32 = pa.array([None, None], pa.decimal128(19, 6))

     table = pa.Table.from_pydict(
         {
@@ -495,6 +501,9 @@ def construct_test_table_primitive_types() -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:
             "decimal8": decimal8,
             "decimal16": decimal16,
             "decimal32": decimal32,
+            "empty_decimal8": empty_decimal8,
+            "empty_decimal16": empty_decimal16,
+            "empty_decimal32": empty_decimal32,
         },
         schema=arrow_schema,
     )
@@ -520,8 +529,14 @@ def test_metrics_primitive_types() -> None:
     )
     datafile = DataFile.from_args(**statistics.to_serialized_dict())

-    assert len(datafile.value_counts) == 15
-    assert len(datafile.null_value_counts) == 15
+    assert len(datafile.value_counts) == 18
+    assert len(datafile.null_value_counts) == 18
+    assert datafile.null_value_counts[13] != datafile.value_counts[13]
+    assert datafile.null_value_counts[14] != datafile.value_counts[14]
+    assert datafile.null_value_counts[15] != datafile.value_counts[15]
+    assert datafile.null_value_counts[16] == datafile.value_counts[16]
+    assert datafile.null_value_counts[17] == datafile.value_counts[17]
+    assert datafile.null_value_counts[18] == datafile.value_counts[18]
     assert len(datafile.nan_value_counts) == 0

     tz = timezone(timedelta(seconds=19800))
@@ -542,6 +557,7 @@ def test_metrics_primitive_types() -> None:
     assert datafile.lower_bounds[13][::-1].ljust(4, b"\x00") == STRUCT_INT32.pack(12345)
     assert datafile.lower_bounds[14][::-1].ljust(8, b"\x00") == STRUCT_INT64.pack(12345679123456)
     assert str(int.from_bytes(datafile.lower_bounds[15], byteorder="big", signed=True)).encode("utf-8") == b"1234567890123123456"
+    assert not any(key in datafile.lower_bounds.keys() for key in [16, 17, 18])

     assert len(datafile.upper_bounds) == 15
     assert datafile.upper_bounds[1] == STRUCT_BOOL.pack(True)
@@ -559,6 +575,7 @@ def test_metrics_primitive_types() -> None:
     assert datafile.upper_bounds[13][::-1].ljust(4, b"\x00") == STRUCT_INT32.pack(67891)
     assert datafile.upper_bounds[14][::-1].ljust(8, b"\x00") == STRUCT_INT64.pack(67891234678912)
     assert str(int.from_bytes(datafile.upper_bounds[15], byteorder="big", signed=True)).encode("utf-8") == b"9876543210703654321"
+    assert not any(key in datafile.upper_bounds.keys() for key in [16, 17, 18])


 def construct_test_table_invalid_upper_bound() -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:
```
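The new assertions capture the expected behavior for the empty columns (field ids 16 through 18): every value is null, so `null_value_counts` equals `value_counts`, and because `update_min`/`update_max` are never called for them, they get no entry in `lower_bounds` or `upper_bounds`. A minimal sketch of that aggregator behavior, with illustrative names rather than pyiceberg's internals:

```python
from decimal import Decimal
from typing import Dict, Optional

lower_bounds: Dict[int, Decimal] = {}

def update_min(field_id: int, value: Optional[Decimal]) -> None:
    # Mirrors the patched guard: an all-null column contributes no bound at all.
    if value is None:
        return
    if field_id not in lower_bounds or value < lower_bounds[field_id]:
        lower_bounds[field_id] = value

update_min(13, Decimal("123.45"))  # populated column records a lower bound
update_min(16, None)               # empty column: nothing recorded
assert 13 in lower_bounds and 16 not in lower_bounds
```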
