Commit 260ef54

Write small decimals as INTs (#1983)
# Rationale for this change

Resolves #1979

# Are these changes tested?

# Are there any user-facing changes?
1 parent 6867b2d commit 260ef54

File tree

2 files changed: +40 -1 lines changed


pyiceberg/io/pyarrow.py

Lines changed: 9 additions & 1 deletion
@@ -636,6 +636,12 @@ def visit_fixed(self, fixed_type: FixedType) -> pa.DataType:
         return pa.binary(len(fixed_type))

     def visit_decimal(self, decimal_type: DecimalType) -> pa.DataType:
+        # It looks like decimal{32,64} is not fully implemented:
+        # https://github.com/apache/arrow/issues/25483
+        # https://github.com/apache/arrow/issues/43956
+        # However, if we keep it as decimal128 in memory, then based on the
+        # precision/scale Arrow will map it to INT{32,64}:
+        # https://github.com/apache/arrow/blob/598938711a8376cbfdceaf5c77ab0fd5057e6c02/cpp/src/parquet/arrow/schema.cc#L380-L392
         return pa.decimal128(decimal_type.precision, decimal_type.scale)

     def visit_boolean(self, _: BooleanType) -> pa.DataType:
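For context on the comment above: with integer-backed decimal storage enabled, Arrow follows the Parquet format's rule that a DECIMAL(p, s) value fits in INT32 when p <= 9 and in INT64 when p <= 18. A minimal sketch of that rule (the helper name is hypothetical, not part of this commit):

def expected_physical_type(precision: int) -> str:
    # Hypothetical helper mirroring the precision-based mapping in Arrow's
    # schema.cc: up to 9 decimal digits fit in a signed 32-bit integer, up to
    # 18 in a signed 64-bit integer; anything larger stays fixed-length binary.
    if precision <= 9:
        return "INT32"
    if precision <= 18:
        return "INT64"
    return "FIXED_LEN_BYTE_ARRAY"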
@@ -2442,7 +2448,9 @@ def write_parquet(task: WriteTask) -> DataFile:
         )
         fo = io.new_output(file_path)
         with fo.create(overwrite=True) as fos:
-            with pq.ParquetWriter(fos, schema=arrow_table.schema, **parquet_writer_kwargs) as writer:
+            with pq.ParquetWriter(
+                fos, schema=arrow_table.schema, store_decimal_as_integer=True, **parquet_writer_kwargs
+            ) as writer:
                 writer.write(arrow_table, row_group_size=row_group_size)
         statistics = data_file_statistics_from_parquet_metadata(
             parquet_metadata=writer.writer.metadata,
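The store_decimal_as_integer flag passed to pq.ParquetWriter here can be tried with pyarrow alone; a minimal sketch, assuming a pyarrow version that supports the flag (the file path is illustrative):

from decimal import Decimal

import pyarrow as pa
import pyarrow.parquet as pq

# A DECIMAL(8, 2) column; precision 8 fits in a 32-bit integer.
table = pa.table({"d": pa.array([Decimal("123.45")], pa.decimal128(8, 2))})
pq.write_table(table, "/tmp/decimals.parquet", store_decimal_as_integer=True)

# The on-disk physical type is INT32 rather than FIXED_LEN_BYTE_ARRAY.
metadata = pq.ParquetFile("/tmp/decimals.parquet").metadata
print(metadata.schema.column(0).physical_type)  # INT32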

tests/integration/test_writes/test_writes.py

Lines changed: 31 additions & 0 deletions
@@ -20,6 +20,7 @@
 import random
 import time
 from datetime import date, datetime, timedelta
+from decimal import Decimal
 from pathlib import Path
 from typing import Any, Dict
 from urllib.parse import urlparse
@@ -50,6 +51,7 @@
 from pyiceberg.transforms import DayTransform, HourTransform, IdentityTransform
 from pyiceberg.types import (
     DateType,
+    DecimalType,
     DoubleType,
     IntegerType,
     ListType,
@@ -1810,3 +1812,32 @@ def test_evolve_and_write(
     )

     assert session_catalog.load_table(identifier).scan().to_arrow().column(0).combine_chunks() == numbers
+
+
+@pytest.mark.integration
+def test_read_write_decimals(session_catalog: Catalog) -> None:
+    """Roundtrip decimal types to make sure that we correctly write them as ints"""
+    identifier = "default.test_read_write_decimals"
+
+    arrow_table = pa.Table.from_pydict(
+        {
+            "decimal8": pa.array([Decimal("123.45"), Decimal("678.91")], pa.decimal128(8, 2)),
+            "decimal16": pa.array([Decimal("12345679.123456"), Decimal("67891234.678912")], pa.decimal128(16, 6)),
+            "decimal19": pa.array([Decimal("1234567890123.123456"), Decimal("9876543210703.654321")], pa.decimal128(19, 6)),
+        },
+    )
+
+    tbl = _create_table(
+        session_catalog,
+        identifier,
+        properties={"format-version": 2},
+        schema=Schema(
+            NestedField(1, "decimal8", DecimalType(8, 2)),
+            NestedField(2, "decimal16", DecimalType(16, 6)),
+            NestedField(3, "decimal19", DecimalType(19, 6)),
+        ),
+    )
+
+    tbl.append(arrow_table)
+
+    assert tbl.scan().to_arrow() == arrow_table
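The final assertion holds because the integer encoding is purely physical: the Parquet column keeps its DECIMAL logical annotation, so readers restore decimal values with the original precision and scale. A quick pyarrow-level check, continuing the sketch above:

# Reading back the file from the earlier sketch: values come back as
# decimal128(8, 2) even though they were stored as INT32 on disk.
roundtripped = pq.read_table("/tmp/decimals.parquet")
assert roundtripped.column("d").type == pa.decimal128(8, 2)
assert roundtripped.column("d")[0].as_py() == Decimal("123.45")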
