Skip to content

Commit 75cd77f

Browse files
Fix precision loss in large integral string conversions (#3405)
## Summary

Fixes precision loss when converting large integral strings in two runtime paths:

- `StringLiteral.to(IntegerType/LongType)`
- `partition_to_py(...)` for integral and time-based partition values backed by integers

## Root cause

Both paths were converting through `float` before converting to `int`, which loses precision for values outside the IEEE-754 exact integer range (magnitudes above 2^53 for a double). That caused valid 64-bit integers like `LongType.max` and `9007199254740993` to be corrupted.

## What changed

- Replaced `int(float(...))` with exact integer parsing in `partition_to_py`
- For `StringLiteral.to(IntegerType/LongType)`, exact integral strings now use exact integer parsing while fractional numeric strings retain the existing truncation behavior
- Added regression tests for `LongType.max` and `9007199254740993`

## Validation

- `uv run pytest tests/expressions/test_literals.py tests/test_conversions.py`

Closes #3404.
1 parent b7ca7be commit 75cd77f

4 files changed

Lines changed: 50 additions & 7 deletions

File tree

pyiceberg/conversions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ def _(primitive_type: PrimitiveType, value_str: str) -> int:
143143
_, _, exponent = Decimal(value_str).as_tuple()
144144
if exponent != 0: # Raise if there are digits to the right of the decimal
145145
raise ValueError(f"Cannot convert partition value, value cannot have fractional digits for {primitive_type} partition")
146-
return int(float(value_str))
146+
return int(value_str)
147147

148148

149149
@partition_to_py.register(FloatType)

pyiceberg/expressions/literals.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,13 @@
6868
UUID_BYTES_LENGTH = 16
6969

7070

71+
def _parse_numeric_string(value: str) -> Decimal:
72+
number = Decimal(value)
73+
if not number.is_finite():
74+
raise ValueError(f"Cannot convert non-finite numeric string: {value}")
75+
return number
76+
77+
7178
class Literal(IcebergRootModel[L], Generic[L], ABC): # type: ignore
7279
"""Literal which has a value and can be converted between types."""
7380

@@ -555,27 +562,27 @@ def _(self, _: StringType) -> Literal[str]:
555562
@to.register(IntegerType)
556563
def _(self, type_var: IntegerType) -> Literal[int]:
557564
try:
558-
number = int(float(self.value))
565+
number = _parse_numeric_string(self.value)
559566

560567
if IntegerType.max < number:
561568
return IntAboveMax()
562569
elif IntegerType.min > number:
563570
return IntBelowMin()
564-
return LongLiteral(number)
565-
except ValueError as e:
571+
return LongLiteral(int(number))
572+
except (ArithmeticError, OverflowError, ValueError) as e:
566573
raise ValueError(f"Could not convert {self.value} into a {type_var}") from e
567574

568575
@to.register(LongType)
569576
def _(self, type_var: LongType) -> Literal[int]:
570577
try:
571-
long_value = int(float(self.value))
578+
long_value = _parse_numeric_string(self.value)
572579
if LongType.max < long_value:
573580
return LongAboveMax()
574581
elif LongType.min > long_value:
575582
return LongBelowMin()
576583
else:
577-
return LongLiteral(long_value)
578-
except (TypeError, ValueError) as e:
584+
return LongLiteral(int(long_value))
585+
except (ArithmeticError, OverflowError, TypeError, ValueError) as e:
579586
raise ValueError(f"Could not convert {self.value} into a {type_var}") from e
580587

581588
@to.register(DateType)

tests/expressions/test_literals.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
IntAboveMax,
4040
IntBelowMin,
4141
Literal,
42+
LongAboveMax,
4243
LongLiteral,
4344
StringLiteral,
4445
TimeLiteral,
@@ -845,6 +846,38 @@ def test_string_to_int_min_value() -> None:
845846
assert isinstance(literal(str(IntegerType.min - 1)).to(IntegerType()), IntBelowMin)
846847

847848

849+
def test_string_to_long_max_value_without_precision_loss() -> None:
850+
assert literal(str(LongType.max)).to(LongType()) == literal(LongType.max)
851+
852+
853+
def test_string_to_long_large_integer_without_precision_loss() -> None:
854+
assert literal("9007199254740993").to(LongType()) == literal(9007199254740993)
855+
856+
857+
def test_string_to_long_decimal_like_integer_without_precision_loss() -> None:
858+
assert literal("9007199254740993.0").to(LongType()) == literal(9007199254740993)
859+
860+
861+
def test_string_to_long_scientific_notation_integer_without_precision_loss() -> None:
862+
assert literal("9007199254740993e0").to(LongType()) == literal(9007199254740993)
863+
864+
865+
def test_string_to_long_max_decimal_like_integer_without_precision_loss() -> None:
866+
assert literal(f"{LongType.max}.0").to(LongType()) == literal(LongType.max)
867+
868+
869+
def test_string_to_integer_scientific_notation_without_regression() -> None:
870+
assert literal("1e3").to(IntegerType()) == literal(1000)
871+
872+
873+
def test_string_to_integer_large_scientific_notation_above_max() -> None:
874+
assert isinstance(literal("1e1000000").to(IntegerType()), IntAboveMax)
875+
876+
877+
def test_string_to_long_large_scientific_notation_above_max() -> None:
878+
assert isinstance(literal("1e1000000").to(LongType()), LongAboveMax)
879+
880+
848881
def test_string_to_integer_type_invalid_value() -> None:
849882
with pytest.raises(ValueError) as e:
850883
_ = literal("abc").to(IntegerType())

tests/test_conversions.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,8 +160,11 @@ def test_unscaled_to_decimal(unscaled: int, scale: int, expected_result: Decimal
160160
(IntegerType(), "1", 1),
161161
(IntegerType(), "9999", 9999),
162162
(LongType(), "123456789", 123456789),
163+
(LongType(), "9007199254740993", 9007199254740993),
164+
(LongType(), str(LongType.max), LongType.max),
163165
(FloatType(), "1.1", 1.1),
164166
(DoubleType(), "99999.9", 99999.9),
167+
(TimestampNanoType(), "9007199254740993", 9007199254740993),
165168
(DecimalType(5, 2), "123.45", Decimal("123.45")),
166169
(StringType(), "foo", "foo"),
167170
(UUIDType(), "f79c3e09-677c-4bbd-a479-3f349cb785e7", uuid.UUID("f79c3e09-677c-4bbd-a479-3f349cb785e7")),

0 commit comments

Comments
 (0)