|
72 | 72 | expression_to_plain_format,
|
73 | 73 | rewrite_not,
|
74 | 74 | rewrite_to_dnf,
|
| 75 | + translate_column_names, |
75 | 76 | visit,
|
76 | 77 | visit_bound_predicate,
|
77 | 78 | )
|
78 | 79 | from pyiceberg.manifest import ManifestFile, PartitionFieldSummary
|
79 | 80 | from pyiceberg.schema import Accessor, Schema
|
80 | 81 | from pyiceberg.typedef import Record
|
81 | 82 | from pyiceberg.types import (
|
| 83 | + BooleanType, |
82 | 84 | DoubleType,
|
83 | 85 | FloatType,
|
84 | 86 | IcebergType,
|
@@ -1623,3 +1625,282 @@ def test_expression_evaluator_null() -> None:
|
1623 | 1625 | assert expression_evaluator(schema, LessThan("a", 1), case_sensitive=True)(struct) is False
|
1624 | 1626 | assert expression_evaluator(schema, StartsWith("a", 1), case_sensitive=True)(struct) is False
|
1625 | 1627 | assert expression_evaluator(schema, NotStartsWith("a", 1), case_sensitive=True)(struct) is True
|
| 1628 | + |
| 1629 | + |
| 1630 | +def test_translate_column_names_simple_case(table_schema_simple: Schema) -> None: |
| 1631 | + """Test translate_column_names with matching column names.""" |
| 1632 | + # Create a bound expression using the original schema |
| 1633 | + unbound_expr = EqualTo("foo", "test_value") |
| 1634 | + bound_expr = visit(unbound_expr, visitor=BindVisitor(schema=table_schema_simple, case_sensitive=True)) |
| 1635 | + |
| 1636 | + # File schema has the same column names |
| 1637 | + file_schema = Schema( |
| 1638 | + NestedField(field_id=1, name="foo", field_type=StringType(), required=False), |
| 1639 | + NestedField(field_id=2, name="bar", field_type=IntegerType(), required=True), |
| 1640 | + NestedField(field_id=3, name="baz", field_type=BooleanType(), required=False), |
| 1641 | + schema_id=1, |
| 1642 | + ) |
| 1643 | + |
| 1644 | + # Translate column names |
| 1645 | + translated_expr = translate_column_names(bound_expr, file_schema, case_sensitive=True) |
| 1646 | + |
| 1647 | + # Should return an unbound expression with the same column name since they match |
| 1648 | + assert isinstance(translated_expr, EqualTo) |
| 1649 | + assert translated_expr.term == Reference("foo") |
| 1650 | + assert translated_expr.literal == literal("test_value") |
| 1651 | + |
| 1652 | + |
| 1653 | +def test_translate_column_names_different_column_names() -> None: |
| 1654 | + """Test translate_column_names with different column names in file schema.""" |
| 1655 | + # Original schema |
| 1656 | + original_schema = Schema( |
| 1657 | + NestedField(field_id=1, name="original_name", field_type=StringType(), required=False), |
| 1658 | + schema_id=1, |
| 1659 | + ) |
| 1660 | + |
| 1661 | + # Create bound expression |
| 1662 | + unbound_expr = EqualTo("original_name", "test_value") |
| 1663 | + bound_expr = visit(unbound_expr, visitor=BindVisitor(schema=original_schema, case_sensitive=True)) |
| 1664 | + |
| 1665 | + # File schema has different column name but same field ID |
| 1666 | + file_schema = Schema( |
| 1667 | + NestedField(field_id=1, name="file_column_name", field_type=StringType(), required=False), |
| 1668 | + schema_id=1, |
| 1669 | + ) |
| 1670 | + |
| 1671 | + # Translate column names |
| 1672 | + translated_expr = translate_column_names(bound_expr, file_schema, case_sensitive=True) |
| 1673 | + |
| 1674 | + # Should use the file schema's column name |
| 1675 | + assert isinstance(translated_expr, EqualTo) |
| 1676 | + assert translated_expr.term == Reference("file_column_name") |
| 1677 | + assert translated_expr.literal == literal("test_value") |
| 1678 | + |
| 1679 | + |
| 1680 | +def test_translate_column_names_missing_column() -> None: |
| 1681 | + """Test translate_column_names when column is missing from file schema (such as in schema evolution).""" |
| 1682 | + # Original schema |
| 1683 | + original_schema = Schema( |
| 1684 | + NestedField(field_id=1, name="existing_col", field_type=StringType(), required=False), |
| 1685 | + NestedField(field_id=2, name="missing_col", field_type=IntegerType(), required=False), |
| 1686 | + schema_id=1, |
| 1687 | + ) |
| 1688 | + |
| 1689 | + # Create bound expression for the missing column |
| 1690 | + unbound_expr = EqualTo("missing_col", 42) |
| 1691 | + bound_expr = visit(unbound_expr, visitor=BindVisitor(schema=original_schema, case_sensitive=True)) |
| 1692 | + |
| 1693 | + # File schema only has the existing column (field_id=1), missing field_id=2 |
| 1694 | + file_schema = Schema( |
| 1695 | + NestedField(field_id=1, name="existing_col", field_type=StringType(), required=False), |
| 1696 | + schema_id=1, |
| 1697 | + ) |
| 1698 | + |
| 1699 | + # Translate column names |
| 1700 | + translated_expr = translate_column_names(bound_expr, file_schema, case_sensitive=True) |
| 1701 | + |
| 1702 | + # missing_col's default initial_default (None) does not match the expression literal (42) |
| 1703 | + assert translated_expr == AlwaysFalse() |
| 1704 | + |
| 1705 | + |
| 1706 | +def test_translate_column_names_missing_column_match_null() -> None: |
| 1707 | + """Test translate_column_names when missing column matches null.""" |
| 1708 | + # Original schema |
| 1709 | + original_schema = Schema( |
| 1710 | + NestedField(field_id=1, name="existing_col", field_type=StringType(), required=False), |
| 1711 | + NestedField(field_id=2, name="missing_col", field_type=IntegerType(), required=False), |
| 1712 | + schema_id=1, |
| 1713 | + ) |
| 1714 | + |
| 1715 | + # Create bound expression for the missing column |
| 1716 | + unbound_expr = IsNull("missing_col") |
| 1717 | + bound_expr = visit(unbound_expr, visitor=BindVisitor(schema=original_schema, case_sensitive=True)) |
| 1718 | + |
| 1719 | + # File schema only has the existing column (field_id=1), missing field_id=2 |
| 1720 | + file_schema = Schema( |
| 1721 | + NestedField(field_id=1, name="existing_col", field_type=StringType(), required=False), |
| 1722 | + schema_id=1, |
| 1723 | + ) |
| 1724 | + |
| 1725 | + # Translate column names |
| 1726 | + translated_expr = translate_column_names(bound_expr, file_schema, case_sensitive=True) |
| 1727 | + |
| 1728 | + # Should evaluate to AlwaysTrue because the missing column is treated as null |
| 1729 | + # missing_col's default initial_default (None) satisfies the IsNull predicate |
| 1730 | + assert translated_expr == AlwaysTrue() |
| 1731 | + |
| 1732 | + |
| 1733 | +def test_translate_column_names_missing_column_with_initial_default() -> None: |
| 1734 | + """Test translate_column_names when missing column's initial_default matches expression.""" |
| 1735 | + # Original schema |
| 1736 | + original_schema = Schema( |
| 1737 | + NestedField(field_id=1, name="existing_col", field_type=StringType(), required=False), |
| 1738 | + NestedField(field_id=2, name="missing_col", field_type=IntegerType(), required=False, initial_default=42), |
| 1739 | + schema_id=1, |
| 1740 | + ) |
| 1741 | + |
| 1742 | + # Create bound expression for the missing column |
| 1743 | + unbound_expr = EqualTo("missing_col", 42) |
| 1744 | + bound_expr = visit(unbound_expr, visitor=BindVisitor(schema=original_schema, case_sensitive=True)) |
| 1745 | + |
| 1746 | + # File schema only has the existing column (field_id=1), missing field_id=2 |
| 1747 | + file_schema = Schema( |
| 1748 | + NestedField(field_id=1, name="existing_col", field_type=StringType(), required=False), |
| 1749 | + schema_id=1, |
| 1750 | + ) |
| 1751 | + |
| 1752 | + # Translate column names |
| 1753 | + translated_expr = translate_column_names(bound_expr, file_schema, case_sensitive=True) |
| 1754 | + |
| 1755 | + # Should evaluate to AlwaysTrue because the initial_default value (42) matches the literal (42) |
| 1756 | + assert translated_expr == AlwaysTrue() |
| 1757 | + |
| 1758 | + |
| 1759 | +def test_translate_column_names_missing_column_with_initial_default_mismatch() -> None: |
| 1760 | + """Test translate_column_names when missing column's initial_default doesn't match expression.""" |
| 1761 | + # Original schema |
| 1762 | + original_schema = Schema( |
| 1763 | + NestedField(field_id=2, name="missing_col", field_type=IntegerType(), required=False, initial_default=10), |
| 1764 | + schema_id=1, |
| 1765 | + ) |
| 1766 | + |
| 1767 | + # Create bound expression that won't match the default value |
| 1768 | + unbound_expr = EqualTo("missing_col", 42) |
| 1769 | + bound_expr = visit(unbound_expr, visitor=BindVisitor(schema=original_schema, case_sensitive=True)) |
| 1770 | + |
| 1771 | + # File schema doesn't have this column |
| 1772 | + file_schema = Schema( |
| 1773 | + NestedField(field_id=1, name="other_col", field_type=StringType(), required=False), |
| 1774 | + schema_id=1, |
| 1775 | + ) |
| 1776 | + |
| 1777 | + # Translate column names |
| 1778 | + translated_expr = translate_column_names(bound_expr, file_schema, case_sensitive=True) |
| 1779 | + |
| 1780 | + # Should evaluate to AlwaysFalse because initial_default value (10) doesn't match literal (42) |
| 1781 | + assert translated_expr == AlwaysFalse() |
| 1782 | + |
| 1783 | + |
| 1784 | +def test_translate_column_names_missing_column_with_projected_field_matches() -> None: |
| 1785 | + """Test translate_column_names with projected field value that matches expression.""" |
| 1786 | + # Original schema with a field that has no initial_default (defaults to None) |
| 1787 | + original_schema = Schema( |
| 1788 | + NestedField(field_id=1, name="existing_col", field_type=StringType(), required=False), |
| 1789 | + NestedField(field_id=2, name="missing_col", field_type=IntegerType(), required=False), |
| 1790 | + schema_id=1, |
| 1791 | + ) |
| 1792 | + |
| 1793 | + # Create bound expression for the missing column |
| 1794 | + unbound_expr = EqualTo("missing_col", 42) |
| 1795 | + bound_expr = visit(unbound_expr, visitor=BindVisitor(schema=original_schema, case_sensitive=True)) |
| 1796 | + |
| 1797 | + # File schema only has the existing column (field_id=1), missing field_id=2 |
| 1798 | + file_schema = Schema( |
| 1799 | + NestedField(field_id=1, name="existing_col", field_type=StringType(), required=False), |
| 1800 | + schema_id=1, |
| 1801 | + ) |
| 1802 | + |
| 1803 | + # Projected column that is missing in the file schema |
| 1804 | + projected_field_values = {"missing_col": 42} |
| 1805 | + |
| 1806 | + # Translate column names |
| 1807 | + translated_expr = translate_column_names( |
| 1808 | + bound_expr, file_schema, case_sensitive=True, projected_field_values=projected_field_values |
| 1809 | + ) |
| 1810 | + |
| 1811 | + # Should evaluate to AlwaysTrue since projected field value matches the expression literal |
| 1812 | + # even though the field is missing in the file schema |
| 1813 | + assert translated_expr == AlwaysTrue() |
| 1814 | + |
| 1815 | + |
| 1816 | +def test_translate_column_names_missing_column_with_projected_field_mismatch() -> None: |
| 1817 | + """Test translate_column_names with projected field value that doesn't match expression.""" |
| 1818 | + # Original schema with a field that has no initial_default (defaults to None) |
| 1819 | + original_schema = Schema( |
| 1820 | + NestedField(field_id=1, name="existing_col", field_type=StringType(), required=False), |
| 1821 | + NestedField(field_id=2, name="missing_col", field_type=IntegerType(), required=False), |
| 1822 | + schema_id=1, |
| 1823 | + ) |
| 1824 | + |
| 1825 | + # Create bound expression for the missing column |
| 1826 | + unbound_expr = EqualTo("missing_col", 42) |
| 1827 | + bound_expr = visit(unbound_expr, visitor=BindVisitor(schema=original_schema, case_sensitive=True)) |
| 1828 | + |
| 1829 | + # File schema only has the existing column (field_id=1), missing field_id=2 |
| 1830 | + file_schema = Schema( |
| 1831 | + NestedField(field_id=1, name="existing_col", field_type=StringType(), required=False), |
| 1832 | + schema_id=1, |
| 1833 | + ) |
| 1834 | + |
| 1835 | + # Projected column that is missing in the file schema |
| 1836 | + projected_field_values = {"missing_col": 1} |
| 1837 | + |
| 1838 | + # Translate column names |
| 1839 | + translated_expr = translate_column_names( |
| 1840 | + bound_expr, file_schema, case_sensitive=True, projected_field_values=projected_field_values |
| 1841 | + ) |
| 1842 | + |
| 1843 | + # Should evaluate to AlwaysFalse since projected field value does not match the expression literal |
| 1844 | + assert translated_expr == AlwaysFalse() |
| 1845 | + |
| 1846 | + |
| 1847 | +def test_translate_column_names_missing_column_projected_field_fallbacks_to_initial_default() -> None: |
| 1848 | + """Test translate_column_names when projected field value doesn't match but initial_default does.""" |
| 1849 | + # Original schema with a field that has an initial_default |
| 1850 | + original_schema = Schema( |
| 1851 | + NestedField(field_id=1, name="existing_col", field_type=StringType(), required=False), |
| 1852 | + NestedField(field_id=2, name="missing_col", field_type=IntegerType(), required=False, initial_default=42), |
| 1853 | + schema_id=1, |
| 1854 | + ) |
| 1855 | + |
| 1856 | + # Create bound expression for the missing column that would match initial_default |
| 1857 | + unbound_expr = EqualTo("missing_col", 42) |
| 1858 | + bound_expr = visit(unbound_expr, visitor=BindVisitor(schema=original_schema, case_sensitive=True)) |
| 1859 | + |
| 1860 | + # File schema only has the existing column (field_id=1), missing field_id=2 |
| 1861 | + file_schema = Schema( |
| 1862 | + NestedField(field_id=1, name="existing_col", field_type=StringType(), required=False), |
| 1863 | + schema_id=1, |
| 1864 | + ) |
| 1865 | + |
| 1866 | + # Projected field value that differs from both the expression literal and initial_default |
| 1867 | + projected_field_values = {"missing_col": 10} # This doesn't match expression literal (42) |
| 1868 | + |
| 1869 | + # Translate column names |
| 1870 | + translated_expr = translate_column_names( |
| 1871 | + bound_expr, file_schema, case_sensitive=True, projected_field_values=projected_field_values |
| 1872 | + ) |
| 1873 | + |
| 1874 | + # Should evaluate to AlwaysTrue since projected field value doesn't match but initial_default does |
| 1875 | + assert translated_expr == AlwaysTrue() |
| 1876 | + |
| 1877 | + |
| 1878 | +def test_translate_column_names_missing_column_projected_field_matches_initial_default_mismatch() -> None: |
| 1879 | + """Test translate_column_names when both projected field value and initial_default doesn't match.""" |
| 1880 | + # Original schema with a field that has an initial_default that doesn't match the expression |
| 1881 | + original_schema = Schema( |
| 1882 | + NestedField(field_id=1, name="existing_col", field_type=StringType(), required=False), |
| 1883 | + NestedField(field_id=2, name="missing_col", field_type=IntegerType(), required=False, initial_default=10), |
| 1884 | + schema_id=1, |
| 1885 | + ) |
| 1886 | + |
| 1887 | + # Create bound expression for the missing column |
| 1888 | + unbound_expr = EqualTo("missing_col", 42) |
| 1889 | + bound_expr = visit(unbound_expr, visitor=BindVisitor(schema=original_schema, case_sensitive=True)) |
| 1890 | + |
| 1891 | + # File schema only has the existing column (field_id=1), missing field_id=2 |
| 1892 | + file_schema = Schema( |
| 1893 | + NestedField(field_id=1, name="existing_col", field_type=StringType(), required=False), |
| 1894 | + schema_id=1, |
| 1895 | + ) |
| 1896 | + |
| 1897 | + # Projected field value that matches the expression literal |
| 1898 | + projected_field_values = {"missing_col": 10} # This doesn't match expression literal (42) |
| 1899 | + |
| 1900 | + # Translate column names |
| 1901 | + translated_expr = translate_column_names( |
| 1902 | + bound_expr, file_schema, case_sensitive=True, projected_field_values=projected_field_values |
| 1903 | + ) |
| 1904 | + |
| 1905 | + # Should evaluate to AlwaysFalse since both projected field value and initial_default does not match |
| 1906 | + assert translated_expr == AlwaysFalse() |
0 commit comments