
Commit 0a96b52

Merge branch 'apache:main' into hive-4-support
2 parents: f97f978 + 4cac691

27 files changed: +1680 −682 lines

dev/docker-compose-integration.yml

Lines changed: 1 addition & 0 deletions
@@ -53,6 +53,7 @@ services:
       - CATALOG_WAREHOUSE=s3://warehouse/
       - CATALOG_IO__IMPL=org.apache.iceberg.aws.s3.S3FileIO
       - CATALOG_S3_ENDPOINT=http://minio:9000
+      - CATALOG_JDBC_STRICT__MODE=true
   minio:
     image: minio/minio
     container_name: pyiceberg-minio
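
The new `CATALOG_JDBC_STRICT__MODE` variable follows the `iceberg-rest` fixture image's convention of encoding catalog properties as environment variables. A minimal sketch of that mapping, assuming the usual rule that `__` becomes `-` and `_` becomes `.`:

```python
# Sketch of the assumed CATALOG_* -> catalog-property mapping used by the
# iceberg-rest image: "__" maps to "-", "_" maps to ".", names are lowercased.
def env_to_property(env_var: str) -> str:
    key = env_var.removeprefix("CATALOG_")
    return key.replace("__", "-").replace("_", ".").lower()

assert env_to_property("CATALOG_S3_ENDPOINT") == "s3.endpoint"
assert env_to_property("CATALOG_JDBC_STRICT__MODE") == "jdbc.strict-mode"
```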

mkdocs/docs/api.md

Lines changed: 46 additions & 0 deletions
@@ -1523,6 +1523,52 @@ print(ray_dataset.take(2))
 ]
 ```
 
+### Bodo
+
+PyIceberg interfaces closely with Bodo Dataframes (see [Bodo Iceberg Quick Start](https://docs.bodo.ai/latest/quick_start/quickstart_local_iceberg/)),
+which provides a drop-in replacement for Pandas that applies query, compiler, and HPC optimizations automatically.
+Bodo accelerates and scales Python code from a single laptop to large clusters without code rewrites.
+
+<!-- prettier-ignore-start -->
+
+!!! note "Requirements"
+    This requires [`bodo` to be installed](index.md).
+
+    ```shell
+    pip install 'pyiceberg[bodo]'
+    ```
+
+<!-- prettier-ignore-end -->
+
+A table can easily be read into a Bodo Dataframe to perform Pandas operations:
+
+```python
+df = table.to_bodo()  # equivalent to `bodo.pandas.read_iceberg_table(table)`
+df = df[df["trip_distance"] >= 10.0]
+df = df[["VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime"]]
+print(df)
+```
+
+This creates a lazy query, optimizes it, and runs it on all available cores (`print` triggers execution):
+
+```text
+        VendorID tpep_pickup_datetime tpep_dropoff_datetime
+0              2  2023-01-01 00:27:12   2023-01-01 00:49:56
+1              2  2023-01-01 00:09:29   2023-01-01 00:29:23
+2              1  2023-01-01 00:13:30   2023-01-01 00:44:00
+3              2  2023-01-01 00:41:41   2023-01-01 01:19:32
+4              2  2023-01-01 00:22:39   2023-01-01 01:30:45
+...          ...                  ...                   ...
+245478         2  2023-01-31 22:32:57   2023-01-31 23:01:48
+245479         2  2023-01-31 22:03:26   2023-01-31 22:46:13
+245480         2  2023-01-31 23:25:56   2023-02-01 00:05:42
+245481         2  2023-01-31 23:18:00   2023-01-31 23:46:00
+245482         2  2023-01-31 23:18:00   2023-01-31 23:41:00
+
+[245483 rows x 3 columns]
+```
+
+Bodo is optimized to take advantage of Iceberg features such as hidden partitioning and various statistics for efficient reads.
+
 ### Daft
 
 PyIceberg interfaces closely with Daft Dataframes (see also: [Daft integration with Iceberg](https://docs.daft.ai/en/stable/io/iceberg/)), which provides a fully lazy, optimized query engine interface on top of PyIceberg tables.

mkdocs/docs/index.md

Lines changed: 1 addition & 0 deletions
@@ -52,6 +52,7 @@ You can mix and match optional dependencies depending on your needs:
 | pandas | Installs both PyArrow and Pandas |
 | duckdb | Installs both PyArrow and DuckDB |
 | ray    | Installs PyArrow, Pandas, and Ray |
+| bodo   | Installs Bodo |
 | daft   | Installs Daft |
 | polars | Installs Polars |
 | s3fs   | S3FS as a FileIO implementation to interact with the object store |

poetry.lock

Lines changed: 832 additions & 631 deletions
Some generated files are not rendered by default.

pyiceberg/catalog/hive.py

Lines changed: 1 addition & 1 deletion
@@ -809,7 +809,7 @@ def update_namespace_properties(
         if removals:
             for key in removals:
                 if key in parameters:
-                    parameters[key] = None
+                    parameters.pop(key)
                     removed.add(key)
         if updates:
             for key, value in updates.items():
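
The change is subtle but real: assigning `None` leaves the key in the Hive metastore parameters map, whereas `pop` actually deletes it. A minimal illustration of the difference on a plain dict:

```python
# Setting a dict value to None does not remove the key; pop does.
parameters = {"owner": "alice", "comment": "test"}

parameters["comment"] = None      # old behavior: key survives, now mapped to None
assert "comment" in parameters

parameters.pop("comment")         # fixed behavior: key is actually removed
assert "comment" not in parameters
```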

pyiceberg/catalog/rest/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -505,7 +505,7 @@ def _create_table(
         try:
             response.raise_for_status()
         except HTTPError as exc:
-            _handle_non_200_response(exc, {409: TableAlreadyExistsError})
+            _handle_non_200_response(exc, {409: TableAlreadyExistsError, 404: NoSuchNamespaceError})
         return TableResponse.model_validate_json(response.text)

     @retry(**_RETRY_ARGS)
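
With the added 404 mapping, creating a table in a namespace that does not exist should now raise `NoSuchNamespaceError` instead of surfacing as a generic HTTP error. A hedged sketch of the caller-side effect (the catalog URI and identifiers are made up):

```python
# Hypothetical usage; the catalog URI, namespace, and table name are illustrative only.
from pyiceberg.catalog import load_catalog
from pyiceberg.exceptions import NoSuchNamespaceError
from pyiceberg.schema import Schema

catalog = load_catalog("rest", uri="http://localhost:8181")

try:
    catalog.create_table("nonexistent_ns.my_table", schema=Schema())
except NoSuchNamespaceError:
    # Before this change, the 404 was not translated into a typed error.
    print("namespace does not exist")
```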

pyiceberg/io/pyarrow.py

Lines changed: 40 additions & 6 deletions
@@ -2410,9 +2410,12 @@ def data_file_statistics_from_parquet_metadata(
             continue

         if field_id not in col_aggs:
-            col_aggs[field_id] = StatsAggregator(
-                stats_col.iceberg_type, statistics.physical_type, stats_col.mode.length
-            )
+            try:
+                col_aggs[field_id] = StatsAggregator(
+                    stats_col.iceberg_type, statistics.physical_type, stats_col.mode.length
+                )
+            except ValueError as e:
+                raise ValueError(f"{e} for column '{stats_col.column_name}'") from e

         if isinstance(stats_col.iceberg_type, DecimalType) and statistics.physical_type != "FIXED_LEN_BYTE_ARRAY":
             scale = stats_col.iceberg_type.scale

@@ -2728,9 +2731,11 @@ def _determine_partitions(spec: PartitionSpec, schema: Schema, arrow_table: pa.Table

     for partition, name in zip(spec.fields, partition_fields):
         source_field = schema.find_field(partition.source_id)
-        arrow_table = arrow_table.append_column(
-            name, partition.transform.pyarrow_transform(source_field.field_type)(arrow_table[source_field.name])
-        )
+        full_field_name = schema.find_column_name(partition.source_id)
+        if full_field_name is None:
+            raise ValueError(f"Could not find column name for field ID: {partition.source_id}")
+        field_array = _get_field_from_arrow_table(arrow_table, full_field_name)
+        arrow_table = arrow_table.append_column(name, partition.transform.pyarrow_transform(source_field.field_type)(field_array))

     unique_partition_fields = arrow_table.select(partition_fields).group_by(partition_fields).aggregate([])

@@ -2765,3 +2770,32 @@ def _determine_partitions(spec: PartitionSpec, schema: Schema, arrow_table: pa.Table
     )

     return table_partitions
+
+
+def _get_field_from_arrow_table(arrow_table: pa.Table, field_path: str) -> pa.Array:
+    """Get a field from an Arrow table, supporting both literal field names and nested field paths.
+
+    This function handles two cases:
+    1. Literal field names that may contain dots (e.g., "some.id")
+    2. Nested field paths using dot notation (e.g., "bar.baz" for nested access)
+
+    Args:
+        arrow_table: The Arrow table containing the field
+        field_path: Field name or dot-separated path
+
+    Returns:
+        The field as a PyArrow Array
+
+    Raises:
+        KeyError: If the field path cannot be resolved
+    """
+    # Try exact column name match first (handles field names containing literal dots)
+    if field_path in arrow_table.column_names:
+        return arrow_table[field_path]
+
+    # If not found as an exact name, treat it as a nested field path
+    path_parts = field_path.split(".")
+    # Get the struct column from the table (e.g., "bar" from "bar.baz")
+    field_array = arrow_table[path_parts[0]]
+    # Navigate into the struct using the remaining path parts
+    return pc.struct_field(field_array, path_parts[1:])
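
The helper leans on PyArrow's `struct_field` compute function for the nested case. A self-contained sketch (made-up data) of the two lookups it distinguishes:

```python
# Illustration of the two cases: a literal dotted column name vs. a nested path.
import pyarrow as pa
import pyarrow.compute as pc

table = pa.table({
    "some.id": [1, 2],                    # literal column name containing a dot
    "bar": [{"baz": 10}, {"baz": 20}],    # struct column with a nested field
})

# Case 1: the exact column name wins, so "some.id" resolves to a plain column.
assert "some.id" in table.column_names

# Case 2: "bar.baz" is not a column name, so it is resolved as struct navigation.
nested = pc.struct_field(table["bar"], ["baz"])
print(nested)  # -> [10, 20]
```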

pyiceberg/table/__init__.py

Lines changed: 19 additions & 1 deletion
@@ -137,6 +137,7 @@
 from pyiceberg.utils.properties import property_as_bool

 if TYPE_CHECKING:
+    import bodo.pandas as bd
     import daft
     import pandas as pd
     import polars as pl

@@ -1485,6 +1486,16 @@ def to_daft(self) -> daft.DataFrame:

         return daft.read_iceberg(self)

+    def to_bodo(self) -> bd.DataFrame:
+        """Read a Bodo DataFrame lazily from this Iceberg table.
+
+        Returns:
+            bd.DataFrame: Unmaterialized Bodo Dataframe created from the Iceberg table
+        """
+        import bodo.pandas as bd
+
+        return bd.read_iceberg_table(self)
+
     def to_polars(self) -> pl.LazyFrame:
         """Lazily read from this Apache Iceberg table.

@@ -1691,7 +1702,14 @@ def to_polars(self) -> pl.DataFrame: ...

     def update(self: S, **overrides: Any) -> S:
         """Create a copy of this table scan with updated fields."""
-        return type(self)(**{**self.__dict__, **overrides})
+        from inspect import signature
+
+        # Extract only the attributes that are constructor parameters. We don't use self.__dict__
+        # as the kwargs to the constructor because it may contain additional attributes that are
+        # not part of the constructor signature.
+        params = signature(type(self).__init__).parameters.keys() - {"self"}  # Skip the "self" parameter
+        kwargs = {param: getattr(self, param) for param in params}  # Assume parameters are attributes
+
+        return type(self)(**{**kwargs, **overrides})

     def use_ref(self: S, name: str) -> S:
         if self.snapshot_id:
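
The rewritten `update` introspects the constructor so that incidental instance state never leaks into the copy. A standalone sketch of the same pattern, using a hypothetical class:

```python
# Standalone sketch of the copy-with-overrides pattern; `Scan` is hypothetical.
from inspect import signature
from typing import Any


class Scan:
    def __init__(self, snapshot_id: int, limit: int) -> None:
        self.snapshot_id = snapshot_id
        self.limit = limit
        self._cache: dict = {}  # extra state that is NOT a constructor parameter

    def update(self, **overrides: Any) -> "Scan":
        # Pass along only attributes the constructor accepts, so state like
        # `_cache` never ends up in **kwargs.
        params = signature(type(self).__init__).parameters.keys() - {"self"}
        kwargs = {param: getattr(self, param) for param in params}
        return type(self)(**{**kwargs, **overrides})


scan = Scan(snapshot_id=1, limit=10).update(limit=100)
assert (scan.snapshot_id, scan.limit) == (1, 100)
```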

pyiceberg/table/snapshots.py

Lines changed: 3 additions & 0 deletions
@@ -58,6 +58,7 @@
 TOTAL_FILE_SIZE = "total-files-size"
 CHANGED_PARTITION_COUNT_PROP = "changed-partition-count"
 CHANGED_PARTITION_PREFIX = "partitions."
+PARTITION_SUMMARY_PROP = "partition-summaries-included"
 OPERATION = "operation"

 INITIAL_SEQUENCE_NUMBER = 0

@@ -306,6 +307,8 @@ def build(self) -> Dict[str, str]:
         changed_partitions_size = len(self.partition_metrics)
         set_when_positive(properties, changed_partitions_size, CHANGED_PARTITION_COUNT_PROP)
         if changed_partitions_size <= self.max_changed_partitions_for_summaries:
+            if changed_partitions_size > 0:
+                properties[PARTITION_SUMMARY_PROP] = "true"
             for partition_path, update_metrics_partition in self.partition_metrics.items():
                 if (summary := self._partition_summary(update_metrics_partition)) and len(summary) != 0:
                     properties[CHANGED_PARTITION_PREFIX + partition_path] = summary
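
Net effect: whenever per-partition summaries are emitted, the snapshot summary also advertises that fact. A hedged sketch of what the resulting properties might look like (partition paths and metric values are made up):

```python
# Hypothetical snapshot summary properties after this change (values invented):
properties = {
    "changed-partition-count": "2",
    "partition-summaries-included": "true",  # new flag, set only when summaries are written
    "partitions.ts_day=2023-01-01": "added-data-files=3,added-records=100",
    "partitions.ts_day=2023-01-02": "added-data-files=1,added-records=50",
}
```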

pyiceberg/table/statistics.py

Lines changed: 3 additions & 3 deletions
@@ -14,7 +14,7 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-from typing import Dict, List, Literal, Optional
+from typing import Dict, List, Literal, Optional, Union

 from pydantic import Field

@@ -48,7 +48,7 @@ class PartitionStatisticsFile(StatisticsCommonFields):

 def filter_statistics_by_snapshot_id(
-    statistics: List[StatisticsFile],
+    statistics: List[Union[StatisticsFile, PartitionStatisticsFile]],
     reject_snapshot_id: int,
-) -> List[StatisticsFile]:
+) -> List[Union[StatisticsFile, PartitionStatisticsFile]]:
     return [stat for stat in statistics if stat.snapshot_id != reject_snapshot_id]
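
The widened signature lets the same helper prune both table-level and partition-level statistics. A hedged usage sketch (the metadata attribute names below are assumptions, not confirmed by this diff):

```python
# Hypothetical: drop statistics entries tied to a snapshot being expired.
# `table` is an already-loaded pyiceberg Table; `partition_statistics` as a
# metadata attribute is an assumption here.
from pyiceberg.table.statistics import filter_statistics_by_snapshot_id

expired_snapshot_id = 123  # made-up ID
kept_stats = filter_statistics_by_snapshot_id(table.metadata.statistics, expired_snapshot_id)
kept_partition_stats = filter_statistics_by_snapshot_id(
    table.metadata.partition_statistics, expired_snapshot_id
)
```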
