31 | 31 | import logging
32 | 32 | import os
33 | 33 | import re
| 34 | +import uuid
34 | 35 | from abc import ABC, abstractmethod
35 | 36 | from concurrent.futures import Future
36 | 37 | from copy import copy

126 | 127 |     visit,
127 | 128 |     visit_with_partner,
128 | 129 | )
129 | | -from pyiceberg.table import PropertyUtil, TableProperties, WriteTask
130 | 130 | from pyiceberg.table.metadata import TableMetadata
131 | 131 | from pyiceberg.table.name_mapping import NameMapping
132 | 132 | from pyiceberg.transforms import TruncateTransform

159 | 159 | from pyiceberg.utils.truncate import truncate_upper_bound_binary_string, truncate_upper_bound_text_string
160 | 160 |
161 | 161 | if TYPE_CHECKING:
162 | | -    from pyiceberg.table import FileScanTask
| 162 | +    from pyiceberg.table import FileScanTask, WriteTask
163 | 163 |
164 | 164 | logger = logging.getLogger(__name__)
165 | 165 |

@@ -1563,6 +1563,8 @@ class PyArrowStatisticsCollector(PreOrderSchemaVisitor[List[StatisticsCollector]
1563 | 1563 |     _default_mode: str
1564 | 1564 |
1565 | 1565 |     def __init__(self, schema: Schema, properties: Dict[str, str]):
| 1566 | +        from pyiceberg.table import TableProperties
| 1567 | +
1566 | 1568 |         self._schema = schema
1567 | 1569 |         self._properties = properties
1568 | 1570 |         self._default_mode = self._properties.get(

@@ -1598,6 +1600,8 @@ def map(
1598 | 1600 |         return k + v
1599 | 1601 |
1600 | 1602 |     def primitive(self, primitive: PrimitiveType) -> List[StatisticsCollector]:
| 1603 | +        from pyiceberg.table import TableProperties
| 1604 | +
1601 | 1605 |         column_name = self._schema.find_column_name(self._field_id)
1602 | 1606 |         if column_name is None:
1603 | 1607 |             return []

@@ -1895,6 +1899,8 @@ def data_file_statistics_from_parquet_metadata(
1895 | 1899 |
1896 | 1900 |
1897 | 1901 | def write_file(io: FileIO, table_metadata: TableMetadata, tasks: Iterator[WriteTask]) -> Iterator[DataFile]:
| 1902 | +    from pyiceberg.table import PropertyUtil, TableProperties
| 1903 | +
1898 | 1904 |     parquet_writer_kwargs = _get_parquet_writer_kwargs(table_metadata.properties)
1899 | 1905 |     row_group_size = PropertyUtil.property_as_int(
1900 | 1906 |         properties=table_metadata.properties,

@@ -2005,6 +2011,8 @@ def parquet_files_to_data_files(io: FileIO, table_metadata: TableMetadata, file_
2005 | 2011 |
2006 | 2012 |
2007 | 2013 | def _get_parquet_writer_kwargs(table_properties: Properties) -> Dict[str, Any]:
| 2014 | +    from pyiceberg.table import PropertyUtil, TableProperties
| 2015 | +
2008 | 2016 |     for key_pattern in [
2009 | 2017 |         TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES,
2010 | 2018 |         TableProperties.PARQUET_PAGE_ROW_LIMIT,

@@ -2042,3 +2050,55 @@ def _get_parquet_writer_kwargs(table_properties: Properties) -> Dict[str, Any]:
2042 | 2050 |             default=TableProperties.PARQUET_PAGE_ROW_LIMIT_DEFAULT,
2043 | 2051 |         ),
2044 | 2052 |     }
| 2053 | +
| 2054 | +
| 2055 | +def _dataframe_to_data_files(
| 2056 | +    table_metadata: TableMetadata,
| 2057 | +    df: pa.Table,
| 2058 | +    io: FileIO,
| 2059 | +    write_uuid: Optional[uuid.UUID] = None,
| 2060 | +    counter: Optional[itertools.count[int]] = None,
| 2061 | +) -> Iterable[DataFile]:
| 2062 | +    """Convert a PyArrow table into a DataFile.
| 2063 | +
| 2064 | +    Returns:
| 2065 | +        An iterable that supplies datafiles that represent the table.
| 2066 | +    """
| 2067 | +    from pyiceberg.table import PropertyUtil, TableProperties, WriteTask
| 2068 | +
| 2069 | +    counter = counter or itertools.count(0)
| 2070 | +    write_uuid = write_uuid or uuid.uuid4()
| 2071 | +    target_file_size: int = PropertyUtil.property_as_int(  # type: ignore  # The property is set with non-None value.
| 2072 | +        properties=table_metadata.properties,
| 2073 | +        property_name=TableProperties.WRITE_TARGET_FILE_SIZE_BYTES,
| 2074 | +        default=TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT,
| 2075 | +    )
| 2076 | +
| 2077 | +    if table_metadata.spec().is_unpartitioned():
| 2078 | +        yield from write_file(
| 2079 | +            io=io,
| 2080 | +            table_metadata=table_metadata,
| 2081 | +            tasks=iter([
| 2082 | +                WriteTask(write_uuid=write_uuid, task_id=next(counter), record_batches=batches, schema=table_metadata.schema())
| 2083 | +                for batches in bin_pack_arrow_table(df, target_file_size)
| 2084 | +            ]),
| 2085 | +        )
| 2086 | +    else:
| 2087 | +        from pyiceberg.table import _determine_partitions
| 2088 | +
| 2089 | +        partitions = _determine_partitions(spec=table_metadata.spec(), schema=table_metadata.schema(), arrow_table=df)
| 2090 | +        yield from write_file(
| 2091 | +            io=io,
| 2092 | +            table_metadata=table_metadata,
| 2093 | +            tasks=iter([
| 2094 | +                WriteTask(
| 2095 | +                    write_uuid=write_uuid,
| 2096 | +                    task_id=next(counter),
| 2097 | +                    record_batches=batches,
| 2098 | +                    partition_key=partition.partition_key,
| 2099 | +                    schema=table_metadata.schema(),
| 2100 | +                )
| 2101 | +                for partition in partitions
| 2102 | +                for batches in bin_pack_arrow_table(partition.arrow_table_partition, target_file_size)
| 2103 | +            ]),
| 2104 | +        )
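
A minimal usage sketch (not part of this diff): with _dataframe_to_data_files now living in pyiceberg.io.pyarrow, an in-memory Arrow table can be turned into Iceberg data files given a table's metadata and FileIO. Here `tbl` is an assumed, already-loaded pyiceberg Table whose schema matches the Arrow data; the helper bin-packs the rows into write tasks and yields one DataFile per written Parquet file.

    import pyarrow as pa
    from pyiceberg.io.pyarrow import _dataframe_to_data_files

    # `tbl` is a hypothetical pyiceberg Table loaded elsewhere (e.g. from a catalog);
    # its schema is assumed to match the Arrow table below.
    df = pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})

    # Yields DataFile objects describing the Parquet files written through tbl.io.
    data_files = list(_dataframe_to_data_files(table_metadata=tbl.metadata, df=df, io=tbl.io))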