@@ -31,6 +31,7 @@
 import logging
 import os
 import re
+import uuid
 from abc import ABC, abstractmethod
 from concurrent.futures import Future
 from copy import copy
@@ -126,7 +127,6 @@
     visit,
     visit_with_partner,
 )
-from pyiceberg.table import PropertyUtil, TableProperties, WriteTask
 from pyiceberg.table.metadata import TableMetadata
 from pyiceberg.table.name_mapping import NameMapping
 from pyiceberg.transforms import TruncateTransform
@@ -159,7 +159,7 @@
 from pyiceberg.utils.truncate import truncate_upper_bound_binary_string, truncate_upper_bound_text_string
 
 if TYPE_CHECKING:
-    from pyiceberg.table import FileScanTask
+    from pyiceberg.table import FileScanTask, WriteTask
 
 logger = logging.getLogger(__name__)
 
@@ -1443,6 +1443,8 @@ class PyArrowStatisticsCollector(PreOrderSchemaVisitor[List[StatisticsCollector]
     _default_mode: str
 
     def __init__(self, schema: Schema, properties: Dict[str, str]):
+        from pyiceberg.table import TableProperties
+
         self._schema = schema
         self._properties = properties
         self._default_mode = self._properties.get(
@@ -1478,6 +1480,8 @@ def map(
         return k + v
 
     def primitive(self, primitive: PrimitiveType) -> List[StatisticsCollector]:
+        from pyiceberg.table import TableProperties
+
         column_name = self._schema.find_column_name(self._field_id)
         if column_name is None:
             return []
@@ -1774,7 +1778,9 @@ def data_file_statistics_from_parquet_metadata(
     )
 
 
-def write_file(io: FileIO, table_metadata: TableMetadata, tasks: Iterator[WriteTask]) -> Iterator[DataFile]:
+def write_file(io: FileIO, table_metadata: TableMetadata, tasks: Iterable["WriteTask"]) -> Iterator[DataFile]:
+    from pyiceberg.table import PropertyUtil, TableProperties
+
     schema = table_metadata.schema()
     arrow_file_schema = schema.as_arrow()
     parquet_writer_kwargs = _get_parquet_writer_kwargs(table_metadata.properties)
@@ -1875,6 +1881,8 @@ def parquet_files_to_data_files(io: FileIO, table_metadata: TableMetadata, file_
 
 
 def _get_parquet_writer_kwargs(table_properties: Properties) -> Dict[str, Any]:
+    from pyiceberg.table import PropertyUtil, TableProperties
+
     for key_pattern in [
         TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES,
         TableProperties.PARQUET_PAGE_ROW_LIMIT,
@@ -1912,3 +1920,55 @@ def _get_parquet_writer_kwargs(table_properties: Properties) -> Dict[str, Any]:
             default=TableProperties.PARQUET_PAGE_ROW_LIMIT_DEFAULT,
         ),
     }
+
+
+def _dataframe_to_data_files(
+    table_metadata: TableMetadata,
+    df: pa.Table,
+    io: FileIO,
+    write_uuid: Optional[uuid.UUID] = None,
+    counter: Optional[itertools.count[int]] = None,
+) -> Iterable[DataFile]:
+    """Convert a PyArrow table into a DataFile.
+
+    Returns:
+        An iterable that supplies datafiles that represent the table.
+    """
+    from pyiceberg.table import PropertyUtil, TableProperties, WriteTask
+
+    counter = counter or itertools.count(0)
+    write_uuid = write_uuid or uuid.uuid4()
+    target_file_size: int = PropertyUtil.property_as_int(  # type: ignore  # The property is set with non-None value.
+        properties=table_metadata.properties,
+        property_name=TableProperties.WRITE_TARGET_FILE_SIZE_BYTES,
+        default=TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT,
+    )
+
+    if table_metadata.spec().is_unpartitioned():
+        yield from write_file(
+            io=io,
+            table_metadata=table_metadata,
+            tasks=iter([
+                WriteTask(write_uuid=write_uuid, task_id=next(counter), record_batches=batches, schema=table_metadata.schema())
+                for batches in bin_pack_arrow_table(df, target_file_size)
+            ]),
+        )
+    else:
+        from pyiceberg.table import determine_partitions
+
+        partitions = determine_partitions(spec=table_metadata.spec(), schema=table_metadata.schema(), arrow_table=df)
+        yield from write_file(
+            io=io,
+            table_metadata=table_metadata,
+            tasks=iter([
+                WriteTask(
+                    write_uuid=write_uuid,
+                    task_id=next(counter),
+                    record_batches=batches,
+                    partition_key=partition.partition_key,
+                    schema=table_metadata.schema(),
+                )
+                for partition in partitions
+                for batches in bin_pack_arrow_table(partition.arrow_table_partition, target_file_size)
+            ]),
+        )
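
For context, a minimal usage sketch of the relocated _dataframe_to_data_files helper (not part of this diff). The catalog name and table identifier are placeholders, and the sketch assumes a configured catalog with an existing, unpartitioned table; in normal use the table write path is expected to call this helper internally rather than user code.

import pyarrow as pa

from pyiceberg.catalog import load_catalog
from pyiceberg.io.pyarrow import _dataframe_to_data_files

# Hypothetical catalog/table names; assumes the catalog is configured and "db.events" exists.
catalog = load_catalog("default")
table = catalog.load_table("db.events")

# Build a small PyArrow table that matches the Iceberg table's schema.
df = pa.Table.from_pylist([{"id": 1}, {"id": 2}], schema=table.schema().as_arrow())

# The helper is a generator: consuming it writes Parquet files via write_file and
# yields one DataFile per written file, bin-packed toward the target file size property.
data_files = list(_dataframe_to_data_files(table_metadata=table.metadata, df=df, io=table.io))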