Commit 1723819

WIP
1 parent a5e988a commit 1723819

8 files changed: +195 -112 lines changed


pyiceberg/io/pyarrow.py

Lines changed: 63 additions & 3 deletions
@@ -31,6 +31,7 @@
 import logging
 import os
 import re
+import uuid
 from abc import ABC, abstractmethod
 from concurrent.futures import Future
 from copy import copy
@@ -126,7 +127,6 @@
     visit,
     visit_with_partner,
 )
-from pyiceberg.table import PropertyUtil, TableProperties, WriteTask
 from pyiceberg.table.metadata import TableMetadata
 from pyiceberg.table.name_mapping import NameMapping
 from pyiceberg.transforms import TruncateTransform
@@ -159,7 +159,7 @@
 from pyiceberg.utils.truncate import truncate_upper_bound_binary_string, truncate_upper_bound_text_string
 
 if TYPE_CHECKING:
-    from pyiceberg.table import FileScanTask
+    from pyiceberg.table import FileScanTask, WriteTask
 
 logger = logging.getLogger(__name__)
 
@@ -1443,6 +1443,8 @@ class PyArrowStatisticsCollector(PreOrderSchemaVisitor[List[StatisticsCollector]
     _default_mode: str
 
     def __init__(self, schema: Schema, properties: Dict[str, str]):
+        from pyiceberg.table import TableProperties
+
         self._schema = schema
         self._properties = properties
         self._default_mode = self._properties.get(
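Taken together, the import changes above follow one pattern: names from pyiceberg.table that are needed only for annotations (FileScanTask, WriteTask) move under TYPE_CHECKING, while names needed at runtime (TableProperties, PropertyUtil) are imported inside the functions that use them. The commit message is only "WIP", but this is the standard way to break a circular import once pyiceberg.table itself starts depending on this module, so that motivation is an inference. A minimal sketch of the pattern (assuming pyiceberg is importable; the describe function is illustrative, not part of the commit):

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Evaluated by type checkers only, never at runtime, so it cannot
        # create an import cycle; the annotation below must then be a string
        # (or the module must use `from __future__ import annotations`).
        from pyiceberg.table import WriteTask


    def describe(task: "WriteTask") -> str:
        # Deferred to call time, after both modules have finished loading.
        from pyiceberg.table import TableProperties

        return f"target file size property: {TableProperties.WRITE_TARGET_FILE_SIZE_BYTES}"
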
@@ -1478,6 +1480,8 @@ def map(
         return k + v
 
     def primitive(self, primitive: PrimitiveType) -> List[StatisticsCollector]:
+        from pyiceberg.table import TableProperties
+
         column_name = self._schema.find_column_name(self._field_id)
         if column_name is None:
             return []
@@ -1774,7 +1778,9 @@ def data_file_statistics_from_parquet_metadata(
     )
 
 
-def write_file(io: FileIO, table_metadata: TableMetadata, tasks: Iterator[WriteTask]) -> Iterator[DataFile]:
+def write_file(io: FileIO, table_metadata: TableMetadata, tasks: Iterable["WriteTask"]) -> Iterator[DataFile]:
+    from pyiceberg.table import PropertyUtil, TableProperties
+
     schema = table_metadata.schema()
     arrow_file_schema = schema.as_arrow()
     parquet_writer_kwargs = _get_parquet_writer_kwargs(table_metadata.properties)
@@ -1875,6 +1881,8 @@ def parquet_files_to_data_files(io: FileIO, table_metadata: TableMetadata, file_
 
 
 def _get_parquet_writer_kwargs(table_properties: Properties) -> Dict[str, Any]:
+    from pyiceberg.table import PropertyUtil, TableProperties
+
     for key_pattern in [
         TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES,
         TableProperties.PARQUET_PAGE_ROW_LIMIT,
@@ -1912,3 +1920,55 @@ def _get_parquet_writer_kwargs(table_properties: Properties) -> Dict[str, Any]:
             default=TableProperties.PARQUET_PAGE_ROW_LIMIT_DEFAULT,
         ),
     }
+
+
+def _dataframe_to_data_files(
+    table_metadata: TableMetadata,
+    df: pa.Table,
+    io: FileIO,
+    write_uuid: Optional[uuid.UUID] = None,
+    counter: Optional[itertools.count[int]] = None,
+) -> Iterable[DataFile]:
+    """Convert a PyArrow table into a DataFile.
+
+    Returns:
+        An iterable that supplies datafiles that represent the table.
+    """
+    from pyiceberg.table import PropertyUtil, TableProperties, WriteTask
+
+    counter = counter or itertools.count(0)
+    write_uuid = write_uuid or uuid.uuid4()
+    target_file_size: int = PropertyUtil.property_as_int(  # type: ignore  # The property is set with non-None value.
+        properties=table_metadata.properties,
+        property_name=TableProperties.WRITE_TARGET_FILE_SIZE_BYTES,
+        default=TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT,
+    )
+
+    if table_metadata.spec().is_unpartitioned():
+        yield from write_file(
+            io=io,
+            table_metadata=table_metadata,
+            tasks=iter([
+                WriteTask(write_uuid=write_uuid, task_id=next(counter), record_batches=batches, schema=table_metadata.schema())
+                for batches in bin_pack_arrow_table(df, target_file_size)
+            ]),
+        )
+    else:
+        from pyiceberg.table import determine_partitions
+
+        partitions = determine_partitions(spec=table_metadata.spec(), schema=table_metadata.schema(), arrow_table=df)
+        yield from write_file(
+            io=io,
+            table_metadata=table_metadata,
+            tasks=iter([
+                WriteTask(
+                    write_uuid=write_uuid,
+                    task_id=next(counter),
+                    record_batches=batches,
+                    partition_key=partition.partition_key,
+                    schema=table_metadata.schema(),
+                )
+                for partition in partitions
+                for batches in bin_pack_arrow_table(partition.arrow_table_partition, target_file_size)
+            ]),
+        )
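Both branches of the new helper delegate file splitting to bin_pack_arrow_table, which groups the incoming record batches so that each WriteTask, and therefore each output Parquet file, lands near the TableProperties.WRITE_TARGET_FILE_SIZE_BYTES target resolved above. The sketch below is only an illustration of that greedy grouping idea under an assumed size heuristic (batch.nbytes); it is not the helper's actual implementation:

    from typing import Iterator, List

    import pyarrow as pa


    def greedy_bin_pack(tbl: pa.Table, target_size: int) -> Iterator[List[pa.RecordBatch]]:
        """Illustrative only: group record batches so each group stays near target_size bytes."""
        group: List[pa.RecordBatch] = []
        group_bytes = 0
        for batch in tbl.to_batches():
            # Start a new group once adding this batch would overshoot the target.
            if group and group_bytes + batch.nbytes > target_size:
                yield group
                group, group_bytes = [], 0
            group.append(batch)
            group_bytes += batch.nbytes
        if group:
            yield group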

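Because _dataframe_to_data_files is a generator, no Parquet files are written until it is consumed; a caller (presumably the append/overwrite path this WIP commit is building toward) would drain it and then commit the resulting DataFile entries to a snapshot. A hypothetical invocation, assuming a catalog named "default", a table "db.events" whose schema matches the Arrow table, and a Table object exposing .metadata and .io:

    import pyarrow as pa
    from pyiceberg.catalog import load_catalog
    from pyiceberg.io.pyarrow import _dataframe_to_data_files

    # Hypothetical catalog and table names; the data below is a placeholder
    # and is assumed to match the table's schema.
    catalog = load_catalog("default")
    tbl = catalog.load_table("db.events")

    df = pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})

    # Consuming the generator triggers the actual Parquet writes.
    data_files = list(_dataframe_to_data_files(table_metadata=tbl.metadata, df=df, io=tbl.io))
    for data_file in data_files:
        print(data_file.file_path, data_file.record_count)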