47
47
from typing_extensions import Annotated
48
48
49
49
import pyiceberg .expressions .parser as parser
50
- import pyiceberg .expressions .visitors as visitors
51
50
from pyiceberg .exceptions import CommitFailedException , ResolveError , ValidationError
52
51
from pyiceberg .expressions import (
53
52
AlwaysFalse ,
58
57
Or ,
59
58
Reference ,
60
59
)
60
+ from pyiceberg .expressions .visitors import (
61
+ _InclusiveMetricsEvaluator ,
62
+ expression_evaluator ,
63
+ inclusive_projection ,
64
+ manifest_evaluator ,
65
+ )
61
66
from pyiceberg .io import FileIO , load_file_io
62
67
from pyiceberg .manifest import (
63
68
POSITIONAL_DELETE_SCHEMA ,
@@ -217,6 +222,9 @@ class TableProperties:
217
222
218
223
PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX = "write.parquet.bloom-filter-enabled.column"
219
224
225
+ WRITE_TARGET_FILE_SIZE_BYTES = "write.target-file-size-bytes"
226
+ WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT = 512 * 1024 * 1024 # 512 MB
227
+
220
228
DEFAULT_WRITE_METRICS_MODE = "write.metadata.metrics.default"
221
229
DEFAULT_WRITE_METRICS_MODE_DEFAULT = "truncate(16)"
222
230
@@ -1130,8 +1138,9 @@ def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT)
1130
1138
1131
1139
_check_schema_compatible (self .schema (), other_schema = df .schema )
1132
1140
# cast if the two schemas are compatible but not equal
1133
- if self .schema ().as_arrow () != df .schema :
1134
- df = df .cast (self .schema ().as_arrow ())
1141
+ table_arrow_schema = self .schema ().as_arrow ()
1142
+ if table_arrow_schema != df .schema :
1143
+ df = df .cast (table_arrow_schema )
1135
1144
1136
1145
with self .transaction () as txn :
1137
1146
with txn .update_snapshot (snapshot_properties = snapshot_properties ).fast_append () as update_snapshot :
@@ -1171,8 +1180,9 @@ def overwrite(
1171
1180
1172
1181
_check_schema_compatible (self .schema (), other_schema = df .schema )
1173
1182
# cast if the two schemas are compatible but not equal
1174
- if self .schema ().as_arrow () != df .schema :
1175
- df = df .cast (self .schema ().as_arrow ())
1183
+ table_arrow_schema = self .schema ().as_arrow ()
1184
+ if table_arrow_schema != df .schema :
1185
+ df = df .cast (table_arrow_schema )
1176
1186
1177
1187
with self .transaction () as txn :
1178
1188
with txn .update_snapshot (snapshot_properties = snapshot_properties ).overwrite () as update_snapshot :
@@ -1442,9 +1452,7 @@ def _match_deletes_to_data_file(data_entry: ManifestEntry, positional_delete_ent
1442
1452
relevant_entries = positional_delete_entries [positional_delete_entries .bisect_right (data_entry ) :]
1443
1453
1444
1454
if len (relevant_entries ) > 0 :
1445
- evaluator = visitors ._InclusiveMetricsEvaluator (
1446
- POSITIONAL_DELETE_SCHEMA , EqualTo ("file_path" , data_entry .data_file .file_path )
1447
- )
1455
+ evaluator = _InclusiveMetricsEvaluator (POSITIONAL_DELETE_SCHEMA , EqualTo ("file_path" , data_entry .data_file .file_path ))
1448
1456
return {
1449
1457
positional_delete_entry .data_file
1450
1458
for positional_delete_entry in relevant_entries
@@ -1468,7 +1476,7 @@ def __init__(
1468
1476
super ().__init__ (table , row_filter , selected_fields , case_sensitive , snapshot_id , options , limit )
1469
1477
1470
1478
def _build_partition_projection (self , spec_id : int ) -> BooleanExpression :
1471
- project = visitors . inclusive_projection (self .table .schema (), self .table .specs ()[spec_id ])
1479
+ project = inclusive_projection (self .table .schema (), self .table .specs ()[spec_id ])
1472
1480
return project (self .row_filter )
1473
1481
1474
1482
@cached_property
@@ -1477,7 +1485,7 @@ def partition_filters(self) -> KeyDefaultDict[int, BooleanExpression]:
1477
1485
1478
1486
def _build_manifest_evaluator (self , spec_id : int ) -> Callable [[ManifestFile ], bool ]:
1479
1487
spec = self .table .specs ()[spec_id ]
1480
- return visitors . manifest_evaluator (spec , self .table .schema (), self .partition_filters [spec_id ], self .case_sensitive )
1488
+ return manifest_evaluator (spec , self .table .schema (), self .partition_filters [spec_id ], self .case_sensitive )
1481
1489
1482
1490
def _build_partition_evaluator (self , spec_id : int ) -> Callable [[DataFile ], bool ]:
1483
1491
spec = self .table .specs ()[spec_id ]
@@ -1488,9 +1496,7 @@ def _build_partition_evaluator(self, spec_id: int) -> Callable[[DataFile], bool]
1488
1496
# The lambda created here is run in multiple threads.
1489
1497
# So we avoid creating _EvaluatorExpression methods bound to a single
1490
1498
# shared instance across multiple threads.
1491
- return lambda data_file : visitors .expression_evaluator (partition_schema , partition_expr , self .case_sensitive )(
1492
- data_file .partition
1493
- )
1499
+ return lambda data_file : expression_evaluator (partition_schema , partition_expr , self .case_sensitive )(data_file .partition )
1494
1500
1495
1501
def _check_sequence_number (self , min_data_sequence_number : int , manifest : ManifestFile ) -> bool :
1496
1502
"""Ensure that no manifests are loaded that contain deletes that are older than the data.
@@ -1535,7 +1541,7 @@ def plan_files(self) -> Iterable[FileScanTask]:
1535
1541
# this filter depends on the partition spec used to write the manifest file
1536
1542
1537
1543
partition_evaluators : Dict [int , Callable [[DataFile ], bool ]] = KeyDefaultDict (self ._build_partition_evaluator )
1538
- metrics_evaluator = visitors . _InclusiveMetricsEvaluator (
1544
+ metrics_evaluator = _InclusiveMetricsEvaluator (
1539
1545
self .table .schema (), self .row_filter , self .case_sensitive , self .options .get ("include_empty_files" ) == "true"
1540
1546
).eval
1541
1547
@@ -2488,7 +2494,7 @@ def _add_and_move_fields(
2488
2494
class WriteTask :
2489
2495
write_uuid : uuid .UUID
2490
2496
task_id : int
2491
- df : pa .Table
2497
+ record_batches : List [ pa .RecordBatch ]
2492
2498
sort_order_id : Optional [int ] = None
2493
2499
2494
2500
# Later to be extended with partition information
@@ -2523,17 +2529,27 @@ def _dataframe_to_data_files(
2523
2529
Returns:
2524
2530
An iterable that supplies datafiles that represent the table.
2525
2531
"""
2526
- from pyiceberg .io .pyarrow import write_file
2532
+ from pyiceberg .io .pyarrow import bin_pack_arrow_table , write_file
2527
2533
2528
2534
if len ([spec for spec in table_metadata .partition_specs if spec .spec_id != 0 ]) > 0 :
2529
2535
raise ValueError ("Cannot write to partitioned tables" )
2530
2536
2531
2537
counter = itertools .count (0 )
2532
2538
write_uuid = write_uuid or uuid .uuid4 ()
2533
2539
2540
+ target_file_size = PropertyUtil .property_as_int (
2541
+ properties = table_metadata .properties ,
2542
+ property_name = TableProperties .WRITE_TARGET_FILE_SIZE_BYTES ,
2543
+ default = TableProperties .WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT ,
2544
+ )
2545
+
2534
2546
# This is an iter, so we don't have to materialize everything every time
2535
2547
# This will be more relevant when we start doing partitioned writes
2536
- yield from write_file (io = io , table_metadata = table_metadata , tasks = iter ([WriteTask (write_uuid , next (counter ), df )]))
2548
+ yield from write_file (
2549
+ io = io ,
2550
+ table_metadata = table_metadata ,
2551
+ tasks = iter ([WriteTask (write_uuid , next (counter ), batches ) for batches in bin_pack_arrow_table (df , target_file_size )]), # type: ignore
2552
+ )
2537
2553
2538
2554
2539
2555
def _parquet_files_to_data_files (table_metadata : TableMetadata , file_paths : List [str ], io : FileIO ) -> Iterable [DataFile ]:
0 commit comments