37
37
from cachetools .keys import hashkey
38
38
from pydantic_core import to_json
39
39
40
+ from pyiceberg .avro .codecs import AVRO_CODEC_KEY , AvroCompressionCodec
40
41
from pyiceberg .avro .file import AvroFile , AvroOutputFile
41
42
from pyiceberg .conversions import to_bytes
42
43
from pyiceberg .exceptions import ValidationError
@@ -946,9 +947,16 @@ class ManifestWriter(ABC):
946
947
_deleted_rows : int
947
948
_min_sequence_number : Optional [int ]
948
949
_partitions : List [Record ]
949
- _reused_entry_wrapper : ManifestEntry
950
+ _compression : AvroCompressionCodec
950
951
951
- def __init__ (self , spec : PartitionSpec , schema : Schema , output_file : OutputFile , snapshot_id : int ) -> None :
952
+ def __init__ (
953
+ self ,
954
+ spec : PartitionSpec ,
955
+ schema : Schema ,
956
+ output_file : OutputFile ,
957
+ snapshot_id : int ,
958
+ avro_compression : AvroCompressionCodec ,
959
+ ) -> None :
952
960
self .closed = False
953
961
self ._spec = spec
954
962
self ._schema = schema
@@ -963,6 +971,7 @@ def __init__(self, spec: PartitionSpec, schema: Schema, output_file: OutputFile,
963
971
self ._deleted_rows = 0
964
972
self ._min_sequence_number = None
965
973
self ._partitions = []
974
+ self ._compression = avro_compression
966
975
967
976
def __enter__ (self ) -> ManifestWriter :
968
977
"""Open the writer."""
@@ -998,6 +1007,7 @@ def _meta(self) -> Dict[str, str]:
998
1007
"partition-spec" : to_json (self ._spec .fields ).decode ("utf-8" ),
999
1008
"partition-spec-id" : str (self ._spec .spec_id ),
1000
1009
"format-version" : str (self .version ),
1010
+ AVRO_CODEC_KEY : self ._compression ,
1001
1011
}
1002
1012
1003
1013
def _with_partition (self , format_version : TableVersion ) -> Schema :
@@ -1109,13 +1119,15 @@ def existing(self, entry: ManifestEntry) -> ManifestWriter:
1109
1119
1110
1120
1111
1121
class ManifestWriterV1 (ManifestWriter ):
1112
def __init__(
    self,
    spec: PartitionSpec,
    schema: Schema,
    output_file: OutputFile,
    snapshot_id: int,
    avro_compression: AvroCompressionCodec,
):
    """Initialise a V1 manifest writer.

    This constructor adds no state of its own; every argument is handed
    unchanged to ``ManifestWriter.__init__``.

    Args:
        spec: Partition spec passed through to the base writer.
        schema: Table schema passed through to the base writer.
        output_file: Destination the base writer is constructed with.
        snapshot_id: Snapshot id passed through to the base writer.
        avro_compression: Avro codec passed through to the base writer.
    """
    # Keyword forwarding keeps each argument visibly paired with the base
    # class parameter it feeds.
    super().__init__(
        spec=spec,
        schema=schema,
        output_file=output_file,
        snapshot_id=snapshot_id,
        avro_compression=avro_compression,
    )
1119
1131
1120
1132
def content (self ) -> ManifestContent :
1121
1133
return ManifestContent .DATA
@@ -1129,8 +1141,15 @@ def prepare_entry(self, entry: ManifestEntry) -> ManifestEntry:
1129
1141
1130
1142
1131
1143
class ManifestWriterV2 (ManifestWriter ):
1132
def __init__(
    self,
    spec: PartitionSpec,
    schema: Schema,
    output_file: OutputFile,
    snapshot_id: int,
    avro_compression: AvroCompressionCodec,
):
    """Initialise a V2 manifest writer.

    All construction work is delegated to ``ManifestWriter.__init__``; the
    arguments are forwarded as received.

    Args:
        spec: Partition spec handed to the base writer.
        schema: Table schema handed to the base writer.
        output_file: Destination the base writer is constructed with.
        snapshot_id: Snapshot id handed to the base writer.
        avro_compression: Avro codec handed to the base writer.
    """
    base_kwargs = {
        "spec": spec,
        "schema": schema,
        "output_file": output_file,
        "snapshot_id": snapshot_id,
        "avro_compression": avro_compression,
    }
    super().__init__(**base_kwargs)
1134
1153
1135
1154
def content (self ) -> ManifestContent :
1136
1155
return ManifestContent .DATA
@@ -1156,12 +1175,17 @@ def prepare_entry(self, entry: ManifestEntry) -> ManifestEntry:
1156
1175
1157
1176
1158
1177
def write_manifest(
    format_version: TableVersion,
    spec: PartitionSpec,
    schema: Schema,
    output_file: OutputFile,
    snapshot_id: int,
    avro_compression: AvroCompressionCodec,
) -> ManifestWriter:
    """Build the manifest writer that corresponds to a table format version.

    Args:
        format_version: Table format version; 1 and 2 are supported.
        spec: Partition spec forwarded to the writer.
        schema: Table schema forwarded to the writer.
        output_file: Destination forwarded to the writer.
        snapshot_id: Snapshot id forwarded to the writer.
        avro_compression: Avro codec forwarded to the writer.

    Returns:
        A ``ManifestWriterV1`` for version 1, a ``ManifestWriterV2`` for
        version 2.

    Raises:
        ValueError: If ``format_version`` is neither 1 nor 2.
    """
    # Reject unsupported versions up front so the happy path is a single
    # construction call.
    if format_version != 1 and format_version != 2:
        raise ValueError(f"Cannot write manifest for table version: {format_version}")

    writer_cls = ManifestWriterV1 if format_version == 1 else ManifestWriterV2
    return writer_cls(spec, schema, output_file, snapshot_id, avro_compression)
1167
1191
@@ -1211,14 +1235,21 @@ def add_manifests(self, manifest_files: List[ManifestFile]) -> ManifestListWrite
1211
1235
1212
1236
1213
1237
class ManifestListWriterV1 (ManifestListWriter ):
1214
def __init__(
    self,
    output_file: OutputFile,
    snapshot_id: int,
    parent_snapshot_id: Optional[int],
    compression: AvroCompressionCodec,
):
    """Initialise a V1 manifest-list writer.

    Assembles the metadata mapping for a format-version 1 manifest list and
    delegates to the base ``ManifestListWriter`` constructor.

    Args:
        output_file: Destination passed to the base writer.
        snapshot_id: Snapshot id, stored as a string under ``"snapshot-id"``.
        parent_snapshot_id: Parent snapshot id, stored as a string under
            ``"parent-snapshot-id"``; ``None`` is stored as the text ``"null"``.
        compression: Avro codec stored under ``AVRO_CODEC_KEY``.
    """
    snapshot_id_text = str(snapshot_id)
    # The metadata values are strings, so an absent parent is spelled out
    # as the text "null" instead of being omitted.
    if parent_snapshot_id is None:
        parent_id_text = "null"
    else:
        parent_id_text = str(parent_snapshot_id)

    header_meta = {
        "snapshot-id": snapshot_id_text,
        "parent-snapshot-id": parent_id_text,
        "format-version": "1",
        AVRO_CODEC_KEY: compression,
    }
    super().__init__(format_version=1, output_file=output_file, meta=header_meta)
1224
1255
@@ -1232,7 +1263,14 @@ class ManifestListWriterV2(ManifestListWriter):
1232
1263
_commit_snapshot_id : int
1233
1264
_sequence_number : int
1234
1265
1235
- def __init__ (self , output_file : OutputFile , snapshot_id : int , parent_snapshot_id : Optional [int ], sequence_number : int ):
1266
+ def __init__ (
1267
+ self ,
1268
+ output_file : OutputFile ,
1269
+ snapshot_id : int ,
1270
+ parent_snapshot_id : Optional [int ],
1271
+ sequence_number : int ,
1272
+ compression : AvroCompressionCodec ,
1273
+ ):
1236
1274
super ().__init__ (
1237
1275
format_version = 2 ,
1238
1276
output_file = output_file ,
@@ -1241,6 +1279,7 @@ def __init__(self, output_file: OutputFile, snapshot_id: int, parent_snapshot_id
1241
1279
"parent-snapshot-id" : str (parent_snapshot_id ) if parent_snapshot_id is not None else "null" ,
1242
1280
"sequence-number" : str (sequence_number ),
1243
1281
"format-version" : "2" ,
1282
+ AVRO_CODEC_KEY : compression ,
1244
1283
},
1245
1284
)
1246
1285
self ._commit_snapshot_id = snapshot_id
@@ -1275,12 +1314,13 @@ def write_manifest_list(
1275
1314
snapshot_id : int ,
1276
1315
parent_snapshot_id : Optional [int ],
1277
1316
sequence_number : Optional [int ],
1317
+ avro_compression : AvroCompressionCodec ,
1278
1318
) -> ManifestListWriter :
1279
1319
if format_version == 1 :
1280
- return ManifestListWriterV1 (output_file , snapshot_id , parent_snapshot_id )
1320
+ return ManifestListWriterV1 (output_file , snapshot_id , parent_snapshot_id , avro_compression )
1281
1321
elif format_version == 2 :
1282
1322
if sequence_number is None :
1283
1323
raise ValueError (f"Sequence-number is required for V2 tables: { sequence_number } " )
1284
- return ManifestListWriterV2 (output_file , snapshot_id , parent_snapshot_id , sequence_number )
1324
+ return ManifestListWriterV2 (output_file , snapshot_id , parent_snapshot_id , sequence_number , avro_compression )
1285
1325
else :
1286
1326
raise ValueError (f"Cannot write manifest list for table version: { format_version } " )
0 commit comments