Skip to content

Commit 35f6f33

Browse files
authored
Fix the Avro tests (#552)
When writing V1, the sequence-numbers should be None. For V2 they will be written and read into the original value.
1 parent f1c2ef2 commit 35f6f33

File tree

2 files changed

+36
-7
lines changed

2 files changed

+36
-7
lines changed

pyiceberg/utils/schema_conversion.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -527,7 +527,9 @@ def field(self, field: NestedField, field_result: AvroType) -> AvroType:
527527
"type": field_result if field.required else ["null", field_result],
528528
}
529529

530-
if field.optional:
530+
if field.write_default is not None:
531+
result["default"] = field.write_default # type: ignore
532+
elif field.optional:
531533
result["default"] = None
532534

533535
if field.doc is not None:

tests/avro/test_file.py

Lines changed: 33 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
# specific language governing permissions and limitations
1616
# under the License.
1717
import inspect
18+
from copy import copy
1819
from datetime import date, datetime, time
1920
from enum import Enum
2021
from tempfile import TemporaryDirectory
@@ -111,7 +112,7 @@ def todict(obj: Any) -> Any:
111112
return obj.value
112113
elif hasattr(obj, "__iter__") and not isinstance(obj, str) and not isinstance(obj, bytes):
113114
return [todict(v) for v in obj]
114-
elif hasattr(obj, "__dict__"):
115+
elif isinstance(obj, Record):
115116
return {key: todict(value) for key, value in inspect.getmembers(obj) if not callable(value) and not key.startswith("_")}
116117
else:
117118
return obj
@@ -258,8 +259,6 @@ def test_write_manifest_entry_with_fastavro_read_with_iceberg(format_version: in
258259
sort_order_id=4,
259260
spec_id=3,
260261
)
261-
if format_version == 1:
262-
data_file.block_size_in_bytes = DEFAULT_BLOCK_SIZE
263262

264263
entry = ManifestEntry(
265264
status=ManifestEntryStatus.ADDED,
@@ -277,16 +276,44 @@ def test_write_manifest_entry_with_fastavro_read_with_iceberg(format_version: in
277276
with open(tmp_avro_file, "wb") as out:
278277
writer(out, schema, [todict(entry)])
279278

279+
# Read as V2
280280
with avro.AvroFile[ManifestEntry](
281-
PyArrowFileIO().new_input(tmp_avro_file),
282-
MANIFEST_ENTRY_SCHEMAS[format_version],
283-
{-1: ManifestEntry, 2: DataFile},
281+
input_file=PyArrowFileIO().new_input(tmp_avro_file),
282+
read_schema=MANIFEST_ENTRY_SCHEMAS[2],
283+
read_types={-1: ManifestEntry, 2: DataFile},
284284
) as avro_reader:
285285
it = iter(avro_reader)
286286
avro_entry = next(it)
287287

288288
assert entry == avro_entry
289289

290+
# Read as the original version
291+
with avro.AvroFile[ManifestEntry](
292+
input_file=PyArrowFileIO().new_input(tmp_avro_file),
293+
read_schema=MANIFEST_ENTRY_SCHEMAS[format_version],
294+
read_types={-1: ManifestEntry, 2: DataFile},
295+
) as avro_reader:
296+
it = iter(avro_reader)
297+
avro_entry = next(it)
298+
299+
if format_version == 1:
300+
v1_datafile = copy(data_file)
301+
# Not part of V1
302+
v1_datafile.equality_ids = None
303+
304+
assert avro_entry == ManifestEntry(
305+
status=ManifestEntryStatus.ADDED,
306+
snapshot_id=8638475580105682862,
307+
# Not part of v1
308+
data_sequence_number=None,
309+
file_sequence_number=None,
310+
data_file=v1_datafile,
311+
)
312+
elif format_version == 2:
313+
assert entry == avro_entry
314+
else:
315+
raise ValueError(f"Unsupported version: {format_version}")
316+
290317

291318
@pytest.mark.parametrize("is_required", [True, False])
292319
def test_all_primitive_types(is_required: bool) -> None:

0 commit comments

Comments
 (0)