Skip to content

Commit 14ee8da

Browse files
Fokko and kevinjqliu authored
Avro: Fix tests (#2265)
While working on #2004, I noticed some small discrepancies that I think would be good to address in a separate PR.

<!-- Thanks for opening a pull request! -->
<!-- In the case this PR will resolve an issue, please replace ${GITHUB_ISSUE_ID} below with the actual GitHub issue id. -->
<!-- Closes #${GITHUB_ISSUE_ID} -->

# Rationale for this change

# Are these changes tested?

# Are there any user-facing changes?

<!-- In the case of user-facing changes, please add the changelog label. -->

---------

Co-authored-by: Kevin Liu <[email protected]>
1 parent ccaa15c commit 14ee8da

File tree

2 files changed

+51
-30
lines changed

2 files changed

+51
-30
lines changed

tests/conftest.py

Lines changed: 38 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,7 @@
4747
import boto3
4848
import pytest
4949
from moto import mock_aws
50+
from pydantic_core import to_json
5051

5152
from pyiceberg.catalog import Catalog, load_catalog
5253
from pyiceberg.catalog.noop import NoopCatalog
@@ -67,10 +68,12 @@
6768
)
6869
from pyiceberg.io.fsspec import FsspecFileIO
6970
from pyiceberg.manifest import DataFile, FileFormat
71+
from pyiceberg.partitioning import PartitionField, PartitionSpec
7072
from pyiceberg.schema import Accessor, Schema
7173
from pyiceberg.serializers import ToOutputFile
7274
from pyiceberg.table import FileScanTask, Table
7375
from pyiceberg.table.metadata import TableMetadataV1, TableMetadataV2
76+
from pyiceberg.transforms import DayTransform, IdentityTransform
7477
from pyiceberg.types import (
7578
BinaryType,
7679
BooleanType,
@@ -1255,8 +1258,8 @@ def metadata_location_gz(tmp_path_factory: pytest.TempPathFactory) -> str:
12551258
{"key": 15, "value": 0},
12561259
],
12571260
"lower_bounds": [
1258-
{"key": 2, "value": b"2020-04-01 00:00"},
1259-
{"key": 3, "value": b"2020-04-01 00:12"},
1261+
{"key": 2, "value": b"\x01\x00\x00\x00\x00\x00\x00\x00"},
1262+
{"key": 3, "value": b"\x01\x00\x00\x00\x00\x00\x00\x00"},
12601263
{"key": 7, "value": b"\x03\x00\x00\x00"},
12611264
{"key": 8, "value": b"\x01\x00\x00\x00"},
12621265
{"key": 10, "value": b"\xf6(\\\x8f\xc2\x05S\xc0"},
@@ -1270,8 +1273,8 @@ def metadata_location_gz(tmp_path_factory: pytest.TempPathFactory) -> str:
12701273
{"key": 19, "value": b"\x00\x00\x00\x00\x00\x00\x04\xc0"},
12711274
],
12721275
"upper_bounds": [
1273-
{"key": 2, "value": b"2020-04-30 23:5:"},
1274-
{"key": 3, "value": b"2020-05-01 00:41"},
1276+
{"key": 2, "value": b"\x06\x00\x00\x00\x00\x00\x00\x00"},
1277+
{"key": 3, "value": b"\x06\x00\x00\x00\x00\x00\x00\x00"},
12751278
{"key": 7, "value": b"\t\x01\x00\x00"},
12761279
{"key": 8, "value": b"\t\x01\x00\x00"},
12771280
{"key": 10, "value": b"\xcd\xcc\xcc\xcc\xcc,_@"},
@@ -1376,8 +1379,8 @@ def metadata_location_gz(tmp_path_factory: pytest.TempPathFactory) -> str:
13761379
],
13771380
"lower_bounds": [
13781381
{"key": 1, "value": b"\x01\x00\x00\x00"},
1379-
{"key": 2, "value": b"2020-04-01 00:00"},
1380-
{"key": 3, "value": b"2020-04-01 00:03"},
1382+
{"key": 2, "value": b"\x01\x00\x00\x00\x00\x00\x00\x00"},
1383+
{"key": 3, "value": b"\x01\x00\x00\x00\x00\x00\x00\x00"},
13811384
{"key": 4, "value": b"\x00\x00\x00\x00"},
13821385
{"key": 5, "value": b"\x01\x00\x00\x00"},
13831386
{"key": 6, "value": b"N"},
@@ -1396,8 +1399,8 @@ def metadata_location_gz(tmp_path_factory: pytest.TempPathFactory) -> str:
13961399
],
13971400
"upper_bounds": [
13981401
{"key": 1, "value": b"\x01\x00\x00\x00"},
1399-
{"key": 2, "value": b"2020-04-30 23:5:"},
1400-
{"key": 3, "value": b"2020-05-01 00:1:"},
1402+
{"key": 2, "value": b"\x06\x00\x00\x00\x00\x00\x00\x00"},
1403+
{"key": 3, "value": b"\x06\x00\x00\x00\x00\x00\x00\x00"},
14011404
{"key": 4, "value": b"\x06\x00\x00\x00"},
14021405
{"key": 5, "value": b"c\x00\x00\x00"},
14031406
{"key": 6, "value": b"Y"},
@@ -1858,15 +1861,40 @@ def simple_map() -> MapType:
18581861

18591862

18601863
@pytest.fixture(scope="session")
1861-
def generated_manifest_entry_file(avro_schema_manifest_entry: Dict[str, Any]) -> Generator[str, None, None]:
1864+
def test_schema() -> Schema:
1865+
return Schema(
1866+
NestedField(1, "VendorID", IntegerType(), False), NestedField(2, "tpep_pickup_datetime", TimestampType(), False)
1867+
)
1868+
1869+
1870+
@pytest.fixture(scope="session")
1871+
def test_partition_spec() -> Schema:
1872+
return PartitionSpec(
1873+
PartitionField(1, 1000, IdentityTransform(), "VendorID"),
1874+
PartitionField(2, 1001, DayTransform(), "tpep_pickup_day"),
1875+
)
1876+
1877+
1878+
@pytest.fixture(scope="session")
1879+
def generated_manifest_entry_file(
1880+
avro_schema_manifest_entry: Dict[str, Any], test_schema: Schema, test_partition_spec: PartitionSpec
1881+
) -> Generator[str, None, None]:
18621882
from fastavro import parse_schema, writer
18631883

18641884
parsed_schema = parse_schema(avro_schema_manifest_entry)
18651885

18661886
with TemporaryDirectory() as tmpdir:
18671887
tmp_avro_file = tmpdir + "/manifest.avro"
18681888
with open(tmp_avro_file, "wb") as out:
1869-
writer(out, parsed_schema, manifest_entry_records)
1889+
writer(
1890+
out,
1891+
parsed_schema,
1892+
manifest_entry_records,
1893+
metadata={
1894+
"schema": test_schema.model_dump_json(),
1895+
"partition-spec": to_json(test_partition_spec.fields).decode("utf-8"),
1896+
},
1897+
)
18701898
yield tmp_avro_file
18711899

18721900

tests/utils/test_manifest.py

Lines changed: 13 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,9 @@
3838
write_manifest,
3939
write_manifest_list,
4040
)
41-
from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionField, PartitionSpec
41+
from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec
4242
from pyiceberg.schema import Schema
4343
from pyiceberg.table.snapshots import Operation, Snapshot, Summary
44-
from pyiceberg.transforms import IdentityTransform
4544
from pyiceberg.typedef import Record, TableVersion
4645
from pyiceberg.types import IntegerType, NestedField
4746

@@ -154,8 +153,8 @@ def test_read_manifest_entry(generated_manifest_entry_file: str) -> None:
154153
}
155154
assert data_file.nan_value_counts == {16: 0, 17: 0, 18: 0, 19: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0}
156155
assert data_file.lower_bounds == {
157-
2: b"2020-04-01 00:00",
158-
3: b"2020-04-01 00:12",
156+
2: b"\x01\x00\x00\x00\x00\x00\x00\x00",
157+
3: b"\x01\x00\x00\x00\x00\x00\x00\x00",
159158
7: b"\x03\x00\x00\x00",
160159
8: b"\x01\x00\x00\x00",
161160
10: b"\xf6(\\\x8f\xc2\x05S\xc0",
@@ -169,8 +168,8 @@ def test_read_manifest_entry(generated_manifest_entry_file: str) -> None:
169168
19: b"\x00\x00\x00\x00\x00\x00\x04\xc0",
170169
}
171170
assert data_file.upper_bounds == {
172-
2: b"2020-04-30 23:5:",
173-
3: b"2020-05-01 00:41",
171+
2: b"\x06\x00\x00\x00\x00\x00\x00\x00",
172+
3: b"\x06\x00\x00\x00\x00\x00\x00\x00",
174173
7: b"\t\x01\x00\x00",
175174
8: b"\t\x01\x00\x00",
176175
10: b"\xcd\xcc\xcc\xcc\xcc,_@",
@@ -363,6 +362,8 @@ def test_write_manifest(
363362
generated_manifest_file_file_v1: str,
364363
generated_manifest_file_file_v2: str,
365364
format_version: TableVersion,
365+
test_schema: Schema,
366+
test_partition_spec: PartitionSpec,
366367
compression: AvroCompressionCodec,
367368
) -> None:
368369
io = load_file_io()
@@ -376,20 +377,12 @@ def test_write_manifest(
376377
)
377378
demo_manifest_file = snapshot.manifests(io)[0]
378379
manifest_entries = demo_manifest_file.fetch_manifest_entry(io)
379-
test_schema = Schema(
380-
NestedField(1, "VendorID", IntegerType(), False), NestedField(2, "tpep_pickup_datetime", IntegerType(), False)
381-
)
382-
test_spec = PartitionSpec(
383-
PartitionField(source_id=1, field_id=1, transform=IdentityTransform(), name="VendorID"),
384-
PartitionField(source_id=2, field_id=2, transform=IdentityTransform(), name="tpep_pickup_datetime"),
385-
spec_id=demo_manifest_file.partition_spec_id,
386-
)
387380
with TemporaryDirectory() as tmpdir:
388381
tmp_avro_file = tmpdir + "/test_write_manifest.avro"
389382
output = io.new_output(tmp_avro_file)
390383
with write_manifest(
391384
format_version=format_version,
392-
spec=test_spec,
385+
spec=test_partition_spec,
393386
schema=test_schema,
394387
output_file=output,
395388
snapshot_id=8744736658442914487,
@@ -404,7 +397,7 @@ def test_write_manifest(
404397

405398
expected_metadata = {
406399
"schema": test_schema.model_dump_json(),
407-
"partition-spec": """[{"source-id":1,"field-id":1,"transform":"identity","name":"VendorID"},{"source-id":2,"field-id":2,"transform":"identity","name":"tpep_pickup_datetime"}]""",
400+
"partition-spec": """[{"source-id":1,"field-id":1000,"transform":"identity","name":"VendorID"},{"source-id":2,"field-id":1001,"transform":"day","name":"tpep_pickup_day"}]""",
408401
"partition-spec-id": str(demo_manifest_file.partition_spec_id),
409402
"format-version": str(format_version),
410403
}
@@ -497,8 +490,8 @@ def test_write_manifest(
497490
}
498491
assert data_file.nan_value_counts == {16: 0, 17: 0, 18: 0, 19: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0}
499492
assert data_file.lower_bounds == {
500-
2: b"2020-04-01 00:00",
501-
3: b"2020-04-01 00:12",
493+
2: b"\x01\x00\x00\x00\x00\x00\x00\x00",
494+
3: b"\x01\x00\x00\x00\x00\x00\x00\x00",
502495
7: b"\x03\x00\x00\x00",
503496
8: b"\x01\x00\x00\x00",
504497
10: b"\xf6(\\\x8f\xc2\x05S\xc0",
@@ -512,8 +505,8 @@ def test_write_manifest(
512505
19: b"\x00\x00\x00\x00\x00\x00\x04\xc0",
513506
}
514507
assert data_file.upper_bounds == {
515-
2: b"2020-04-30 23:5:",
516-
3: b"2020-05-01 00:41",
508+
2: b"\x06\x00\x00\x00\x00\x00\x00\x00",
509+
3: b"\x06\x00\x00\x00\x00\x00\x00\x00",
517510
7: b"\t\x01\x00\x00",
518511
8: b"\t\x01\x00\x00",
519512
10: b"\xcd\xcc\xcc\xcc\xcc,_@",

0 commit comments

Comments
 (0)