Skip to content

Commit 58006e0

Browse files
committed
Add support for exporting metadata as parquet
1 parent 8efaf8b commit 58006e0

File tree

1 file changed

+41
-0
lines changed

1 file changed

+41
-0
lines changed
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
from itertools import batched
2+
from pathlib import Path
3+
from typing import Annotated
4+
5+
from annotated_types import Ge
6+
import djclick as click
7+
from isic_metadata.metadata import MetadataRow
8+
import pyarrow as pa
9+
import pyarrow.parquet as pq
10+
from pydantic_to_pyarrow import get_pyarrow_schema
11+
12+
from isic.core.models import Image
13+
14+
# Number of metadata rows grouped into one batch; each batch is converted to a
# pyarrow table and handed to the parquet writer in a single ``write_table`` call.
ROW_GROUP_SIZE = 10_000
15+
16+
17+
class ParquetMetadataRow(MetadataRow):
    """Variant of ``MetadataRow`` used to derive the parquet schema and rows.

    The only change from the parent model is that ``age_approx`` is redeclared
    as an optional, non-negative integer defaulting to ``None``.

    NOTE(review): presumably the parent's ``age_approx`` type does not map
    cleanly to a pyarrow column type -- confirm against ``MetadataRow``.
    """

    # Optional integer constrained to be >= 0.
    age_approx: Annotated[int, Ge(0)] | None = None
19+
20+
21+
@click.command(help="Export the metadata for a set of images to a parquet file")
@click.argument("parquet_path", type=str)
# A lone ``--public`` flag that defaults to True cannot be used to select
# non-public images in any obvious way (click derives the flag's value from the
# default), so expose an explicit on/off pair instead. ``--public`` remains a
# valid option and the default is unchanged.
@click.option(
    "--public/--private",
    default=True,
    help="Export public images (the default); pass --private for non-public images.",
)
def export_metadata_parquet(parquet_path: str, *, public: bool = True) -> None:
    """Export the metadata for a set of images to a parquet file.

    Args:
        parquet_path: Path of the parquet file to write.
        public: Value matched against ``Image.public`` when selecting the
            images whose metadata is exported. Defaults to ``True``.
    """
    output_path = Path(parquet_path)
    schema = get_pyarrow_schema(ParquetMetadataRow, exclude_fields=True)

    # These fields are removed from the generated schema, so they are not
    # written as columns.
    # NOTE(review): the reason for excluding exactly these fields is not
    # visible here -- confirm with the metadata model's owners.
    for field in ["age", "marker_pen", "blurry", "hairy", "color_tint"]:
        schema = schema.remove(schema.get_field_index(field))

    # Generator over a queryset iterator: rows are validated lazily, so only
    # one batch of ROW_GROUP_SIZE rows is materialised at a time.
    rows = (
        ParquetMetadataRow(**image.metadata)
        for image in Image.objects.filter(public=public).select_related("accession").iterator()
    )

    with pq.ParquetWriter(output_path, schema) as writer:
        for batch in batched(rows, ROW_GROUP_SIZE):
            row_dicts = [row.model_dump(mode="python") for row in batch]
            # Build the table against the trimmed schema so every batch has
            # the same columns as the writer.
            table = pa.Table.from_pylist(row_dicts, schema=schema)
            writer.write_table(table)

0 commit comments

Comments
 (0)