@@ -20,7 +20,7 @@
 import tempfile
 import uuid
 import warnings
-from datetime import date
+from datetime import date, datetime, timezone
 from typing import Any, List, Optional
 from unittest.mock import MagicMock, patch
 from uuid import uuid4
@@ -61,6 +61,7 @@
 from pyiceberg.io import S3_RETRY_STRATEGY_IMPL, InputStream, OutputStream, load_file_io
 from pyiceberg.io.pyarrow import (
     ICEBERG_SCHEMA,
+    PYARROW_PARQUET_FIELD_ID_KEY,
     ArrowScan,
     PyArrowFile,
     PyArrowFileIO,
@@ -70,6 +71,7 @@
     _determine_partitions,
     _primitive_to_physical,
     _read_deletes,
+    _task_to_record_batches,
     _to_requested_schema,
     bin_pack_arrow_table,
     compute_statistics_plan,
@@ -85,7 +87,7 @@
 from pyiceberg.table.metadata import TableMetadataV2
 from pyiceberg.table.name_mapping import create_mapping_from_schema
 from pyiceberg.transforms import HourTransform, IdentityTransform
-from pyiceberg.typedef import UTF8, Properties, Record
+from pyiceberg.typedef import UTF8, Properties, Record, TableVersion
 from pyiceberg.types import (
     BinaryType,
     BooleanType,
@@ -102,6 +104,7 @@
     PrimitiveType,
     StringType,
     StructType,
+    TimestampNanoType,
     TimestampType,
     TimestamptzType,
     TimeType,
@@ -873,6 +876,18 @@ def _write_table_to_file(filepath: str, schema: pa.Schema, table: pa.Table) -> str:
     return filepath


+def _write_table_to_data_file(filepath: str, schema: pa.Schema, table: pa.Table) -> DataFile:
+    filepath = _write_table_to_file(filepath, schema, table)
+    return DataFile.from_args(
+        content=DataFileContent.DATA,
+        file_path=filepath,
+        file_format=FileFormat.PARQUET,
+        partition={},
+        record_count=len(table),
+        file_size_in_bytes=22,  # This is not relevant for now
+    )
+
+
 @pytest.fixture
 def file_int(schema_int: Schema, tmpdir: str) -> str:
     pyarrow_schema = schema_to_pyarrow(schema_int, metadata={ICEBERG_SCHEMA: bytes(schema_int.model_dump_json(), UTF8)})
@@ -2411,8 +2426,6 @@ def test_partition_for_nested_field() -> None:

     spec = PartitionSpec(PartitionField(source_id=3, field_id=1000, transform=HourTransform(), name="ts"))

-    from datetime import datetime
-
     t1 = datetime(2025, 7, 11, 9, 30, 0)
     t2 = datetime(2025, 7, 11, 10, 30, 0)

@@ -2551,8 +2564,6 @@ def test_initial_value() -> None:


 def test__to_requested_schema_timestamp_to_timestamptz_projection() -> None:
-    from datetime import datetime, timezone
-
     # file is written with timestamp without timezone
     file_schema = Schema(NestedField(1, "ts_field", TimestampType(), required=False))
     batch = pa.record_batch(
@@ -2722,3 +2733,55 @@ def test_retry_strategy_not_found() -> None:
     io = PyArrowFileIO(properties={S3_RETRY_STRATEGY_IMPL: "pyiceberg.DoesNotExist"})
     with pytest.warns(UserWarning, match="Could not initialize S3 retry strategy: pyiceberg.DoesNotExist"):
         io.new_input("s3://bucket/path/to/file")
+
+
+@pytest.mark.parametrize("format_version", [1, 2, 3])
+def test_task_to_record_batches_nanos(format_version: TableVersion, tmpdir: str) -> None:
+    arrow_table = pa.table(
+        [
+            pa.array(
+                [
+                    datetime(2025, 8, 14, 12, 0, 0),
+                    datetime(2025, 8, 14, 13, 0, 0),
+                ],
+                type=pa.timestamp("ns"),
+            )
+        ],
+        pa.schema((pa.field("ts_field", pa.timestamp("ns"), nullable=True, metadata={PYARROW_PARQUET_FIELD_ID_KEY: "1"}),)),
+    )
+
+    data_file = _write_table_to_data_file(f"{tmpdir}/test_task_to_record_batches_nanos.parquet", arrow_table.schema, arrow_table)
+
+    if format_version <= 2:
+        table_schema = Schema(NestedField(1, "ts_field", TimestampType(), required=False))
+    else:
+        table_schema = Schema(NestedField(1, "ts_field", TimestampNanoType(), required=False))
+
+    actual_result = list(
+        _task_to_record_batches(
+            PyArrowFileIO(),
+            FileScanTask(data_file),
+            bound_row_filter=AlwaysTrue(),
+            projected_schema=table_schema,
+            projected_field_ids={1},
+            positional_deletes=None,
+            case_sensitive=True,
+            format_version=format_version,
+        )
+    )[0]
+
+    def _expected_batch(unit: str) -> pa.RecordBatch:
+        return pa.record_batch(
+            [
+                pa.array(
+                    [
+                        datetime(2025, 8, 14, 12, 0, 0),
+                        datetime(2025, 8, 14, 13, 0, 0),
+                    ],
+                    type=pa.timestamp(unit),
+                )
+            ],
+            names=["ts_field"],
+        )
+
+    assert _expected_batch("ns" if format_version > 2 else "us").equals(actual_result)
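
For context on the final assertion: Iceberg's TimestampType stores microsecond precision (the only option for format versions 1 and 2), while TimestampNanoType, added in format version 3, keeps nanoseconds, so the expected Arrow unit flips between "us" and "ns" across the parametrized runs. A minimal sketch of that precision difference in plain PyArrow, for illustration only and not taken from the change itself:

from datetime import datetime

import pyarrow as pa

# Nanosecond-precision column, like the one written to Parquet in the new test.
ns_column = pa.array([datetime(2025, 8, 14, 12, 0, 0)], type=pa.timestamp("ns"))

# Format versions 1 and 2 are expected to surface microseconds, equivalent to
# this downcast; format version 3 keeps the nanosecond unit as-is.
us_column = ns_column.cast(pa.timestamp("us"))
assert us_column.type == pa.timestamp("us")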