Skip to content
55 changes: 55 additions & 0 deletions dev/provision.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,3 +413,58 @@
)
# Configure the table so writes are sorted by id (Iceberg write order).
spark.sql(f"ALTER TABLE {catalog_name}.default.test_empty_scan_ordered_str WRITE ORDERED BY id")
# Seed the ordered table with two single-column rows: 'a' and 'c'.
spark.sql(f"INSERT INTO {catalog_name}.default.test_empty_scan_ordered_str VALUES 'a', 'c'")

# Create the table used by the incremental-read tests: a date, an integer and a
# string column, stored as an Iceberg format-version 2 table. CREATE OR REPLACE
# makes provisioning idempotent across runs.
# NOTE(review): the pasted diff had GitHub review-UI text embedded inside this
# SQL string, which broke the statement; the statement is restored here.
spark.sql(
    f"""
    CREATE OR REPLACE TABLE {catalog_name}.default.test_incremental_read (
        dt date,
        number integer,
        letter string
    )
    USING iceberg
    TBLPROPERTIES (
        'format-version'='2'
    );
    """
)

# Append one row (dt=2022-03-01, number=1, letter='a'); in Iceberg each INSERT
# commit produces a new table snapshot.
spark.sql(
f"""
INSERT INTO {catalog_name}.default.test_incremental_read
VALUES (CAST('2022-03-01' AS date), 1, 'a')
"""
)

# Append one row (dt=2022-03-01, number=2, letter='b') — this is the row the
# DELETE statement below removes again.
spark.sql(
f"""
INSERT INTO {catalog_name}.default.test_incremental_read
VALUES (CAST('2022-03-01' AS date), 2, 'b')
"""
)

# Append two rows for 2022-03-02 (numbers 3 and 4) in a single INSERT, so both
# land in the same commit.
spark.sql(
f"""
INSERT INTO {catalog_name}.default.test_incremental_read
VALUES (CAST('2022-03-02' AS date), 3, 'c'), (CAST('2022-03-02' AS date), 4, 'b')
"""
)

# Remove the number=2 row inserted above, giving the table a delete commit in
# its history.
spark.sql(
f"""
DELETE FROM {catalog_name}.default.test_incremental_read
WHERE number = 2
"""
)

# https://github.com/apache/iceberg/issues/1092#issuecomment-638432848 / https://github.com/apache/iceberg/issues/3747#issuecomment-1145419407
# Don't do replace for Hive catalog as REPLACE TABLE requires certain Hive server configuration
# NOTE(review): the pasted diff had GitHub review-UI text interleaved with this
# guard and statement; the code is restored here.
if catalog_name != "hive":
    # Replace to break snapshot lineage; the replacement also drops the `dt`
    # column, so the table's current schema differs from the older snapshots'.
    spark.sql(
        f"""
        REPLACE TABLE {catalog_name}.default.test_incremental_read
        USING iceberg
        TBLPROPERTIES ('format-version'='2')
        AS SELECT number, letter FROM {catalog_name}.default.test_incremental_read
        """
    )
8 changes: 8 additions & 0 deletions pyiceberg/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -717,6 +717,14 @@ def fetch_manifest_entry(self, io: FileIO, discard_deleted: bool = True) -> List
if not discard_deleted or entry.status != ManifestEntryStatus.DELETED
]

def __eq__(self, other: Any) -> bool:
    """Return the equality of two instances of the ManifestFile class.

    Two manifest files are considered equal iff they point at the same
    manifest_path; any non-ManifestFile operand compares unequal.
    (Paired with __hash__ below, which also keys on manifest_path.)
    """
    # NOTE(review): the pasted diff had GitHub review-UI text between the `def`
    # line and the body; the method is restored here.
    return self.manifest_path == other.manifest_path if isinstance(other, ManifestFile) else False

def __hash__(self) -> int:
    """Return a hash derived solely from manifest_path, consistent with __eq__."""
    path = self.manifest_path
    return hash(path)


@cached(cache=LRUCache(maxsize=128), key=lambda io, manifest_list: hashkey(manifest_list))
def _manifests(io: FileIO, manifest_list: str) -> Tuple[ManifestFile, ...]:
Expand Down
Loading