|
30 | 30 | from pyiceberg.table import UpsertResult
|
31 | 31 | from pyiceberg.table.snapshots import Operation
|
32 | 32 | from pyiceberg.table.upsert_util import create_match_filter
|
33 |
| -from pyiceberg.types import IntegerType, NestedField, StringType |
| 33 | +from pyiceberg.types import IntegerType, NestedField, StringType, StructType |
34 | 34 | from tests.catalog.test_base import InMemoryCatalog, Table
|
35 | 35 |
|
36 | 36 |
|
@@ -511,6 +511,163 @@ def test_upsert_without_identifier_fields(catalog: Catalog) -> None:
|
511 | 511 | tbl.upsert(df)
|
512 | 512 |
|
513 | 513 |
|
def test_upsert_with_struct_field_as_non_join_key(catalog: Catalog) -> None:
    """Upsert rows that carry a struct column while joining only on ``id``.

    The table is seeded with a single row (``id=1``). The first upsert supplies
    one unseen key (``id=2``) and one existing key (``id=1``) whose struct
    payload differs in ``sub2``, so one insert and one update are expected.
    Upserting the very same payload a second time is expected to report no
    updates and no inserts.
    """
    # Keep the table name aligned with the test name, like the sibling tests do.
    identifier = "default.test_upsert_with_struct_field_as_non_join_key"
    _drop_table(catalog, identifier)

    schema = Schema(
        NestedField(1, "id", IntegerType(), required=True),
        NestedField(
            2,
            "nested_type",
            StructType(
                NestedField(3, "sub1", StringType(), required=True),
                NestedField(4, "sub2", StringType(), required=True),
            ),
            required=False,
        ),
        identifier_field_ids=[1],
    )

    tbl = catalog.create_table(identifier, schema=schema)

    arrow_schema = pa.schema(
        [
            pa.field("id", pa.int32(), nullable=False),
            pa.field(
                "nested_type",
                pa.struct(
                    [
                        pa.field("sub1", pa.large_string(), nullable=False),
                        pa.field("sub2", pa.large_string(), nullable=False),
                    ]
                ),
                nullable=True,
            ),
        ]
    )

    initial_data = pa.Table.from_pylist(
        [
            {
                "id": 1,
                "nested_type": {"sub1": "bla1", "sub2": "bla"},
            }
        ],
        schema=arrow_schema,
    )
    tbl.append(initial_data)

    # id=2 is a new key; id=1 already exists but with sub2 == "bla".
    update_data = pa.Table.from_pylist(
        [
            {
                "id": 2,
                "nested_type": {"sub1": "bla1", "sub2": "bla"},
            },
            {
                "id": 1,
                "nested_type": {"sub1": "bla1", "sub2": "bla2"},
            },
        ],
        schema=arrow_schema,
    )

    res = tbl.upsert(update_data, join_cols=["id"])

    expected_updated = 1
    expected_inserted = 1

    assert_upsert_result(res, expected_updated, expected_inserted)

    # Re-apply the exact same payload: nothing should be left to change.
    res = tbl.upsert(update_data, join_cols=["id"])

    expected_updated = 0
    expected_inserted = 0

    assert_upsert_result(res, expected_updated, expected_inserted)
| 602 | + |
| 603 | + |
def test_upsert_with_struct_field_as_join_key(catalog: Catalog) -> None:
    """Check that upserting with a struct column as the join key raises.

    A table with an ``id`` column and an optional ``nested_type`` struct is
    seeded with one row, then ``upsert`` is called with
    ``join_cols=["nested_type"]``. The call must raise
    ``pa.lib.ArrowNotImplementedError`` with a message naming the struct key type.
    """
    identifier = "default.test_upsert_with_struct_field_as_join_key"
    _drop_table(catalog, identifier)

    # Iceberg-side definition of the table.
    nested_struct = StructType(
        NestedField(3, "sub1", StringType(), required=True),
        NestedField(4, "sub2", StringType(), required=True),
    )
    schema = Schema(
        NestedField(1, "id", IntegerType(), required=True),
        NestedField(2, "nested_type", nested_struct, required=False),
        identifier_field_ids=[1],
    )
    tbl = catalog.create_table(identifier, schema=schema)

    # Arrow-side definition used to build the input tables.
    arrow_struct = pa.struct(
        [
            pa.field("sub1", pa.large_string(), nullable=False),
            pa.field("sub2", pa.large_string(), nullable=False),
        ]
    )
    arrow_schema = pa.schema(
        [
            pa.field("id", pa.int32(), nullable=False),
            pa.field("nested_type", arrow_struct, nullable=True),
        ]
    )

    seed_rows = [
        {"id": 1, "nested_type": {"sub1": "bla1", "sub2": "bla"}},
    ]
    tbl.append(pa.Table.from_pylist(seed_rows, schema=arrow_schema))

    upsert_rows = [
        {"id": 2, "nested_type": {"sub1": "bla1", "sub2": "bla"}},
        {"id": 1, "nested_type": {"sub1": "bla1", "sub2": "bla"}},
    ]
    update_data = pa.Table.from_pylist(upsert_rows, schema=arrow_schema)

    expected_message = "Keys of type struct<sub1: large_string not null, sub2: large_string not null>"
    with pytest.raises(pa.lib.ArrowNotImplementedError, match=expected_message):
        tbl.upsert(update_data, join_cols=["nested_type"])
| 669 | + |
| 670 | + |
514 | 671 | def test_upsert_with_nulls(catalog: Catalog) -> None:
|
515 | 672 | identifier = "default.test_upsert_with_nulls"
|
516 | 673 | _drop_table(catalog, identifier)
|
|
0 commit comments