Commit b95e792
authored
Use a join for upsert deduplication (apache#1685)
This changes the deduplication logic to use join to duplicate the rows.
While the original design wasn't wrong, it is more efficient to push
things down into PyArrow to have better multi-threading and no GIL.
I did a small benchmark:
```python
import time
import pyarrow as pa
from pyiceberg.catalog import Catalog
from pyiceberg.exceptions import NoSuchTableError
from pyiceberg.schema import Schema
from pyiceberg.types import NestedField, StringType, IntegerType
def _drop_table(catalog: Catalog, identifier: str) -> None:
try:
catalog.drop_table(identifier)
except NoSuchTableError:
pass
def test_vo(session_catalog: Catalog):
catalog = session_catalog
identifier = "default.test_upsert_benchmark"
_drop_table(catalog, identifier)
schema = Schema(
NestedField(1, "idx", IntegerType(), required=True),
NestedField(2, "number", IntegerType(), required=True),
# Mark City as the identifier field, also known as the primary-key
identifier_field_ids=[1],
)
tbl = catalog.create_table(identifier, schema=schema)
arrow_schema = pa.schema(
[
pa.field("idx", pa.int32(), nullable=False),
pa.field("number", pa.int32(), nullable=False),
]
)
# Write some data
df = pa.Table.from_pylist(
[
{"idx": idx, "number": idx}
for idx in range(1, 100000)
],
schema=arrow_schema,
)
tbl.append(df)
df_upsert = pa.Table.from_pylist(
# Overlap
[
{"idx": idx, "number": idx}
for idx in range(80000, 90000)
]+
# Update
[
{"idx": idx, "number": idx + 1}
for idx in range(90000, 100000)
]
# Insert
+ [
{"idx": idx, "number": idx}
for idx in range(100000, 110000)],
schema=arrow_schema,
)
start = time.time()
tbl.upsert(df_upsert)
stop = time.time()
print(f"Took {stop-start} seconds")
```
And the result was:
```
Took 2.0412521362304688 seconds on the fd-join branch
Took 3.5236432552337646 seconds on lastest main
```1 parent 68a08b1 commit b95e792
File tree
3 files changed
+67
-39
lines changed- pyiceberg/table
- tests/table
3 files changed
+67
-39
lines changed| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
1170 | 1170 | | |
1171 | 1171 | | |
1172 | 1172 | | |
| 1173 | + | |
| 1174 | + | |
| 1175 | + | |
| 1176 | + | |
| 1177 | + | |
| 1178 | + | |
| 1179 | + | |
1173 | 1180 | | |
1174 | 1181 | | |
1175 | 1182 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
59 | 59 | | |
60 | 60 | | |
61 | 61 | | |
62 | | - | |
63 | | - | |
| 62 | + | |
| 63 | + | |
64 | 64 | | |
65 | 65 | | |
66 | 66 | | |
| 67 | + | |
67 | 68 | | |
68 | | - | |
| 69 | + | |
| 70 | + | |
69 | 71 | | |
70 | 72 | | |
71 | 73 | | |
72 | 74 | | |
73 | 75 | | |
74 | | - | |
75 | | - | |
76 | | - | |
77 | | - | |
78 | | - | |
79 | | - | |
80 | | - | |
81 | | - | |
82 | | - | |
83 | | - | |
84 | | - | |
85 | | - | |
86 | | - | |
87 | | - | |
88 | | - | |
89 | | - | |
90 | | - | |
91 | | - | |
92 | | - | |
93 | | - | |
94 | | - | |
95 | | - | |
96 | | - | |
97 | | - | |
98 | | - | |
99 | | - | |
100 | | - | |
101 | | - | |
102 | | - | |
103 | | - | |
104 | | - | |
105 | | - | |
106 | | - | |
107 | | - | |
108 | | - | |
109 | | - | |
| 76 | + | |
| 77 | + | |
| 78 | + | |
| 79 | + | |
| 80 | + | |
| 81 | + | |
| 82 | + | |
| 83 | + | |
| 84 | + | |
| 85 | + | |
| 86 | + | |
| 87 | + | |
| 88 | + | |
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
427 | 427 | | |
428 | 428 | | |
429 | 429 | | |
| 430 | + | |
| 431 | + | |
| 432 | + | |
| 433 | + | |
| 434 | + | |
| 435 | + | |
| 436 | + | |
| 437 | + | |
| 438 | + | |
| 439 | + | |
| 440 | + | |
| 441 | + | |
| 442 | + | |
| 443 | + | |
| 444 | + | |
| 445 | + | |
| 446 | + | |
| 447 | + | |
| 448 | + | |
| 449 | + | |
| 450 | + | |
| 451 | + | |
| 452 | + | |
| 453 | + | |
| 454 | + | |
| 455 | + | |
| 456 | + | |
| 457 | + | |
| 458 | + | |
| 459 | + | |
| 460 | + | |
| 461 | + | |
| 462 | + | |
| 463 | + | |
| 464 | + | |
| 465 | + | |
| 466 | + | |
| 467 | + | |
| 468 | + | |
| 469 | + | |
| 470 | + | |
| 471 | + | |
430 | 472 | | |
431 | 473 | | |
432 | 474 | | |
| |||
0 commit comments